[PATCH RFC 1/2] docs: kdoc: add a class to parse data items
From: Mauro Carvalho Chehab
Date: Fri Mar 20 2026 - 05:46:57 EST
Instead of using very complex regular expressions for handling
inner structs/unions, use CTokenizer to handle data types.
It should be noticed that this doesn't handle "typedef".
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@xxxxxxxxxx>
---
tools/lib/python/kdoc/data_parser.py | 211 +++++++++++++++++++++++++++
1 file changed, 211 insertions(+)
create mode 100644 tools/lib/python/kdoc/data_parser.py
diff --git a/tools/lib/python/kdoc/data_parser.py b/tools/lib/python/kdoc/data_parser.py
new file mode 100644
index 000000000000..f04915b67d6b
--- /dev/null
+++ b/tools/lib/python/kdoc/data_parser.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@xxxxxxxxxx>.
+
+"""
+C lexical parser for variables.
+"""
+
+import logging
+import re
+
+from .c_lex import CTokenizer, CToken
+
class CDataItem:
    """
    Container describing one parsed C data declaration.
    """
    def __init__(self):
        # Declaration tag name and kind (e.g. "struct"/"union"/"enum").
        self.decl_name = None
        self.decl_type = None
        # Flat list of member names, plus a name -> type-string map.
        self.parameterlist = []
        self.parametertypes = {}

    def __repr__(self) -> str:
        """
        Debug-friendly dump of every field of the CDataItem.
        """
        fields = ", ".join(
            f"{attr}={getattr(self, attr)!r}"
            for attr in ("decl_type", "decl_name",
                         "parameterlist", "parametertypes")
        )
        return f"CDataItem({fields})"
+
class CDataParser:
    """
    Handles a C data prototype, converting it into a data element
    describing it.

    The result is stored in ``self.item`` (a :class:`CDataItem`).
    It should be noticed that this doesn't handle "typedef".
    """

    # Token kinds that carry no meaning when searching for identifiers
    IGNORE_TOKENS = [CToken.SPACE, CToken.COMMENT]

    def __init__(self, source):
        """
        Tokenize and parse *source* (a string containing a C data
        declaration), filling ``self.item``.
        """
        self.source = source
        self.item = CDataItem()

        self._parse()

    def _push_struct(self, tokens, stack, prev_kind):
        """
        Handles structs and unions, picking the identifier just after
        ``struct`` or ``union``.

        ``prev_kind`` is the token index of the struct/union keyword,
        or None when the block has no preceding keyword.  Anonymous
        aggregates are pushed as a "{unnamed <kind>}" placeholder.
        """
        if prev_kind is None:
            #
            # Empty block. We still need to append for stack levels to match
            #
            stack.append(None)
            return

        # The first non-space/comment token after the keyword is the tag
        j = prev_kind + 1
        while j < len(tokens) and tokens[j].kind in self.IGNORE_TOKENS:
            j += 1

        if j < len(tokens) and tokens[j].kind == CToken.NAME:
            stack.append(tokens[j].value)
            return

        # Anonymous struct/union: remember it with a synthetic name
        name = "{unnamed " + tokens[prev_kind].value + "}"
        stack.append(name)
        self.item.parameterlist.append(name)

    def _parse(self):
        """
        Core algorithm: walk the token stream once, tracking the nesting
        stack of struct/union names and collecting the tokens of each
        ";"-terminated statement to extract member names and types.
        """
        tokens = CTokenizer(self.source).tokens

        stack = []              # enclosing struct/union names (None if unnamed)
        current_type = []       # (paren level, token) pairs of the statement

        prev_kind = None        # index of last struct/union/enum keyword
        get_id = False          # waiting for a block's closing identifier
        level = 0               # "("/"[" nesting depth (braces don't count)

        for i, tok in enumerate(tokens):
            if tok.kind == CToken.COMMENT:
                continue

            if tok.kind in [CToken.STRUCT, CToken.UNION, CToken.ENUM]:
                prev_kind = i

            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    # Compare against None: the keyword is commonly the
                    # very first token, i.e. prev_kind == 0 is valid
                    if (prev_kind is not None and
                        tokens[prev_kind].kind in [CToken.STRUCT, CToken.UNION]):

                        self._push_struct(tokens, stack, prev_kind)
                        if not self.item.decl_name:
                            self.item.decl_name = stack[0]
                    else:
                        stack.append(None)

                    #
                    # Add previous tokens
                    #
                    if prev_kind is not None:
                        get_id = True

                        if not self.item.decl_type:
                            self.item.decl_type = tokens[prev_kind].value

                    current_type = []

                    continue

                # Non-brace opener: "(" or "["
                level += 1

            if tok.kind == CToken.END:
                if tok.value == "}":
                    if stack:
                        stack.pop()

                    if get_id and prev_kind is not None:
                        #
                        # Collapse the block: keep only "<kind> <tag> {"
                        # (the original tested loop-invariant tok.kind
                        # here, so the break never fired)
                        #
                        current_type = []
                        for j in range(prev_kind, i + 1):
                            current_type.append((level, tokens[j]))
                            if tokens[j].kind == CToken.BEGIN:
                                break

                        # Skip blanks/comments when looking for the
                        # identifier that may follow
                        while j < len(tokens):
                            if tokens[j].kind not in self.IGNORE_TOKENS:
                                break
                            j += 1

                        name = None

                        # Bounds check: the while above may run off the end
                        if j < len(tokens) and tokens[j].kind == CToken.NAME:
                            name = tokens[j].value

                        # NOTE(review): this tests decl_type but assigns
                        # decl_name, as in the original — verify intent
                        if not self.item.decl_type and len(stack) == 1:
                            self.item.decl_name = stack[0]

                        self.item.parameterlist.append(name)
                        current_type.append((level, tok))

                        get_id = False
                        prev_kind = None
                    continue

                level -= 1

            if tok.kind != CToken.ENDSTMT:
                current_type.append((level, tok))
                continue

            #
            # End of a statement. Parse it if tokens are present
            #
            if not current_type:
                continue

            #
            # the last NAME token with level 0 is the field name
            #
            name_token = None
            for pos, t in enumerate(reversed(current_type)):
                cur_level, cur_tok = t
                if not cur_level and cur_tok.kind == CToken.NAME:
                    name_token = cur_tok.value
                    break

            if not name_token:
                #
                # TODO: we should likely emit a Warning here
                #
                current_type = []
                continue

            #
            # As we used reversed, we need to adjust pos here
            #
            pos = len(current_type) - pos - 1

            #
            # For the type, pick everything but the name (the original
            # computed pos but never used it, leaking the name into
            # the type string)
            #
            out = "".join(t.value
                          for idx, (_, t) in enumerate(current_type)
                          if idx != pos)

            names = []
            for n in stack[1:] + [name_token]:
                if n and "{unnamed" not in n:
                    names.append(n)

            full_name = ".".join(names)

            self.item.parameterlist.append(full_name)
            self.item.parametertypes[full_name] = out.strip()

            current_type = []
--
2.53.0