aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'Tools/c-analyzer/c_parser/parser')
-rw-r--r--Tools/c-analyzer/c_parser/parser/__init__.py212
-rw-r--r--Tools/c-analyzer/c_parser/parser/_alt.py6
-rw-r--r--Tools/c-analyzer/c_parser/parser/_common.py115
-rw-r--r--Tools/c-analyzer/c_parser/parser/_compound_decl_body.py158
-rw-r--r--Tools/c-analyzer/c_parser/parser/_delim.py54
-rw-r--r--Tools/c-analyzer/c_parser/parser/_func_body.py278
-rw-r--r--Tools/c-analyzer/c_parser/parser/_global.py179
-rw-r--r--Tools/c-analyzer/c_parser/parser/_info.py168
-rw-r--r--Tools/c-analyzer/c_parser/parser/_regexes.py796
9 files changed, 1966 insertions, 0 deletions
diff --git a/Tools/c-analyzer/c_parser/parser/__init__.py b/Tools/c-analyzer/c_parser/parser/__init__.py
new file mode 100644
index 00000000000..7cb34caf09e
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/__init__.py
@@ -0,0 +1,212 @@
+"""A simple non-validating parser for C99.
+
+The functions and regex patterns here are not entirely suitable for
+validating C syntax. Please rely on a proper compiler for that.
+Instead our goal here is merely matching and extracting information from
+valid C code.
+
+Furthermore, the grammar rules for the C syntax (particularly as
+described in the K&R book) actually describe a superset, of which the
+full C language is a proper subset. Here are some of the extra
+conditions that must be applied when parsing C code:
+
+* ...
+
+(see: http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf)
+
+We have taken advantage of the elements of the C grammar that are used
+only in a few limited contexts, mostly as delimiters. They allow us to
+focus the regex patterns confidently. Here are the relevant tokens and
+in which grammar rules they are used:
+
+separators:
+* ";"
+ + (decl) struct/union: at end of each member decl
+ + (decl) declaration: at end of each (non-compound) decl
+ + (stmt) expr stmt: at end of each stmt
+ + (stmt) for: between exprs in "header"
+ + (stmt) goto: at end
+ + (stmt) continue: at end
+ + (stmt) break: at end
+ + (stmt) return: at end
+* ","
+ + (decl) struct/union: between member declators
+ + (decl) param-list: between params
+ + (decl) enum: between enumerators
+ + (decl) initializer (compound): between initializers
+ + (expr) postfix: between func call args
+ + (expr) expression: between "assignment" exprs
+* ":"
+ + (decl) struct/union: in member declators
+ + (stmt) label: between label and stmt
+ + (stmt) case: between expression and stmt
+ + (stmt) default: between "default" and stmt
+* "="
+ + (decl) declaration: between decl and initializer
+ + (decl) enumerator: between identifier and "initializer"
+ + (expr) assignment: between "var" and expr
+
+wrappers:
+* "(...)"
+ + (decl) declarator (func ptr): to wrap ptr/name
+ + (decl) declarator (func ptr): around params
+ + (decl) declarator: around sub-declarator (for readability)
+ + (expr) postfix (func call): around args
+ + (expr) primary: around sub-expr
+ + (stmt) if: around condition
+ + (stmt) switch: around source expr
+ + (stmt) while: around condition
+ + (stmt) do-while: around condition
+ + (stmt) for: around "header"
+* "{...}"
+ + (decl) enum: around enumerators
+ + (decl) func: around body
+ + (stmt) compound: around stmts
+* "[...]"
+ * (decl) declarator: for arrays
+ * (expr) postfix: array access
+
+other:
+* "*"
+ + (decl) declarator: for pointer types
+ + (expr) unary: for pointer deref
+
+
+To simplify the regular expressions used here, we've taken some
+shortcuts and made certain assumptions about the code we are parsing.
+Some of these allow us to skip context-sensitive matching (e.g. braces)
+or otherwise still match arbitrary C code unambiguously. However, in
+some cases there are certain corner cases where the patterns are
+ambiguous relative to arbitrary C code. However, they are still
+unambiguous in the specific code we are parsing.
+
+Here are the cases where we've taken shortcuts or made assumptions:
+
+* there is no overlap syntactically between the local context (func
+ bodies) and the global context (other than variable decls), so we
+ do not need to worry about ambiguity due to the overlap:
+ + the global context has no expressions or statements
+ + the local context has no function definitions or type decls
+* no "inline" type declarations (struct, union, enum) in function
+ parameters ~(including function pointers)~
+* no "inline" type decls in function return types
+* no superfluous parentheses in declarators
+* var decls in for loops are always "simple" (e.g. no inline types)
+* only inline struct/union/enum decls may be anonymous (without a name)
+* no function pointers in function pointer parameters
+* for loop "headers" do not have curly braces (e.g. compound init)
+* syntactically, variable decls do not overlap with stmts/exprs, except
+ in the following case:
+ spam (*eggs) (...)
+ This could be either a function pointer variable named "eggs"
+ or a call to a function named "spam", which returns a function
+ pointer that gets called. The only differentiator is the
+ syntax used in the "..." part. It will be comma-separated
+ parameters for the former and comma-separated expressions for
+ the latter. Thus, if we expect such decls or calls then we must
+ parse the decl params.
+"""
+
+"""
+TODO:
+* extract CPython-specific code
+* drop include injection (or only add when needed)
+* track position instead of slicing "text"
+* Parser class instead of the _iter_source() mess
+* alt impl using a state machine (& tokenizer or split on delimiters)
+"""
+
+from ..info import ParsedItem
+from ._info import SourceInfo
+
+
def parse(srclines):
    """Parse the given preprocessed source, yielding ParsedItem objects.

    "srclines" must be an iterable of (fileinfo, line) pairs; passing a
    filename directly is not supported yet.
    """
    if isinstance(srclines, str):  # a filename
        raise NotImplementedError

    anon_name = anonymous_names()
    yield from (ParsedItem.from_raw(raw)
                for raw in _parse(srclines, anon_name))
+
+
+# XXX Later: Add a separate function to deal with preprocessor directives
+# parsed out of raw source.
+
+
def anonymous_names():
    """Return a factory that generates unique names for anonymous decls.

    Each call to the returned function produces "<prefix><N>" with N
    increasing from 1 across all prefixes.
    """
    state = {'count': 0}
    def anon_name(prefix='anon-'):
        state['count'] += 1
        return f"{prefix}{state['count']}"
    return anon_name
+
+
+#############################
+# internal impl
+
+import logging
+
+
+_logger = logging.getLogger(__name__)
+
+
def _parse(srclines, anon_name):
    """Run the global-scope parser over the source, yielding raw results."""
    from ._global import parse_globals

    source = _iter_source(srclines)
    #source = _iter_source(srclines, showtext=True)
    # XXX Handle blocks here instead of in parse_globals().
    yield from parse_globals(source, anon_name)
+
+
def _iter_source(lines, *, maxtext=20_000, maxlines=700, showtext=False):
    """Yield a SourceInfo for each "ready" chunk of source text.

    "lines" should be an iterable of (fileinfo, line) pairs, as produced
    by the preprocessor code.  A per-file SourceInfo accumulates lines
    until some parser consumes them (via SourceInfo._used()).  The
    "maxtext"/"maxlines" limits guard against runaway accumulation; if
    they are hit, or the input ends with unconsumed text, an exception
    is raised.
    """
    filestack = []
    allinfo = {}
    for fileinfo, line in lines:
        if fileinfo.filename in filestack:
            # We returned to an outer file (an #include finished),
            # so drop the state for the files it pulled in.
            while fileinfo.filename != filestack[-1]:
                filename = filestack.pop()
                del allinfo[filename]
            filename = fileinfo.filename
            srcinfo = allinfo[filename]
        else:
            filename = fileinfo.filename
            srcinfo = SourceInfo(filename)
            filestack.append(filename)
            allinfo[filename] = srcinfo

        _logger.debug(f'-> {line}')
        srcinfo._add_line(line, fileinfo.lno)
        if len(srcinfo.text) > maxtext:
            break
        if srcinfo.end - srcinfo.start > maxlines:
            break
        while srcinfo._used():
            yield srcinfo
            if showtext:
                _logger.debug(f'=> {srcinfo.text}')
    else:
        # The input ended normally; flush whatever is left.
        if not filestack:
            srcinfo = SourceInfo('???')
        else:
            filename = filestack[-1]
            srcinfo = allinfo[filename]
        while srcinfo._used():
            yield srcinfo
            if showtext:
                _logger.debug(f'=> {srcinfo.text}')
        yield srcinfo
        if showtext:
            _logger.debug(f'=> {srcinfo.text}')
        if not srcinfo._ready:
            return
    # At this point either the file ended prematurely
    # or there's "too much" text.
    filename, lno, text = srcinfo.filename, srcinfo._start, srcinfo.text
    if len(text) > 500:
        text = text[:500] + '...'
    # Bug fix: the message previously hardcoded "(unknown)" even though
    # the filename was computed right above.
    raise Exception(f'unmatched text ({filename} starting at line {lno}):\n{text}')
diff --git a/Tools/c-analyzer/c_parser/parser/_alt.py b/Tools/c-analyzer/c_parser/parser/_alt.py
new file mode 100644
index 00000000000..05a9101b4f5
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_alt.py
@@ -0,0 +1,6 @@
+
def _parse(srclines, anon_name):
    """Alternate impl: join all lines and run the delimiter-based parser."""
    from ._delim import parse

    text = ' '.join(line for _, line in srclines)
    yield from parse(text, anon_name)
diff --git a/Tools/c-analyzer/c_parser/parser/_common.py b/Tools/c-analyzer/c_parser/parser/_common.py
new file mode 100644
index 00000000000..40c36039f3f
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_common.py
@@ -0,0 +1,115 @@
+import re
+
+from ._regexes import (
+ _ind,
+ STRING_LITERAL,
+ VAR_DECL as _VAR_DECL,
+)
+
+
def log_match(group, m):
    """Log (at debug level) which pattern group matched and its text."""
    from . import _logger
    matched = m.group(0)
    _logger.debug(f'matched <{group}> ({matched})')
+
+
+#############################
+# regex utils
+
def set_capture_group(pattern, group, *, strict=True):
    """Return the pattern with the marked non-capturing group capturing.

    The target group is identified by a "(?: # <NAME>" marker comment
    in the pattern.  If "strict" is true and the marker is missing,
    ValueError is raised.
    """
    old = f'(?: # <{group}>'
    # Bug fix: the check previously re-spelled the marker literal
    # instead of reusing "old".
    if strict and old not in pattern:
        raise ValueError(f'{old!r} not found in pattern')
    return pattern.replace(old, f'( # <{group}>', 1)
+
+
def set_capture_groups(pattern, groups, *, strict=True):
    """Apply set_capture_group() for each of the named groups."""
    result = pattern
    for name in groups:
        result = set_capture_group(result, name, strict=strict)
    return result
+
+
+#############################
+# syntax-related utils
+
# Matches text up to and including the next parenthesis, skipping over
# string/char literals (which may themselves contain parens).  Exactly
# one of the two capture groups matches: group 1 for "(" or group 2
# for ")".
_PAREN_RE = re.compile(rf'''
    (?:
        (?:
            [^'"()]*
            {_ind(STRING_LITERAL, 3)}
        )*
        [^'"()]*
        (?:
            ( [(] )
            |
            ( [)] )
        )
    )
    ''', re.VERBOSE)
+
+
def match_paren(text, depth=0):
    """Return the index just past the paren that balances out "depth".

    "depth" is the number of parens already open before "text" begins.
    Raises ValueError if the parens never balance out.
    """
    pos = 0
    while (m := _PAREN_RE.match(text, pos)):
        pos = m.end()
        _open, _close = m.groups()
        if _open:
            depth += 1
        else:  # _close
            depth -= 1
        if depth == 0:
            return pos
    else:
        raise ValueError(f'could not find matching parens for {text!r}')
+
+
# The var-decl pattern with the capture groups parse_var_decl() needs.
VAR_DECL = set_capture_groups(_VAR_DECL, (
    'STORAGE',
    'TYPE_QUAL',
    'TYPE_SPEC',
    'DECLARATOR',
    'IDENTIFIER',
    'WRAPPED_IDENTIFIER',
    'FUNC_IDENTIFIER',
))
+
+
def parse_var_decl(decl):
    """Parse a single (simple) variable declaration string.

    Return (kind, name, vartype), where "kind" is one of "simple",
    "wrapped", or "funcptr", and "vartype" is a dict with the keys
    "storage", "typequal", "typespec", and "abstract".
    """
    m = re.match(VAR_DECL, decl, re.VERBOSE)
    (storage, typequal, typespec, declarator,
     name,
     wrappedname,
     funcptrname,
     ) = m.groups()
    if name:
        kind = 'simple'
    elif wrappedname:
        kind, name = 'wrapped', wrappedname
    elif funcptrname:
        kind, name = 'funcptr', funcptrname
    else:
        raise NotImplementedError
    # The abstract declarator is the declarator minus the name.
    abstract = declarator.replace(name, '')
    vartype = {
        'storage': storage,
        'typequal': typequal,
        'typespec': typespec,
        'abstract': abstract,
    }
    return (kind, name, vartype)
+
+
+#############################
+# parser state utils
+
# XXX Drop this or use it!
def iter_results(results):
    """Yield the (result, text) pairs that have a true result.

    "results" may be an iterable of (result, text) pairs, or a callable
    returning such an iterable.
    """
    if not results:
        return
    if callable(results):
        results = results()

    # Bug fix: this previously iterated over "results()", which called
    # the callable a second time and raised TypeError for plain
    # iterables.
    for result, text in results:
        if result:
            yield result, text
diff --git a/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py b/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py
new file mode 100644
index 00000000000..eb5bc67607b
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py
@@ -0,0 +1,158 @@
+import re
+
+from ._regexes import (
+ STRUCT_MEMBER_DECL as _STRUCT_MEMBER_DECL,
+ ENUM_MEMBER_DECL as _ENUM_MEMBER_DECL,
+)
+from ._common import (
+ log_match,
+ parse_var_decl,
+ set_capture_groups,
+)
+
+
+#############################
+# struct / union
+
# The struct/union member pattern with its capture groups enabled.
STRUCT_MEMBER_DECL = set_capture_groups(_STRUCT_MEMBER_DECL, (
    'COMPOUND_TYPE_KIND',
    'COMPOUND_TYPE_NAME',
    'SPECIFIER_QUALIFIER',
    'DECLARATOR',
    'SIZE',
    'ENDING',
    'CLOSE',
))
# Match a single member decl at the start of the text.
STRUCT_MEMBER_RE = re.compile(rf'^ \s* {STRUCT_MEMBER_DECL}', re.VERBOSE)
+
+
def parse_struct_body(source, anon_name, parent):
    """Parse a struct/union body, yielding each resolved member.

    A member parser may hand back a callable ("parse_body") for inline
    compound types; it is run against the source before continuing.
    """
    done = False
    while not done:
        done = True
        # Bug fix: guard against "source" being exhausted immediately,
        # which previously left "srcinfo" unbound (NameError below).
        srcinfo = None
        for srcinfo in source:
            m = STRUCT_MEMBER_RE.match(srcinfo.text)
            if m:
                break
        else:
            # We ran out of lines.
            if srcinfo is not None:
                srcinfo.done()
            return
        for item in _parse_struct_next(m, srcinfo, anon_name, parent):
            if callable(item):
                parse_body = item
                yield from parse_body(source)
            else:
                yield item
            done = False
+
+
def _parse_struct_next(m, srcinfo, anon_name, parent):
    """Handle one struct/union member match, yielding resolved items.

    For inline compound members, a "parse_body" callable is yielded;
    the caller must run it against the source.
    """
    (inline_kind, inline_name,
     qualspec, declarator,
     size,
     ending,
     close,
     ) = m.groups()
    remainder = srcinfo.text[m.end():]

    if close:
        log_match('compound close', m)
        srcinfo.advance(remainder)

    elif inline_kind:
        log_match('compound inline', m)
        kind = inline_kind
        name = inline_name or anon_name('inline-')
        # Immediately emit a forward declaration.
        yield srcinfo.resolve(kind, name=name, data=None)

        # un-inline the decl.  Note that it might not actually be inline.
        # We handle the case in the "maybe_inline_actual" branch.
        srcinfo.nest(
            remainder,
            f'{kind} {name}',
        )
        def parse_body(source):
            _parse_body = DECL_BODY_PARSERS[kind]

            data = []  # members
            ident = f'{kind} {name}'
            for item in _parse_body(source, anon_name, ident):
                if item.kind == 'field':
                    data.append(item)
                else:
                    yield item
            # XXX Should "parent" really be None for inline type decls?
            yield srcinfo.resolve(kind, data, name, parent=None)

        srcinfo.resume()
        yield parse_body

    else:
        # not inline (member)
        log_match('compound member', m)
        if qualspec:
            _, name, data = parse_var_decl(f'{qualspec} {declarator}')
            if not name:
                name = anon_name('struct-field-')
            if size:
                # Record the bit-field width alongside the var type.
                data['size'] = int(size)
        else:
            # This shouldn't happen (we expect each field to have a name).
            # (Dead fallback code referencing an undefined "sized_name"
            # was removed here.)
            raise NotImplementedError

        yield srcinfo.resolve('field', data, name, parent)  # XXX Restart?
        if ending == ',':
            remainder = rf'{qualspec} {remainder}'
        srcinfo.advance(remainder)
+
+
+#############################
+# enum
+
# The enum member pattern with its capture groups enabled.
ENUM_MEMBER_DECL = set_capture_groups(_ENUM_MEMBER_DECL, (
    'CLOSE',
    'NAME',
    'INIT',
    'ENDING',
))
ENUM_MEMBER_RE = re.compile(rf'{ENUM_MEMBER_DECL}', re.VERBOSE)
+
+
def parse_enum_body(source, _anon_name, _parent):
    """Parse an enum body, yielding a "field" item per enumerator."""
    ending = None
    while ending != '}':
        # Bug fix: guard against "source" being exhausted immediately,
        # which previously left "srcinfo" unbound (NameError below).
        srcinfo = None
        for srcinfo in source:
            m = ENUM_MEMBER_RE.match(srcinfo.text)
            if m:
                break
        else:
            # We ran out of lines.
            if srcinfo is not None:
                srcinfo.done()
            return
        remainder = srcinfo.text[m.end():]

        (close,
         name, init, ending,
         ) = m.groups()
        if close:
            ending = '}'
        else:
            data = init
            yield srcinfo.resolve('field', data, name, _parent)
        srcinfo.advance(remainder)
+
+
+#############################
+
# Map each compound type keyword to its body parser.
DECL_BODY_PARSERS = {
    'struct': parse_struct_body,
    'union': parse_struct_body,
    'enum': parse_enum_body,
}
diff --git a/Tools/c-analyzer/c_parser/parser/_delim.py b/Tools/c-analyzer/c_parser/parser/_delim.py
new file mode 100644
index 00000000000..51433a629d3
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_delim.py
@@ -0,0 +1,54 @@
+import re
+import textwrap
+
+from ._regexes import _ind, STRING_LITERAL
+
+
def parse(text, anon_name):
    """Parse the source text by splitting on delimiters.

    NOTE(review): this module is an unfinished alternate parser (the
    handler table below is mostly placeholders).
    """
    context = None
    data = None
    # Bug fixes: re.Pattern has "finditer()" (not "find_iter()"), and
    # the handler table is named "_HANDLERS" (the bare "HANDLERS" was
    # undefined).
    for m in DELIMITER_RE.finditer(text):
        before, opened, closed = m.groups()
        delim = opened or closed

        handle_segment = _HANDLERS[context][delim]
        result, context, data = handle_segment(before, delim, data)
        if result:
            yield result
+
+
# Matches a run of text up to the next delimiter, skipping string/char
# literals.  Marker groups: <before> (preceding text), <open> (an
# opening bracket), <close> (a closing bracket or ";").
# NOTE(review): reconstructed — the original text had an unbalanced
# "}*" where ")*" belongs, and literal braces inside this rf-string
# must be doubled to avoid a SyntaxError.
DELIMITER = textwrap.dedent(rf'''
    (
        (?:
            [^'"()\[\]{{}};]*
            {_ind(STRING_LITERAL, 3)}
        )*
        [^'"()\[\]{{}};]+
    )? # <before>
    (?:
        (
            [(\[{{]
        ) # <open>
        |
        (
            [)\]}};]
        ) # <close>
    )?
    ''')
DELIMITER_RE = re.compile(DELIMITER, re.VERBOSE)
+
# Per-context delimiter handlers for parse(); "None" is the global
# context.  NOTE(review): mostly placeholders ("..." / None / an empty
# sub-table) — this alternate parser is unfinished.
_HANDLERS = {
    None: {  # global
        # opened
        '{': ...,
        '[': None,
        '(': None,
        # closed
        '}': None,
        ']': None,
        ')': None,
        ';': ...,
    },
    '': {
    },
}
diff --git a/Tools/c-analyzer/c_parser/parser/_func_body.py b/Tools/c-analyzer/c_parser/parser/_func_body.py
new file mode 100644
index 00000000000..42fd459e111
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_func_body.py
@@ -0,0 +1,278 @@
+import re
+
+from ._regexes import (
+ LOCAL as _LOCAL,
+ LOCAL_STATICS as _LOCAL_STATICS,
+)
+from ._common import (
+ log_match,
+ parse_var_decl,
+ set_capture_groups,
+ match_paren,
+)
+from ._compound_decl_body import DECL_BODY_PARSERS
+
+
# The function-body ("local") pattern with its capture groups enabled.
LOCAL = set_capture_groups(_LOCAL, (
    'EMPTY',
    'INLINE_LEADING',
    'INLINE_PRE',
    'INLINE_KIND',
    'INLINE_NAME',
    'STORAGE',
    'VAR_DECL',
    'VAR_INIT',
    'VAR_ENDING',
    'COMPOUND_BARE',
    'COMPOUND_LABELED',
    'COMPOUND_PAREN',
    'BLOCK_LEADING',
    'BLOCK_OPEN',
    'SIMPLE_STMT',
    'SIMPLE_ENDING',
    'BLOCK_CLOSE',
))
LOCAL_RE = re.compile(rf'^ \s* {LOCAL}', re.VERBOSE)
+
+
# Note that parse_function_body() still has trouble with a few files
# in the CPython codebase.

def parse_function_body(source, name, anon_name):
    # NOTE(review): this stub is immediately shadowed by the second
    # parse_function_body() definition below.
    # XXX
    raise NotImplementedError
+
+
def parse_function_body(name, text, resolve, source, anon_name, parent):
    """(Disabled) parse a function body, yielding (item, text) pairs.

    NOTE(review): this implementation raises NotImplementedError
    immediately; everything after the raise is unreachable and is kept
    only for reference.  It also uses an older text/resolve-based
    protocol (and an undefined continue_text()) rather than the
    SourceInfo protocol used elsewhere in this package.
    """
    raise NotImplementedError
    # For now we do not worry about locals declared in for loop "headers".
    depth = 1;
    while depth > 0:
        m = LOCAL_RE.match(text)
        while not m:
            text, resolve = continue_text(source, text or '{', resolve)
            m = LOCAL_RE.match(text)
        text = text[m.end():]
        (
            empty,
            inline_leading, inline_pre, inline_kind, inline_name,
            storage, decl,
            var_init, var_ending,
            compound_bare, compound_labeled, compound_paren,
            block_leading, block_open,
            simple_stmt, simple_ending,
            block_close,
        ) = m.groups()

        if empty:
            log_match('', m)
            resolve(None, None, None, text)
            yield None, text
        elif inline_kind:
            log_match('', m)
            kind = inline_kind
            name = inline_name or anon_name('inline-')
            data = []  # members
            # We must set the internal "text" from _iter_source() to the
            # start of the inline compound body,
            # Note that this is effectively like a forward reference that
            # we do not emit.
            resolve(kind, None, name, text, None)
            _parse_body = DECL_BODY_PARSERS[kind]
            before = []
            ident = f'{kind} {name}'
            for member, inline, text in _parse_body(text, resolve, source, anon_name, ident):
                if member:
                    data.append(member)
                if inline:
                    yield from inline
            # un-inline the decl.  Note that it might not actually be inline.
            # We handle the case in the "maybe_inline_actual" branch.
            text = f'{inline_leading or ""} {inline_pre or ""} {kind} {name} {text}'
            # XXX Should "parent" really be None for inline type decls?
            yield resolve(kind, data, name, text, None), text
        elif block_close:
            log_match('', m)
            depth -= 1
            resolve(None, None, None, text)
            # XXX This isn't great.  Calling resolve() should have
            # cleared the closing bracket.  However, some code relies
            # on the yielded value instead of the resolved one.  That
            # needs to be fixed.
            yield None, text
        elif compound_bare:
            log_match('', m)
            yield resolve('statement', compound_bare, None, text, parent), text
        elif compound_labeled:
            log_match('', m)
            yield resolve('statement', compound_labeled, None, text, parent), text
        elif compound_paren:
            log_match('', m)
            try:
                pos = match_paren(text)
            except ValueError:
                text = f'{compound_paren} {text}'
                #resolve(None, None, None, text)
                text, resolve = continue_text(source, text, resolve)
                yield None, text
            else:
                head = text[:pos]
                text = text[pos:]
                if compound_paren == 'for':
                    # XXX Parse "head" as a compound statement.
                    stmt1, stmt2, stmt3 = head.split(';', 2)
                    data = {
                        'compound': compound_paren,
                        'statements': (stmt1, stmt2, stmt3),
                    }
                else:
                    data = {
                        'compound': compound_paren,
                        'statement': head,
                    }
                yield resolve('statement', data, None, text, parent), text
        elif block_open:
            log_match('', m)
            depth += 1
            if block_leading:
                # An inline block: the last evaluated expression is used
                # in place of the block.
                # XXX Combine it with the remainder after the block close.
                stmt = f'{block_open}{{<expr>}}...;'
                yield resolve('statement', stmt, None, text, parent), text
            else:
                resolve(None, None, None, text)
                yield None, text
        elif simple_ending:
            log_match('', m)
            yield resolve('statement', simple_stmt, None, text, parent), text
        elif var_ending:
            log_match('', m)
            kind = 'variable'
            _, name, vartype = parse_var_decl(decl)
            data = {
                'storage': storage,
                'vartype': vartype,
            }
            after = ()
            if var_ending == ',':
                # It was a multi-declaration, so queue up the next one.
                _, qual, typespec, _ = vartype.values()
                text = f'{storage or ""} {qual or ""} {typespec} {text}'
            yield resolve(kind, data, name, text, parent), text
            if var_init:
                _data = f'{name} = {var_init.strip()}'
                yield resolve('statement', _data, None, text, parent), text
        else:
            # This should be unreachable.
            raise NotImplementedError
+
+
+#############################
+# static local variables
+
# The pattern for finding static locals (skipping other statements),
# with its capture groups enabled.
LOCAL_STATICS = set_capture_groups(_LOCAL_STATICS, (
    'INLINE_LEADING',
    'INLINE_PRE',
    'INLINE_KIND',
    'INLINE_NAME',
    'STATIC_DECL',
    'STATIC_INIT',
    'STATIC_ENDING',
    'DELIM_LEADING',
    'BLOCK_OPEN',
    'BLOCK_CLOSE',
    'STMT_END',
))
LOCAL_STATICS_RE = re.compile(rf'^ \s* {LOCAL_STATICS}', re.VERBOSE)
+
+
def parse_function_statics(source, func, anon_name):
    """Parse a function body, yielding only its static local variables.

    "func" becomes the parent of each variable.  Other statements are
    skipped; only the brace depth is tracked so we know when the body
    ends.
    """
    # For now we do not worry about locals declared in for loop "headers".
    depth = 1
    while depth > 0:
        # Bug fix: guard against "source" being exhausted immediately,
        # which previously left "srcinfo" unbound (NameError below).
        srcinfo = None
        for srcinfo in source:
            m = LOCAL_STATICS_RE.match(srcinfo.text)
            if m:
                break
        else:
            # We ran out of lines.
            if srcinfo is not None:
                srcinfo.done()
            return
        for item, depth in _parse_next_local_static(m, srcinfo,
                                                    anon_name, func, depth):
            if callable(item):
                parse_body = item
                yield from parse_body(source)
            elif item is not None:
                yield item
+
+
def _parse_next_local_static(m, srcinfo, anon_name, func, depth):
    """Handle one match inside a function body, yielding (item, depth).

    Only static local decls (and inline compound types) produce items;
    other delimiters merely adjust the brace depth.  A callable may be
    yielded in place of an item; the caller must run it against the
    source.
    """
    (inline_leading, inline_pre, inline_kind, inline_name,
     static_decl, static_init, static_ending,
     _delim_leading,
     block_open,
     block_close,
     stmt_end,
     ) = m.groups()
    remainder = srcinfo.text[m.end():]

    if inline_kind:
        log_match('func inline', m)
        kind = inline_kind
        name = inline_name or anon_name('inline-')
        # Immediately emit a forward declaration.
        yield srcinfo.resolve(kind, name=name, data=None), depth

        # un-inline the decl.  Note that it might not actually be inline.
        # We handle the case in the "maybe_inline_actual" branch.
        srcinfo.nest(
            remainder,
            f'{inline_leading or ""} {inline_pre or ""} {kind} {name}'
        )
        def parse_body(source):
            _parse_body = DECL_BODY_PARSERS[kind]

            data = []  # members
            ident = f'{kind} {name}'
            for item in _parse_body(source, anon_name, ident):
                if item.kind == 'field':
                    data.append(item)
                else:
                    yield item
            # XXX Should "parent" really be None for inline type decls?
            yield srcinfo.resolve(kind, data, name, parent=None)

        srcinfo.resume()
        yield parse_body, depth

    elif static_decl:
        log_match('local variable', m)
        _, name, data = parse_var_decl(static_decl)

        yield srcinfo.resolve('variable', data, name, parent=func), depth

        if static_init:
            # Re-queue the initializer as a plain statement.
            srcinfo.advance(f'{name} {static_init} {remainder}')
        elif static_ending == ',':
            # It was a multi-declaration, so queue up the next one.
            _, qual, typespec, _ = data.values()
            srcinfo.advance(f'static {qual or ""} {typespec} {remainder}')
        else:
            srcinfo.advance('')

    else:
        log_match('func other', m)
        if block_open:
            depth += 1
        elif block_close:
            depth -= 1
        elif stmt_end:
            pass
        else:
            # This should be unreachable.
            raise NotImplementedError
        srcinfo.advance(remainder)
        yield None, depth
diff --git a/Tools/c-analyzer/c_parser/parser/_global.py b/Tools/c-analyzer/c_parser/parser/_global.py
new file mode 100644
index 00000000000..35947c12998
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_global.py
@@ -0,0 +1,179 @@
+import re
+
+from ._regexes import (
+ GLOBAL as _GLOBAL,
+)
+from ._common import (
+ log_match,
+ parse_var_decl,
+ set_capture_groups,
+)
+from ._compound_decl_body import DECL_BODY_PARSERS
+#from ._func_body import parse_function_body
+from ._func_body import parse_function_statics as parse_function_body
+
+
# The top-level (global scope) pattern with its capture groups enabled.
GLOBAL = set_capture_groups(_GLOBAL, (
    'EMPTY',
    'COMPOUND_LEADING',
    'COMPOUND_KIND',
    'COMPOUND_NAME',
    'FORWARD_KIND',
    'FORWARD_NAME',
    'MAYBE_INLINE_ACTUAL',
    'TYPEDEF_DECL',
    'TYPEDEF_FUNC_PARAMS',
    'VAR_STORAGE',
    'FUNC_INLINE',
    'VAR_DECL',
    'FUNC_PARAMS',
    'FUNC_DELIM',
    'FUNC_LEGACY_PARAMS',
    'VAR_INIT',
    'VAR_ENDING',
))
GLOBAL_RE = re.compile(rf'^ \s* {GLOBAL}', re.VERBOSE)
+
+
def parse_globals(source, anon_name):
    """Parse the global scope, yielding each top-level parsed item.

    Nested bodies (compound types, function bodies) are handled via the
    callables yielded by _parse_next().
    """
    # Bug fix: guard against "source" being exhausted immediately,
    # which previously left "srcinfo" unbound (NameError below).
    srcinfo = None
    for srcinfo in source:
        m = GLOBAL_RE.match(srcinfo.text)
        if not m:
            # We need more text.
            continue
        for item in _parse_next(m, srcinfo, anon_name):
            if callable(item):
                parse_body = item
                yield from parse_body(source)
            else:
                yield item
    else:
        # We ran out of lines.
        if srcinfo is not None:
            srcinfo.done()
        return
+
+
def _parse_next(m, srcinfo, anon_name):
    """Handle one top-level match, yielding the resolved item(s).

    A callable may be yielded in place of an item; the caller must run
    it against the source (that is how nested bodies get parsed).
    """
    (
     empty,
     # compound type decl (maybe inline)
     compound_leading, compound_kind, compound_name,
     forward_kind, forward_name, maybe_inline_actual,
     # typedef
     typedef_decl, typedef_func_params,
     # vars and funcs
     storage, func_inline, decl,
     func_params, func_delim, func_legacy_params,
     var_init, var_ending,
     ) = m.groups()
    remainder = srcinfo.text[m.end():]

    if empty:
        log_match('global empty', m)
        srcinfo.advance(remainder)

    elif maybe_inline_actual:
        log_match('maybe_inline_actual', m)
        # Ignore forward declarations.
        # XXX Maybe return them too (with an "isforward" flag)?
        if not maybe_inline_actual.strip().endswith(';'):
            remainder = maybe_inline_actual + remainder
        yield srcinfo.resolve(forward_kind, None, forward_name)
        if maybe_inline_actual.strip().endswith('='):
            # We use a dummy prefix for a fake typedef.
            # XXX Ideally this case would not be caught by MAYBE_INLINE_ACTUAL.
            _, name, data = parse_var_decl(f'{forward_kind} {forward_name} fake_typedef_{forward_name}')
            yield srcinfo.resolve('typedef', data, name, parent=None)
            remainder = f'{name} {remainder}'
        srcinfo.advance(remainder)

    elif compound_kind:
        # NOTE(review): unlike the other branches, this one has no
        # log_match() call -- possibly an oversight.
        kind = compound_kind
        name = compound_name or anon_name('inline-')
        # Immediately emit a forward declaration.
        yield srcinfo.resolve(kind, name=name, data=None)

        # un-inline the decl.  Note that it might not actually be inline.
        # We handle the case in the "maybe_inline_actual" branch.
        srcinfo.nest(
            remainder,
            f'{compound_leading or ""} {compound_kind} {name}',
        )
        def parse_body(source):
            _parse_body = DECL_BODY_PARSERS[compound_kind]

            data = []  # members
            ident = f'{kind} {name}'
            for item in _parse_body(source, anon_name, ident):
                if item.kind == 'field':
                    data.append(item)
                else:
                    yield item
            # XXX Should "parent" really be None for inline type decls?
            yield srcinfo.resolve(kind, data, name, parent=None)

        srcinfo.resume()
        yield parse_body

    elif typedef_decl:
        log_match('typedef', m)
        kind = 'typedef'
        _, name, data = parse_var_decl(typedef_decl)
        if typedef_func_params:
            return_type = data
            # This matches the data for func declarations.
            data = {
                'storage': None,
                'inline': None,
                'params': f'({typedef_func_params})',
                'returntype': return_type,
                'isforward': True,
            }
        yield srcinfo.resolve(kind, data, name, parent=None)
        srcinfo.advance(remainder)

    elif func_delim or func_legacy_params:
        log_match('function', m)
        kind = 'function'
        _, name, return_type = parse_var_decl(decl)
        func_params = func_params or func_legacy_params
        data = {
            'storage': storage,
            'inline': func_inline,
            'params': f'({func_params})',
            'returntype': return_type,
            'isforward': func_delim == ';',
        }

        yield srcinfo.resolve(kind, data, name, parent=None)
        srcinfo.advance(remainder)

        if func_delim == '{' or func_legacy_params:
            def parse_body(source):
                yield from parse_function_body(source, name, anon_name)
            yield parse_body

    elif var_ending:
        log_match('global variable', m)
        kind = 'variable'
        _, name, vartype = parse_var_decl(decl)
        data = {
            'storage': storage,
            'vartype': vartype,
        }
        yield srcinfo.resolve(kind, data, name, parent=None)

        if var_ending == ',':
            # It was a multi-declaration, so queue up the next one.
            _, qual, typespec, _ = vartype.values()
            remainder = f'{storage or ""} {qual or ""} {typespec} {remainder}'
        srcinfo.advance(remainder)

        if var_init:
            # Re-queue the initializer as a plain statement.
            _data = f'{name} = {var_init.strip()}'
            yield srcinfo.resolve('statement', _data, name=None)

    else:
        # This should be unreachable.
        raise NotImplementedError
diff --git a/Tools/c-analyzer/c_parser/parser/_info.py b/Tools/c-analyzer/c_parser/parser/_info.py
new file mode 100644
index 00000000000..2dcd5e5e760
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_info.py
@@ -0,0 +1,168 @@
+from ..info import KIND, ParsedItem, FileInfo
+
+
class TextInfo:
    """A mutable accumulator for a contiguous span of source text."""

    def __init__(self, text, start=None, end=None):
        # immutable:
        if not start:
            start = 1
        self.start = start

        # mutable:
        lines = text.splitlines() or ['']
        self.text = text.strip()
        if not end:
            end = start + len(lines) - 1
        self.end = end
        self.line = lines[-1]

    def __repr__(self):
        args = (f'{a}={getattr(self, a)!r}'
                for a in ['text', 'start', 'end'])
        return f'{type(self).__name__}({", ".join(args)})'

    def add_line(self, line, lno=None):
        """Append a line of source text, updating "end" and "line"."""
        if lno is None:
            lno = self.end + 1
        else:
            if isinstance(lno, FileInfo):
                fileinfo = lno
                # NOTE(review): TextInfo defines no "filename" attribute,
                # so this comparison would raise AttributeError if this
                # branch were ever hit -- confirm against callers.
                if fileinfo.filename != self.filename:
                    raise NotImplementedError((fileinfo, self.filename))
                lno = fileinfo.lno
        # XXX
        #if lno < self.end:
        #    raise NotImplementedError((lno, self.end))
        line = line.lstrip()
        self.text += ' ' + line
        self.line = line
        self.end = lno
+
+
class SourceInfo:
    """Tracks the source text currently being parsed for one file.

    The "current" text accumulates lines (via _add_line()) until a
    parser consumes it; nest()/resume() provide a stack for handling
    inline compound bodies.  The "_ready" flag means there is text
    waiting to be matched.
    """

    _ready = False

    def __init__(self, filename, _current=None):
        # immutable:
        self.filename = filename
        # mutable:
        if isinstance(_current, str):
            _current = TextInfo(_current)
        self._current = _current
        # (A dead local assignment "start = -1" was removed here.)
        self._start = _current.start if _current else -1
        self._nested = []
        self._set_ready()

    def __repr__(self):
        args = (f'{a}={getattr(self, a)!r}'
                for a in ['filename', '_current'])
        return f'{type(self).__name__}({", ".join(args)})'

    @property
    def start(self):
        # The first line number of the current text.
        if self._current is None:
            return self._start
        return self._current.start

    @property
    def end(self):
        # The last line number of the current text.
        if self._current is None:
            return self._start
        return self._current.end

    @property
    def text(self):
        # The pending (unconsumed) source text.
        if self._current is None:
            return ''
        return self._current.text

    def nest(self, text, before, start=None):
        """Push the current text and switch to parsing "text"."""
        if self._current is None:
            raise Exception('nesting requires active source text')
        current = self._current
        current.text = before
        self._nested.append(current)
        self._replace(text, start)

    def resume(self, remainder=None):
        """Pop the most recently nested text and continue with it."""
        if not self._nested:
            raise Exception('no nested text to resume')
        if self._current is None:
            raise Exception('un-nesting requires active source text')
        if remainder is None:
            remainder = self._current.text
        self._clear()
        self._current = self._nested.pop()
        self._current.text += ' ' + remainder
        self._set_ready()

    def advance(self, remainder, start=None):
        """Replace the current text with the unconsumed remainder."""
        if self._current is None:
            raise Exception('advancing requires active source text')
        if remainder.strip():
            self._replace(remainder, start, fixnested=True)
        else:
            if self._nested:
                self._replace('', start, fixnested=True)
                #raise Exception('cannot advance while nesting')
            else:
                self._clear(start)

    def resolve(self, kind, data, name, parent=None):
        """Build a ParsedItem for the decl at the current location."""
        # "field" isn't a top-level kind, so we leave it as-is.
        if kind and kind != 'field':
            kind = KIND._from_raw(kind)
        fileinfo = FileInfo(self.filename, self._start)
        return ParsedItem(fileinfo, kind, parent, name, data)

    def done(self):
        self._set_ready()

    def _set_ready(self):
        if self._current is None:
            self._ready = False
        else:
            self._ready = self._current.text.strip() != ''

    def _used(self):
        # Report (and consume) the "ready" state.
        ready = self._ready
        self._ready = False
        return ready

    def _clear(self, start=None):
        old = self._current
        if self._current is not None:
            # XXX Fail if self._current wasn't used up?
            if start is None:
                start = self._current.end
            self._current = None
        if start is not None:
            self._start = start
        self._set_ready()
        return old

    def _replace(self, text, start=None, *, fixnested=False):
        end = self._current.end
        old = self._clear(start)
        self._current = TextInfo(text, self._start, end)
        if fixnested and self._nested and self._nested[-1] is old:
            self._nested[-1] = self._current
        self._set_ready()

    def _add_line(self, line, lno=None):
        if not line.strip():
            # We don't worry about multi-line string literals.
            return
        if self._current is None:
            self._start = lno
            self._current = TextInfo(line, lno)
        else:
            # XXX
            #if lno < self._current.end:
            #    # A circular include?
            #    raise NotImplementedError((lno, self))
            self._current.add_line(line, lno)
        self._ready = True
diff --git a/Tools/c-analyzer/c_parser/parser/_regexes.py b/Tools/c-analyzer/c_parser/parser/_regexes.py
new file mode 100644
index 00000000000..e9bc31d335a
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_regexes.py
@@ -0,0 +1,796 @@
+# Regular expression patterns for C syntax.
+#
+# None of these patterns has any capturing. However, a number of them
+# have capturing markers compatible with utils.set_capture_groups().
+
+import textwrap
+
+
+def _ind(text, level=1, edges='both'):
+ indent = ' ' * level
+ text = textwrap.indent(text, indent)
+ if edges == 'pre' or edges == 'both':
+ text = '\n' + indent + text.lstrip()
+ if edges == 'post' or edges == 'both':
+ text = text.rstrip() + '\n' + ' ' * (level - 1)
+ return text
+
+
+#######################################
+# general
+
# A single hexadecimal digit, as used in "\xHH" character escapes.
# Fixed: the class previously read [0-9a-zA-Z], which wrongly accepted
# the non-hex letters g-z/G-Z; C hex escapes allow only [0-9a-fA-F].
HEX = r'(?: [0-9a-fA-F] )'

# Matches one C character literal or one string literal.  (Adjacent
# string-literal concatenation is handled by the patterns that embed
# this one, e.g. INITIALIZER.)
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
            )
        )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
        )
        # end string literal
    )
    ''')
+
# All the reserved words we filter out of identifiers.  This is the
# K&R-era keyword set ("entry" was reserved in old C even though it
# was never used).
_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
        )
        \b
    )
    ''')
# The "# keyword" markers follow the capture-marker convention noted at
# the top of this module (see utils.set_capture_groups()).
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# Collapse all whitespace so _KEYWORD can be embedded inside the
# single-line lookaheads below.
_KEYWORD = ''.join(_KEYWORD.split())

IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Like STRICT_IDENTIFIER, but also admits "<name>-<NNN>" forms, which
# the parser generates for anonymous struct/union/enum types.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
+
+
+#######################################
+# types
+
# A built-in (non-compound, non-typedef) type name, including multi-word
# forms such as "unsigned long int" and "long long" (the size keyword
# group plus the final base keyword together cover doubled "long").
SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned ) # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
            )
        )
        \b
    )
    # end simple type
    ''')

# The keyword that introduces a compound type definition or reference.
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
+
+
+#######################################
+# variable declarations
+
STORAGE_CLASS = r'(?: \b (?: auto | register | static | extern ) \b )'
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A "*" with an optional trailing qualifier, as in "char * const p".
PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'

# The type portion of a declaration: a built-in type, a typeof()-style
# expression, a struct/union/enum reference, or a typedef name.
TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
        )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
        )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
    )
    # end type spec
    ''')

# The declarator portion: the (possibly parenthesized or pointer-
# qualified) name being declared, with optional array suffixes or a
# function-pointer parameter list.
DECLARATOR = textwrap.dedent(rf'''
    # declarator (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?: # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
            )
            |
            (?:
                [(] \s*
                (?: # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
                \s* [)]
            )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?: # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
            )
        )
    )
    # end declarator
    ''')

# A full declaration head: storage class, qualifier, type spec, and
# declarator.  Also used for typedefs and function return types.
VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?: # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?: # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
        )?
        (?:
            (?: # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
        )
        \s*
        (?:
            (?: # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
        )
    )
    # end var decl
    ''')

# The right-hand side of "=": string literals, a simple expression, or
# a brace-enclosed struct/array literal.
INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
        )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
            )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                [^'",;{{]*
            )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                (?:
                    [^'";]*?
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'";]*?
                }}
                (?= \s* ; ) # Note this lookahead.
            )
        )
    )
    # end initializer
    ''')
+
+
+#######################################
+# compound type declarations
+
# One member declaration inside a struct/union body: an inline compound
# type, a (possibly bit-sized) typed member, or the closing brace.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?: # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?: # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?: # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                    )
                )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?: # <SIZE>
                    \d+
                )
            )?
            \s*
            (?: # <ENDING>
                [,;]
            )
        )
        |
        (?:
            \s*
            (?: # <CLOSE>
                }}
            )
        )
    )
    ''')

# One enumerator inside an enum body (optionally with "= value"), or
# the closing brace.
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?: # <CLOSE>
                }}
            )
        )
        |
        (?:
            \s*
            (?: # <NAME>
                {IDENTIFIER}
            )
            (?:
                \s* = \s*
                (?: # <INIT>
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?: # <ENDING>
                , | }}
            )
        )
    )
    ''')
+
+
+#######################################
+# statements
+
# The text of a simple (non-compound) statement, up to but not
# including the terminating ";".
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
        )*
        [^'"{{}};]*
        #(?= [;{{] ) # Note this lookahead.
    )
    # end simple statement body
    ''')
# A full simple statement terminated by ";".
# Fixed: the member-access separator in the "variable assignment"
# alternative used a bare ".", which under re.VERBOSE is the wildcard
# (it would match ANY character, e.g. "a#b = x"); it is now the
# character class [.] so only a literal dot or "->" is accepted.
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?: # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
            )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
            )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                    )*
                    \s* [^'";]*
                )?
            )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
            )
        )
        \s*
        (?: # <SIMPLE_ENDING>
            ;
        )
    )
    # end simple statement
    ''')
# The "header" of a compound statement: a bare else/do, a label
# (case/default/goto-target), or a parenthesized-condition keyword.
# The body itself is handled by the block-tracking patterns in LOCAL.
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?: # <COMPOUND_BARE>
                    else | do
                )
                \b
            )
            |
            (?:
                (?: # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                        )*
                        \s* [^'":]*
                    )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                )
                \s* [:]
            )
            |
            (?:
                (?: # <COMPOUND_PAREN>
                    for | while | if | switch
                )
                \s* (?= [(] ) # Note this lookahead.
            )
        )
        \s*
    )
    # end compound statement
    ''')
+
+
+#######################################
+# function bodies
+
# One parse step inside a function body: an empty statement, an inline
# type decl, a local variable decl, a compound-statement header, a
# block open/close, or a simple statement.
LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?: # <EMPTY>
            ;
        )
        |
        # inline type decl
        (?:
            (?:
                (?: # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )? # </INLINE_PRE>
            (?: # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            (?: # <STORAGE>
                {STORAGE_CLASS}
            )? # </STORAGE>
            (?:
                \s*
                (?: # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                )
            )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?: # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                    )
                )?
                (?:
                    \s*
                    (?: # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?: # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
            )? # </BLOCK_LEADING>
            (?: # <BLOCK_OPEN>
                {{
            )
        )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?: # <BLOCK_CLOSE>
            }}
        )
    )
    ''')
+
# A faster variant of LOCAL used when scanning a function body only for
# static variables: inline type decls, "static" decls, and a catchall
# that just tracks braces and statement ends.
LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?: # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )?
            (?: # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?: # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
            )
            \s*
            (?:
                (?: # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                )
                |
                (?: # <STATIC_ENDING>
                    [,;]
                )
            )
        )
        |
        # everything else
        (?:
            (?: # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                \s* [^'"{{}};]*
            )
            (?:
                (?: # <BLOCK_OPEN>
                    {{
                )
                |
                (?: # <BLOCK_CLOSE>
                    }}
                )
                |
                (?: # <STMT_END>
                    ;
                )
            )
        )
    )
    ''')
+
+
+#######################################
+# global declarations
+
# One top-level declaration: empty statement, compound type decl,
# forward-decl artifact, typedef, or a function/variable declaration
# (including old K&R-style parameter declarations).
GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?: # <EMPTY>
            ;
        )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?: # <COMPOUND_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?: # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
            )
            \s*
            (?: # <FORWARD_NAME>
                {ANON_IDENTIFIER}
            )
            (?: # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
            )
        )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?: # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?: # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                )
                \s* [)]
            )?
            \s* ;
        )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?: # <VAR_STORAGE>
                    {STORAGE_CLASS}
                )
                \s*
            )?
            (?:
                (?: # <FUNC_INLINE>
                    \b inline \b
                )
                \s*
            )?
            (?: # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?: # <FUNC_PARAMS>
                            [^{{;]*
                        )
                        \s* [)] \s*
                        (?: # <FUNC_DELIM>
                            [{{;]
                        )
                    )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?: # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                            )*
                        )
                        \s* {{
                    )
                )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?: # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                        )
                    )?
                    \s*
                    (?: # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
    )
    ''')