diff options
author | Eric Snow <ericsnowcurrently@gmail.com> | 2020-10-22 18:42:51 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-10-22 18:42:51 -0600 |
commit | 345cd37abe324ad4f60f80e2c3133b8849e54e9b (patch) | |
tree | 5d965e662dca9dcac19e7eddd63a3d9d0b816fed /Tools/c-analyzer/c_parser/parser | |
parent | bpo-38486: Fix dead qmail links in the mailbox docs (GH-22239) (diff) | |
download | cpython-345cd37abe324ad4f60f80e2c3133b8849e54e9b.tar.gz cpython-345cd37abe324ad4f60f80e2c3133b8849e54e9b.tar.bz2 cpython-345cd37abe324ad4f60f80e2c3133b8849e54e9b.zip |
bpo-36876: Fix the C analyzer tool. (GH-22841)
The original tool wasn't working right and it was simpler to create a new one, partially re-using some of the old code. At this point the tool runs properly on the master. (Try: ./python Tools/c-analyzer/c-analyzer.py analyze.) It takes ~40 seconds on my machine to analyze the full CPython code base.
Note that we'll need to iron out some OS-specific stuff (e.g. preprocessor). We're okay though since this tool isn't used yet in our workflow. We will also need to verify the analysis results in detail before activating the check in CI, though I'm pretty sure it's close.
https://bugs.python.org/issue36876
Diffstat (limited to 'Tools/c-analyzer/c_parser/parser')
-rw-r--r-- | Tools/c-analyzer/c_parser/parser/__init__.py | 212 | ||||
-rw-r--r-- | Tools/c-analyzer/c_parser/parser/_alt.py | 6 | ||||
-rw-r--r-- | Tools/c-analyzer/c_parser/parser/_common.py | 115 | ||||
-rw-r--r-- | Tools/c-analyzer/c_parser/parser/_compound_decl_body.py | 158 | ||||
-rw-r--r-- | Tools/c-analyzer/c_parser/parser/_delim.py | 54 | ||||
-rw-r--r-- | Tools/c-analyzer/c_parser/parser/_func_body.py | 278 | ||||
-rw-r--r-- | Tools/c-analyzer/c_parser/parser/_global.py | 179 | ||||
-rw-r--r-- | Tools/c-analyzer/c_parser/parser/_info.py | 168 | ||||
-rw-r--r-- | Tools/c-analyzer/c_parser/parser/_regexes.py | 796 |
9 files changed, 1966 insertions, 0 deletions
diff --git a/Tools/c-analyzer/c_parser/parser/__init__.py b/Tools/c-analyzer/c_parser/parser/__init__.py new file mode 100644 index 00000000000..7cb34caf09e --- /dev/null +++ b/Tools/c-analyzer/c_parser/parser/__init__.py @@ -0,0 +1,212 @@ +"""A simple non-validating parser for C99. + +The functions and regex patterns here are not entirely suitable for +validating C syntax. Please rely on a proper compiler for that. +Instead our goal here is merely matching and extracting information from +valid C code. + +Furthermore, the grammar rules for the C syntax (particularly as +described in the K&R book) actually describe a superset, of which the +full C language is a proper subset. Here are some of the extra +conditions that must be applied when parsing C code: + +* ... + +(see: http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf) + +We have taken advantage of the elements of the C grammar that are used +only in a few limited contexts, mostly as delimiters. They allow us to +focus the regex patterns confidently. 
Here are the relevant tokens and +in which grammar rules they are used: + +separators: +* ";" + + (decl) struct/union: at end of each member decl + + (decl) declaration: at end of each (non-compound) decl + + (stmt) expr stmt: at end of each stmt + + (stmt) for: between exprs in "header" + + (stmt) goto: at end + + (stmt) continue: at end + + (stmt) break: at end + + (stmt) return: at end +* "," + + (decl) struct/union: between member declators + + (decl) param-list: between params + + (decl) enum: between enumerators + + (decl) initializer (compound): between initializers + + (expr) postfix: between func call args + + (expr) expression: between "assignment" exprs +* ":" + + (decl) struct/union: in member declators + + (stmt) label: between label and stmt + + (stmt) case: between expression and stmt + + (stmt) default: between "default" and stmt +* "=" + + (decl) declaration: between decl and initializer + + (decl) enumerator: between identifier and "initializer" + + (expr) assignment: between "var" and expr + +wrappers: +* "(...)" + + (decl) declarator (func ptr): to wrap ptr/name + + (decl) declarator (func ptr): around params + + (decl) declarator: around sub-declarator (for readability) + + (expr) postfix (func call): around args + + (expr) primary: around sub-expr + + (stmt) if: around condition + + (stmt) switch: around source expr + + (stmt) while: around condition + + (stmt) do-while: around condition + + (stmt) for: around "header" +* "{...}" + + (decl) enum: around enumerators + + (decl) func: around body + + (stmt) compound: around stmts +* "[...]" + * (decl) declarator: for arrays + * (expr) postfix: array access + +other: +* "*" + + (decl) declarator: for pointer types + + (expr) unary: for pointer deref + + +To simplify the regular expressions used here, we've taken some +shortcuts and made certain assumptions about the code we are parsing. +Some of these allow us to skip context-sensitive matching (e.g. 
braces) +or otherwise still match arbitrary C code unambiguously. However, in +some cases there are certain corner cases where the patterns are +ambiguous relative to arbitrary C code. However, they are still +unambiguous in the specific code we are parsing. + +Here are the cases where we've taken shortcuts or made assumptions: + +* there is no overlap syntactically between the local context (func + bodies) and the global context (other than variable decls), so we + do not need to worry about ambiguity due to the overlap: + + the global context has no expressions or statements + + the local context has no function definitions or type decls +* no "inline" type declarations (struct, union, enum) in function + parameters ~(including function pointers)~ +* no "inline" type decls in function return types +* no superfluous parentheses in declarators +* var decls in for loops are always "simple" (e.g. no inline types) +* only inline struct/union/enum decls may be anonymous (without a name) +* no function pointers in function pointer parameters +* for loop "headers" do not have curly braces (e.g. compound init) +* syntactically, variable decls do not overlap with stmts/exprs, except + in the following case: + spam (*eggs) (...) + This could be either a function pointer variable named "eggs" + or a call to a function named "spam", which returns a function + pointer that gets called. The only differentiator is the + syntax used in the "..." part. It will be comma-separated + parameters for the former and comma-separated expressions for + the latter. Thus, if we expect such decls or calls then we must + parse the decl params. 
+""" + +""" +TODO: +* extract CPython-specific code +* drop include injection (or only add when needed) +* track position instead of slicing "text" +* Parser class instead of the _iter_source() mess +* alt impl using a state machine (& tokenizer or split on delimiters) +""" + +from ..info import ParsedItem +from ._info import SourceInfo + + +def parse(srclines): + if isinstance(srclines, str): # a filename + raise NotImplementedError + + anon_name = anonymous_names() + for result in _parse(srclines, anon_name): + yield ParsedItem.from_raw(result) + + +# XXX Later: Add a separate function to deal with preprocessor directives +# parsed out of raw source. + + +def anonymous_names(): + counter = 1 + def anon_name(prefix='anon-'): + nonlocal counter + name = f'{prefix}{counter}' + counter += 1 + return name + return anon_name + + +############################# +# internal impl + +import logging + + +_logger = logging.getLogger(__name__) + + +def _parse(srclines, anon_name): + from ._global import parse_globals + + source = _iter_source(srclines) + #source = _iter_source(srclines, showtext=True) + for result in parse_globals(source, anon_name): + # XXX Handle blocks here insted of in parse_globals(). + yield result + + +def _iter_source(lines, *, maxtext=20_000, maxlines=700, showtext=False): + filestack = [] + allinfo = {} + # "lines" should be (fileinfo, data), as produced by the preprocessor code. 
+ for fileinfo, line in lines: + if fileinfo.filename in filestack: + while fileinfo.filename != filestack[-1]: + filename = filestack.pop() + del allinfo[filename] + filename = fileinfo.filename + srcinfo = allinfo[filename] + else: + filename = fileinfo.filename + srcinfo = SourceInfo(filename) + filestack.append(filename) + allinfo[filename] = srcinfo + + _logger.debug(f'-> {line}') + srcinfo._add_line(line, fileinfo.lno) + if len(srcinfo.text) > maxtext: + break + if srcinfo.end - srcinfo.start > maxlines: + break + while srcinfo._used(): + yield srcinfo + if showtext: + _logger.debug(f'=> {srcinfo.text}') + else: + if not filestack: + srcinfo = SourceInfo('???') + else: + filename = filestack[-1] + srcinfo = allinfo[filename] + while srcinfo._used(): + yield srcinfo + if showtext: + _logger.debug(f'=> {srcinfo.text}') + yield srcinfo + if showtext: + _logger.debug(f'=> {srcinfo.text}') + if not srcinfo._ready: + return + # At this point either the file ended prematurely + # or there's "too much" text. + filename, lno, text = srcinfo.filename, srcinfo._start, srcinfo.text + if len(text) > 500: + text = text[:500] + '...' + raise Exception(f'unmatched text ({filename} starting at line {lno}):\n{text}') diff --git a/Tools/c-analyzer/c_parser/parser/_alt.py b/Tools/c-analyzer/c_parser/parser/_alt.py new file mode 100644 index 00000000000..05a9101b4f5 --- /dev/null +++ b/Tools/c-analyzer/c_parser/parser/_alt.py @@ -0,0 +1,6 @@ + +def _parse(srclines, anon_name): + text = ' '.join(l for _, l in srclines) + + from ._delim import parse + yield from parse(text, anon_name) diff --git a/Tools/c-analyzer/c_parser/parser/_common.py b/Tools/c-analyzer/c_parser/parser/_common.py new file mode 100644 index 00000000000..40c36039f3f --- /dev/null +++ b/Tools/c-analyzer/c_parser/parser/_common.py @@ -0,0 +1,115 @@ +import re + +from ._regexes import ( + _ind, + STRING_LITERAL, + VAR_DECL as _VAR_DECL, +) + + +def log_match(group, m): + from . 
import _logger + _logger.debug(f'matched <{group}> ({m.group(0)})') + + +############################# +# regex utils + +def set_capture_group(pattern, group, *, strict=True): + old = f'(?: # <{group}>' + if strict and f'(?: # <{group}>' not in pattern: + raise ValueError(f'{old!r} not found in pattern') + return pattern.replace(old, f'( # <{group}>', 1) + + +def set_capture_groups(pattern, groups, *, strict=True): + for group in groups: + pattern = set_capture_group(pattern, group, strict=strict) + return pattern + + +############################# +# syntax-related utils + +_PAREN_RE = re.compile(rf''' + (?: + (?: + [^'"()]* + {_ind(STRING_LITERAL, 3)} + )* + [^'"()]* + (?: + ( [(] ) + | + ( [)] ) + ) + ) + ''', re.VERBOSE) + + +def match_paren(text, depth=0): + pos = 0 + while (m := _PAREN_RE.match(text, pos)): + pos = m.end() + _open, _close = m.groups() + if _open: + depth += 1 + else: # _close + depth -= 1 + if depth == 0: + return pos + else: + raise ValueError(f'could not find matching parens for {text!r}') + + +VAR_DECL = set_capture_groups(_VAR_DECL, ( + 'STORAGE', + 'TYPE_QUAL', + 'TYPE_SPEC', + 'DECLARATOR', + 'IDENTIFIER', + 'WRAPPED_IDENTIFIER', + 'FUNC_IDENTIFIER', +)) + + +def parse_var_decl(decl): + m = re.match(VAR_DECL, decl, re.VERBOSE) + (storage, typequal, typespec, declarator, + name, + wrappedname, + funcptrname, + ) = m.groups() + if name: + kind = 'simple' + elif wrappedname: + kind = 'wrapped' + name = wrappedname + elif funcptrname: + kind = 'funcptr' + name = funcptrname + else: + raise NotImplementedError + abstract = declarator.replace(name, '') + vartype = { + 'storage': storage, + 'typequal': typequal, + 'typespec': typespec, + 'abstract': abstract, + } + return (kind, name, vartype) + + +############################# +# parser state utils + +# XXX Drop this or use it! 
+def iter_results(results): + if not results: + return + if callable(results): + results = results() + + for result, text in results(): + if result: + yield result, text diff --git a/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py b/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py new file mode 100644 index 00000000000..eb5bc67607b --- /dev/null +++ b/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py @@ -0,0 +1,158 @@ +import re + +from ._regexes import ( + STRUCT_MEMBER_DECL as _STRUCT_MEMBER_DECL, + ENUM_MEMBER_DECL as _ENUM_MEMBER_DECL, +) +from ._common import ( + log_match, + parse_var_decl, + set_capture_groups, +) + + +############################# +# struct / union + +STRUCT_MEMBER_DECL = set_capture_groups(_STRUCT_MEMBER_DECL, ( + 'COMPOUND_TYPE_KIND', + 'COMPOUND_TYPE_NAME', + 'SPECIFIER_QUALIFIER', + 'DECLARATOR', + 'SIZE', + 'ENDING', + 'CLOSE', +)) +STRUCT_MEMBER_RE = re.compile(rf'^ \s* {STRUCT_MEMBER_DECL}', re.VERBOSE) + + +def parse_struct_body(source, anon_name, parent): + done = False + while not done: + done = True + for srcinfo in source: + m = STRUCT_MEMBER_RE.match(srcinfo.text) + if m: + break + else: + # We ran out of lines. + if srcinfo is not None: + srcinfo.done() + return + for item in _parse_struct_next(m, srcinfo, anon_name, parent): + if callable(item): + parse_body = item + yield from parse_body(source) + else: + yield item + done = False + + +def _parse_struct_next(m, srcinfo, anon_name, parent): + (inline_kind, inline_name, + qualspec, declarator, + size, + ending, + close, + ) = m.groups() + remainder = srcinfo.text[m.end():] + + if close: + log_match('compound close', m) + srcinfo.advance(remainder) + + elif inline_kind: + log_match('compound inline', m) + kind = inline_kind + name = inline_name or anon_name('inline-') + # Immediately emit a forward declaration. + yield srcinfo.resolve(kind, name=name, data=None) + + # un-inline the decl. Note that it might not actually be inline. 
+ # We handle the case in the "maybe_inline_actual" branch. + srcinfo.nest( + remainder, + f'{kind} {name}', + ) + def parse_body(source): + _parse_body = DECL_BODY_PARSERS[kind] + + data = [] # members + ident = f'{kind} {name}' + for item in _parse_body(source, anon_name, ident): + if item.kind == 'field': + data.append(item) + else: + yield item + # XXX Should "parent" really be None for inline type decls? + yield srcinfo.resolve(kind, data, name, parent=None) + + srcinfo.resume() + yield parse_body + + else: + # not inline (member) + log_match('compound member', m) + if qualspec: + _, name, data = parse_var_decl(f'{qualspec} {declarator}') + if not name: + name = anon_name('struct-field-') + if size: +# data = (data, size) + data['size'] = int(size) + else: + # This shouldn't happen (we expect each field to have a name). + raise NotImplementedError + name = sized_name or anon_name('struct-field-') + data = int(size) + + yield srcinfo.resolve('field', data, name, parent) # XXX Restart? + if ending == ',': + remainder = rf'{qualspec} {remainder}' + srcinfo.advance(remainder) + + +############################# +# enum + +ENUM_MEMBER_DECL = set_capture_groups(_ENUM_MEMBER_DECL, ( + 'CLOSE', + 'NAME', + 'INIT', + 'ENDING', +)) +ENUM_MEMBER_RE = re.compile(rf'{ENUM_MEMBER_DECL}', re.VERBOSE) + + +def parse_enum_body(source, _anon_name, _parent): + ending = None + while ending != '}': + for srcinfo in source: + m = ENUM_MEMBER_RE.match(srcinfo.text) + if m: + break + else: + # We ran out of lines. 
+ if srcinfo is not None: + srcinfo.done() + return + remainder = srcinfo.text[m.end():] + + (close, + name, init, ending, + ) = m.groups() + if close: + ending = '}' + else: + data = init + yield srcinfo.resolve('field', data, name, _parent) + srcinfo.advance(remainder) + + +############################# + +DECL_BODY_PARSERS = { + 'struct': parse_struct_body, + 'union': parse_struct_body, + 'enum': parse_enum_body, +} diff --git a/Tools/c-analyzer/c_parser/parser/_delim.py b/Tools/c-analyzer/c_parser/parser/_delim.py new file mode 100644 index 00000000000..51433a629d3 --- /dev/null +++ b/Tools/c-analyzer/c_parser/parser/_delim.py @@ -0,0 +1,54 @@ +import re +import textwrap + +from ._regexes import _ind, STRING_LITERAL + + +def parse(text, anon_name): + context = None + data = None + for m in DELIMITER_RE.find_iter(text): + before, opened, closed = m.groups() + delim = opened or closed + + handle_segment = HANDLERS[context][delim] + result, context, data = handle_segment(before, delim, data) + if result: + yield result + + +DELIMITER = textwrap.dedent(rf''' + ( + (?: + [^'"()\[\]{};]* + {_ind(STRING_LITERAL, 3)} + }* + [^'"()\[\]{};]+ + )? # <before> + (?: + ( + [(\[{] + ) # <open> + | + ( + [)\]};] + ) # <close> + )? 
+ ''') +DELIMITER_RE = re.compile(DELIMITER, re.VERBOSE) + +_HANDLERS = { + None: { # global + # opened + '{': ..., + '[': None, + '(': None, + # closed + '}': None, + ']': None, + ')': None, + ';': ..., + }, + '': { + }, +} diff --git a/Tools/c-analyzer/c_parser/parser/_func_body.py b/Tools/c-analyzer/c_parser/parser/_func_body.py new file mode 100644 index 00000000000..42fd459e111 --- /dev/null +++ b/Tools/c-analyzer/c_parser/parser/_func_body.py @@ -0,0 +1,278 @@ +import re + +from ._regexes import ( + LOCAL as _LOCAL, + LOCAL_STATICS as _LOCAL_STATICS, +) +from ._common import ( + log_match, + parse_var_decl, + set_capture_groups, + match_paren, +) +from ._compound_decl_body import DECL_BODY_PARSERS + + +LOCAL = set_capture_groups(_LOCAL, ( + 'EMPTY', + 'INLINE_LEADING', + 'INLINE_PRE', + 'INLINE_KIND', + 'INLINE_NAME', + 'STORAGE', + 'VAR_DECL', + 'VAR_INIT', + 'VAR_ENDING', + 'COMPOUND_BARE', + 'COMPOUND_LABELED', + 'COMPOUND_PAREN', + 'BLOCK_LEADING', + 'BLOCK_OPEN', + 'SIMPLE_STMT', + 'SIMPLE_ENDING', + 'BLOCK_CLOSE', +)) +LOCAL_RE = re.compile(rf'^ \s* {LOCAL}', re.VERBOSE) + + +# Note that parse_function_body() still has trouble with a few files +# in the CPython codebase. + +def parse_function_body(source, name, anon_name): + # XXX + raise NotImplementedError + + +def parse_function_body(name, text, resolve, source, anon_name, parent): + raise NotImplementedError + # For now we do not worry about locals declared in for loop "headers". 
+ depth = 1; + while depth > 0: + m = LOCAL_RE.match(text) + while not m: + text, resolve = continue_text(source, text or '{', resolve) + m = LOCAL_RE.match(text) + text = text[m.end():] + ( + empty, + inline_leading, inline_pre, inline_kind, inline_name, + storage, decl, + var_init, var_ending, + compound_bare, compound_labeled, compound_paren, + block_leading, block_open, + simple_stmt, simple_ending, + block_close, + ) = m.groups() + + if empty: + log_match('', m) + resolve(None, None, None, text) + yield None, text + elif inline_kind: + log_match('', m) + kind = inline_kind + name = inline_name or anon_name('inline-') + data = [] # members + # We must set the internal "text" from _iter_source() to the + # start of the inline compound body, + # Note that this is effectively like a forward reference that + # we do not emit. + resolve(kind, None, name, text, None) + _parse_body = DECL_BODY_PARSERS[kind] + before = [] + ident = f'{kind} {name}' + for member, inline, text in _parse_body(text, resolve, source, anon_name, ident): + if member: + data.append(member) + if inline: + yield from inline + # un-inline the decl. Note that it might not actually be inline. + # We handle the case in the "maybe_inline_actual" branch. + text = f'{inline_leading or ""} {inline_pre or ""} {kind} {name} {text}' + # XXX Should "parent" really be None for inline type decls? + yield resolve(kind, data, name, text, None), text + elif block_close: + log_match('', m) + depth -= 1 + resolve(None, None, None, text) + # XXX This isn't great. Calling resolve() should have + # cleared the closing bracket. However, some code relies + # on the yielded value instead of the resolved one. That + # needs to be fixed. 
+ yield None, text + elif compound_bare: + log_match('', m) + yield resolve('statement', compound_bare, None, text, parent), text + elif compound_labeled: + log_match('', m) + yield resolve('statement', compound_labeled, None, text, parent), text + elif compound_paren: + log_match('', m) + try: + pos = match_paren(text) + except ValueError: + text = f'{compound_paren} {text}' + #resolve(None, None, None, text) + text, resolve = continue_text(source, text, resolve) + yield None, text + else: + head = text[:pos] + text = text[pos:] + if compound_paren == 'for': + # XXX Parse "head" as a compound statement. + stmt1, stmt2, stmt3 = head.split(';', 2) + data = { + 'compound': compound_paren, + 'statements': (stmt1, stmt2, stmt3), + } + else: + data = { + 'compound': compound_paren, + 'statement': head, + } + yield resolve('statement', data, None, text, parent), text + elif block_open: + log_match('', m) + depth += 1 + if block_leading: + # An inline block: the last evaluated expression is used + # in place of the block. + # XXX Combine it with the remainder after the block close. + stmt = f'{block_open}{{<expr>}}...;' + yield resolve('statement', stmt, None, text, parent), text + else: + resolve(None, None, None, text) + yield None, text + elif simple_ending: + log_match('', m) + yield resolve('statement', simple_stmt, None, text, parent), text + elif var_ending: + log_match('', m) + kind = 'variable' + _, name, vartype = parse_var_decl(decl) + data = { + 'storage': storage, + 'vartype': vartype, + } + after = () + if var_ending == ',': + # It was a multi-declaration, so queue up the next one. + _, qual, typespec, _ = vartype.values() + text = f'{storage or ""} {qual or ""} {typespec} {text}' + yield resolve(kind, data, name, text, parent), text + if var_init: + _data = f'{name} = {var_init.strip()}' + yield resolve('statement', _data, None, text, parent), text + else: + # This should be unreachable. 
+ raise NotImplementedError + + +############################# +# static local variables + +LOCAL_STATICS = set_capture_groups(_LOCAL_STATICS, ( + 'INLINE_LEADING', + 'INLINE_PRE', + 'INLINE_KIND', + 'INLINE_NAME', + 'STATIC_DECL', + 'STATIC_INIT', + 'STATIC_ENDING', + 'DELIM_LEADING', + 'BLOCK_OPEN', + 'BLOCK_CLOSE', + 'STMT_END', +)) +LOCAL_STATICS_RE = re.compile(rf'^ \s* {LOCAL_STATICS}', re.VERBOSE) + + +def parse_function_statics(source, func, anon_name): + # For now we do not worry about locals declared in for loop "headers". + depth = 1; + while depth > 0: + for srcinfo in source: + m = LOCAL_STATICS_RE.match(srcinfo.text) + if m: + break + else: + # We ran out of lines. + if srcinfo is not None: + srcinfo.done() + return + for item, depth in _parse_next_local_static(m, srcinfo, + anon_name, func, depth): + if callable(item): + parse_body = item + yield from parse_body(source) + elif item is not None: + yield item + + +def _parse_next_local_static(m, srcinfo, anon_name, func, depth): + (inline_leading, inline_pre, inline_kind, inline_name, + static_decl, static_init, static_ending, + _delim_leading, + block_open, + block_close, + stmt_end, + ) = m.groups() + remainder = srcinfo.text[m.end():] + + if inline_kind: + log_match('func inline', m) + kind = inline_kind + name = inline_name or anon_name('inline-') + # Immediately emit a forward declaration. + yield srcinfo.resolve(kind, name=name, data=None), depth + + # un-inline the decl. Note that it might not actually be inline. + # We handle the case in the "maybe_inline_actual" branch. + srcinfo.nest( + remainder, + f'{inline_leading or ""} {inline_pre or ""} {kind} {name}' + ) + def parse_body(source): + _parse_body = DECL_BODY_PARSERS[kind] + + data = [] # members + ident = f'{kind} {name}' + for item in _parse_body(source, anon_name, ident): + if item.kind == 'field': + data.append(item) + else: + yield item + # XXX Should "parent" really be None for inline type decls? 
+ yield srcinfo.resolve(kind, data, name, parent=None) + + srcinfo.resume() + yield parse_body, depth + + elif static_decl: + log_match('local variable', m) + _, name, data = parse_var_decl(static_decl) + + yield srcinfo.resolve('variable', data, name, parent=func), depth + + if static_init: + srcinfo.advance(f'{name} {static_init} {remainder}') + elif static_ending == ',': + # It was a multi-declaration, so queue up the next one. + _, qual, typespec, _ = data.values() + srcinfo.advance(f'static {qual or ""} {typespec} {remainder}') + else: + srcinfo.advance('') + + else: + log_match('func other', m) + if block_open: + depth += 1 + elif block_close: + depth -= 1 + elif stmt_end: + pass + else: + # This should be unreachable. + raise NotImplementedError + srcinfo.advance(remainder) + yield None, depth diff --git a/Tools/c-analyzer/c_parser/parser/_global.py b/Tools/c-analyzer/c_parser/parser/_global.py new file mode 100644 index 00000000000..35947c12998 --- /dev/null +++ b/Tools/c-analyzer/c_parser/parser/_global.py @@ -0,0 +1,179 @@ +import re + +from ._regexes import ( + GLOBAL as _GLOBAL, +) +from ._common import ( + log_match, + parse_var_decl, + set_capture_groups, +) +from ._compound_decl_body import DECL_BODY_PARSERS +#from ._func_body import parse_function_body +from ._func_body import parse_function_statics as parse_function_body + + +GLOBAL = set_capture_groups(_GLOBAL, ( + 'EMPTY', + 'COMPOUND_LEADING', + 'COMPOUND_KIND', + 'COMPOUND_NAME', + 'FORWARD_KIND', + 'FORWARD_NAME', + 'MAYBE_INLINE_ACTUAL', + 'TYPEDEF_DECL', + 'TYPEDEF_FUNC_PARAMS', + 'VAR_STORAGE', + 'FUNC_INLINE', + 'VAR_DECL', + 'FUNC_PARAMS', + 'FUNC_DELIM', + 'FUNC_LEGACY_PARAMS', + 'VAR_INIT', + 'VAR_ENDING', +)) +GLOBAL_RE = re.compile(rf'^ \s* {GLOBAL}', re.VERBOSE) + + +def parse_globals(source, anon_name): + for srcinfo in source: + m = GLOBAL_RE.match(srcinfo.text) + if not m: + # We need more text. 
+ continue + for item in _parse_next(m, srcinfo, anon_name): + if callable(item): + parse_body = item + yield from parse_body(source) + else: + yield item + else: + # We ran out of lines. + if srcinfo is not None: + srcinfo.done() + return + + +def _parse_next(m, srcinfo, anon_name): + ( + empty, + # compound type decl (maybe inline) + compound_leading, compound_kind, compound_name, + forward_kind, forward_name, maybe_inline_actual, + # typedef + typedef_decl, typedef_func_params, + # vars and funcs + storage, func_inline, decl, + func_params, func_delim, func_legacy_params, + var_init, var_ending, + ) = m.groups() + remainder = srcinfo.text[m.end():] + + if empty: + log_match('global empty', m) + srcinfo.advance(remainder) + + elif maybe_inline_actual: + log_match('maybe_inline_actual', m) + # Ignore forward declarations. + # XXX Maybe return them too (with an "isforward" flag)? + if not maybe_inline_actual.strip().endswith(';'): + remainder = maybe_inline_actual + remainder + yield srcinfo.resolve(forward_kind, None, forward_name) + if maybe_inline_actual.strip().endswith('='): + # We use a dummy prefix for a fake typedef. + # XXX Ideally this case would not be caught by MAYBE_INLINE_ACTUAL. + _, name, data = parse_var_decl(f'{forward_kind} {forward_name} fake_typedef_{forward_name}') + yield srcinfo.resolve('typedef', data, name, parent=None) + remainder = f'{name} {remainder}' + srcinfo.advance(remainder) + + elif compound_kind: + kind = compound_kind + name = compound_name or anon_name('inline-') + # Immediately emit a forward declaration. + yield srcinfo.resolve(kind, name=name, data=None) + + # un-inline the decl. Note that it might not actually be inline. + # We handle the case in the "maybe_inline_actual" branch. 
+ srcinfo.nest( + remainder, + f'{compound_leading or ""} {compound_kind} {name}', + ) + def parse_body(source): + _parse_body = DECL_BODY_PARSERS[compound_kind] + + data = [] # members + ident = f'{kind} {name}' + for item in _parse_body(source, anon_name, ident): + if item.kind == 'field': + data.append(item) + else: + yield item + # XXX Should "parent" really be None for inline type decls? + yield srcinfo.resolve(kind, data, name, parent=None) + + srcinfo.resume() + yield parse_body + + elif typedef_decl: + log_match('typedef', m) + kind = 'typedef' + _, name, data = parse_var_decl(typedef_decl) + if typedef_func_params: + return_type = data + # This matches the data for func declarations. + data = { + 'storage': None, + 'inline': None, + 'params': f'({typedef_func_params})', + 'returntype': return_type, + 'isforward': True, + } + yield srcinfo.resolve(kind, data, name, parent=None) + srcinfo.advance(remainder) + + elif func_delim or func_legacy_params: + log_match('function', m) + kind = 'function' + _, name, return_type = parse_var_decl(decl) + func_params = func_params or func_legacy_params + data = { + 'storage': storage, + 'inline': func_inline, + 'params': f'({func_params})', + 'returntype': return_type, + 'isforward': func_delim == ';', + } + + yield srcinfo.resolve(kind, data, name, parent=None) + srcinfo.advance(remainder) + + if func_delim == '{' or func_legacy_params: + def parse_body(source): + yield from parse_function_body(source, name, anon_name) + yield parse_body + + elif var_ending: + log_match('global variable', m) + kind = 'variable' + _, name, vartype = parse_var_decl(decl) + data = { + 'storage': storage, + 'vartype': vartype, + } + yield srcinfo.resolve(kind, data, name, parent=None) + + if var_ending == ',': + # It was a multi-declaration, so queue up the next one. 
class TextInfo:
    """A piece of source text together with the line range it covers.

    Attributes:
        start: first line number covered (defaults to 1 when not given).
        end: last line number covered.
        text: the accumulated text, stripped when the object is created.
        line: the most recently added line.
        filename: the file the text came from, or None when unknown.
    """

    def __init__(self, text, start=None, end=None, *, filename=None):
        # immutable:
        if not start:
            start = 1
        self.start = start
        # Only consulted by add_line() to validate a FileInfo.
        self.filename = filename

        # mutable:
        # str.splitlines() returns [] for '', so fall back to one empty line.
        lines = text.splitlines() or ['']
        self.text = text.strip()
        if not end:
            end = start + len(lines) - 1
        self.end = end
        self.line = lines[-1]

    def __repr__(self):
        args = (f'{a}={getattr(self, a)!r}'
                for a in ['text', 'start', 'end'])
        return f'{type(self).__name__}({", ".join(args)})'

    def add_line(self, line, lno=None):
        """Append *line* (left-stripped) to the text and move `end` to *lno*.

        *lno* may be None (meaning "the line after `end`"), a plain line
        number, or a FileInfo whose `lno` is then used.

        Raises:
            NotImplementedError: a FileInfo for a different file was given
                (only checked when this object knows its own filename).
        """
        if lno is None:
            lno = self.end + 1
        # Plain line numbers are used as-is; only a FileInfo needs unpacking.
        elif not isinstance(lno, int) and isinstance(lno, FileInfo):
            fileinfo = lno
            if (self.filename is not None
                    and fileinfo.filename != self.filename):
                raise NotImplementedError((fileinfo, self.filename))
            lno = fileinfo.lno
        # XXX Should a line number smaller than self.end be rejected here
        # (NotImplementedError((lno, self.end)))?
        line = line.lstrip()
        self.text += ' ' + line
        self.line = line
        self.end = lno


class SourceInfo:
    """Tracks the not-yet-consumed source text of one file.

    The active text lives in a TextInfo (`_current`); `_nested` is a stack
    of outer TextInfo objects set aside by nest() and restored by resume().
    `_start` is the line number reported by `start`/`end` while there is
    no active text, and the one recorded by resolve().
    """

    _ready = False

    def __init__(self, filename, _current=None):
        # immutable:
        self.filename = filename
        # mutable:
        if isinstance(_current, str):
            _current = TextInfo(_current, filename=filename)
        self._current = _current
        self._start = _current.start if _current else -1
        self._nested = []
        self._set_ready()

    def __repr__(self):
        args = (f'{a}={getattr(self, a)!r}'
                for a in ['filename', '_current'])
        return f'{type(self).__name__}({", ".join(args)})'

    @property
    def start(self):
        """First line of the active text (or `_start` if there is none)."""
        if self._current is None:
            return self._start
        return self._current.start

    @property
    def end(self):
        """Last line of the active text (or `_start` if there is none)."""
        if self._current is None:
            return self._start
        return self._current.end

    @property
    def text(self):
        """The active text, or '' if there is none."""
        if self._current is None:
            return ''
        return self._current.text

    def nest(self, text, before, start=None):
        """Set the active text aside (trimmed to *before*) and switch to *text*."""
        if self._current is None:
            raise Exception('nesting requires active source text')
        current = self._current
        current.text = before
        self._nested.append(current)
        self._replace(text, start)

    def resume(self, remainder=None):
        """Restore the most recently nested text, appending *remainder*.

        When *remainder* is None the text still active is used instead.
        """
        if not self._nested:
            raise Exception('no nested text to resume')
        if self._current is None:
            raise Exception('un-nesting requires active source text')
        if remainder is None:
            remainder = self._current.text
        self._clear()
        self._current = self._nested.pop()
        self._current.text += ' ' + remainder
        self._set_ready()

    def advance(self, remainder, start=None):
        """Replace the active text with *remainder*.

        A blank *remainder* drops the active text entirely, unless text is
        nested, in which case an empty TextInfo is kept active.
        """
        if self._current is None:
            raise Exception('advancing requires active source text')
        if remainder.strip():
            self._replace(remainder, start, fixnested=True)
        elif self._nested:
            # XXX Alternatively: raise Exception('cannot advance while nesting')
            self._replace('', start, fixnested=True)
        else:
            self._clear(start)

    def resolve(self, kind, data, name, parent=None):
        """Build a ParsedItem located at (filename, `_start`)."""
        # "field" isn't a top-level kind, so we leave it as-is.
        if kind and kind != 'field':
            kind = KIND._from_raw(kind)
        fileinfo = FileInfo(self.filename, self._start)
        return ParsedItem(fileinfo, kind, parent, name, data)

    def done(self):
        self._set_ready()

    def _set_ready(self):
        # "Ready" means there is non-blank active text.
        if self._current is None:
            self._ready = False
        else:
            self._ready = self._current.text.strip() != ''

    def _used(self):
        """Return the ready flag and reset it to False."""
        ready = self._ready
        self._ready = False
        return ready

    def _clear(self, start=None):
        """Drop the active text and return it (None if there was none).

        `_start` becomes *start*, defaulting to the end of the dropped text.
        """
        old = self._current
        if self._current is not None:
            # XXX Fail if self._current wasn't used up?
            if start is None:
                start = self._current.end
            self._current = None
        if start is not None:
            self._start = start
        self._set_ready()
        return old

    def _replace(self, text, start=None, *, fixnested=False):
        """Swap the active text for *text*, keeping the current end line."""
        end = self._current.end
        old = self._clear(start)
        self._current = TextInfo(text, self._start, end,
                                 filename=self.filename)
        if fixnested and self._nested and self._nested[-1] is old:
            self._nested[-1] = self._current
        self._set_ready()

    def _add_line(self, line, lno=None):
        """Append a raw source line; blank lines are ignored."""
        if not line.strip():
            # We don't worry about multi-line string literals.
            return
        if self._current is None:
            self._start = lno
            self._current = TextInfo(line, lno, filename=self.filename)
        else:
            # XXX A line number before self._current.end may mean a
            # circular include; should that raise NotImplementedError?
            self._current.add_line(line, lno)
        self._ready = True
)* [^"\\]* ["] + ) + # end string literal + ) + ''') + +_KEYWORD = textwrap.dedent(r''' + (?: + \b + (?: + auto | + extern | + register | + static | + typedef | + + const | + volatile | + + signed | + unsigned | + char | + short | + int | + long | + float | + double | + void | + + struct | + union | + enum | + + goto | + return | + sizeof | + break | + continue | + if | + else | + for | + do | + while | + switch | + case | + default | + entry + ) + \b + ) + ''') +KEYWORD = rf''' + # keyword + {_KEYWORD} + # end keyword + ''' +_KEYWORD = ''.join(_KEYWORD.split()) + +IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )' +# We use a negative lookahead to filter out keywords. +STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )' +ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )' + + +####################################### +# types + +SIMPLE_TYPE = textwrap.dedent(rf''' + # simple type + (?: + \b + (?: + void + | + (?: signed | unsigned ) # implies int + | + (?: + (?: (?: signed | unsigned ) \s+ )? + (?: (?: long | short ) \s+ )? + (?: char | short | int | long | float | double ) + ) + ) + \b + ) + # end simple type + ''') + +COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )' + + +####################################### +# variable declarations + +STORAGE_CLASS = r'(?: \b (?: auto | register | static | extern ) \b )' +TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )' +PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )' + +TYPE_SPEC = textwrap.dedent(rf''' + # type spec + (?: + {_ind(SIMPLE_TYPE, 2)} + | + (?: + [_]*typeof[_]* + \s* [(] + (?: \s* [*&] )* + \s* {STRICT_IDENTIFIER} + \s* [)] + ) + | + # reference to a compound type + (?: + {COMPOUND_TYPE_KIND} + (?: \s* {ANON_IDENTIFIER} )? 
+ ) + | + # reference to a typedef + {STRICT_IDENTIFIER} + ) + # end type spec + ''') + +DECLARATOR = textwrap.dedent(rf''' + # declarator (possibly abstract) + (?: + (?: {PTR_QUALIFIER} \s* )* + (?: + (?: + (?: # <IDENTIFIER> + {STRICT_IDENTIFIER} + ) + (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays + ) + | + (?: + [(] \s* + (?: # <WRAPPED_IDENTIFIER> + {STRICT_IDENTIFIER} + ) + (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays + \s* [)] + ) + | + # func ptr + (?: + [(] (?: \s* {PTR_QUALIFIER} )? \s* + (?: # <FUNC_IDENTIFIER> + {STRICT_IDENTIFIER} + ) + (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays + \s* [)] + # We allow for a single level of paren nesting in parameters. + \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)] + ) + ) + ) + # end declarator + ''') + +VAR_DECL = textwrap.dedent(rf''' + # var decl (and typedef and func return type) + (?: + (?: + (?: # <STORAGE> + {STORAGE_CLASS} + ) + \s* + )? + (?: + (?: # <TYPE_QUAL> + {TYPE_QUALIFIER} + ) + \s* + )? + (?: + (?: # <TYPE_SPEC> + {_ind(TYPE_SPEC, 4)} + ) + ) + \s* + (?: + (?: # <DECLARATOR> + {_ind(DECLARATOR, 4)} + ) + ) + ) + # end var decl + ''') + +INITIALIZER = textwrap.dedent(rf''' + # initializer + (?: + (?: + [(] + # no nested parens (e.g. func ptr) + [^)]* + [)] + \s* + )? + (?: + # a string literal + (?: + (?: {_ind(STRING_LITERAL, 4)} \s* )* + {_ind(STRING_LITERAL, 4)} + ) + | + + # a simple initializer + (?: + (?: + [^'",;{{]* + {_ind(STRING_LITERAL, 4)} + )* + [^'",;{{]* + ) + | + + # a struct/array literal + (?: + # We only expect compound initializers with + # single-variable declarations. + {{ + (?: + [^'";]*? + {_ind(STRING_LITERAL, 5)} + )* + [^'";]*? + }} + (?= \s* ; ) # Note this lookahead. 
+ ) + ) + ) + # end initializer + ''') + + +####################################### +# compound type declarations + +STRUCT_MEMBER_DECL = textwrap.dedent(rf''' + (?: + # inline compound type decl + (?: + (?: # <COMPOUND_TYPE_KIND> + {COMPOUND_TYPE_KIND} + ) + (?: + \s+ + (?: # <COMPOUND_TYPE_NAME> + {STRICT_IDENTIFIER} + ) + )? + \s* {{ + ) + | + (?: + # typed member + (?: + # Technically it doesn't have to have a type... + (?: # <SPECIFIER_QUALIFIER> + (?: {TYPE_QUALIFIER} \s* )? + {_ind(TYPE_SPEC, 5)} + ) + (?: + # If it doesn't have a declarator then it will have + # a size and vice versa. + \s* + (?: # <DECLARATOR> + {_ind(DECLARATOR, 6)} + ) + )? + ) + + # sized member + (?: + \s* [:] \s* + (?: # <SIZE> + \d+ + ) + )? + \s* + (?: # <ENDING> + [,;] + ) + ) + | + (?: + \s* + (?: # <CLOSE> + }} + ) + ) + ) + ''') + +ENUM_MEMBER_DECL = textwrap.dedent(rf''' + (?: + (?: + \s* + (?: # <CLOSE> + }} + ) + ) + | + (?: + \s* + (?: # <NAME> + {IDENTIFIER} + ) + (?: + \s* = \s* + (?: # <INIT> + {_ind(STRING_LITERAL, 4)} + | + [^'",}}]+ + ) + )? + \s* + (?: # <ENDING> + , | }} + ) + ) + ) + ''') + + +####################################### +# statements + +SIMPLE_STMT_BODY = textwrap.dedent(rf''' + # simple statement body + (?: + (?: + [^'"{{}};]* + {_ind(STRING_LITERAL, 3)} + )* + [^'"{{}};]* + #(?= [;{{] ) # Note this lookahead. + ) + # end simple statement body + ''') +SIMPLE_STMT = textwrap.dedent(rf''' + # simple statement + (?: + (?: # <SIMPLE_STMT> + # stmt-inline "initializer" + (?: + return \b + (?: + \s* + {_ind(INITIALIZER, 5)} + )? + ) + | + # variable assignment + (?: + (?: [*] \s* )? + (?: + {STRICT_IDENTIFIER} \s* + (?: . | -> ) \s* + )* + {STRICT_IDENTIFIER} + (?: \s* \[ \s* \d+ \s* \] )? + \s* = \s* + {_ind(INITIALIZER, 4)} + ) + | + # catchall return statement + (?: + return \b + (?: + (?: + [^'";]* + {_ind(STRING_LITERAL, 6)} + )* + \s* [^'";]* + )? 
+ ) + | + # simple statement + (?: + {_ind(SIMPLE_STMT_BODY, 4)} + ) + ) + \s* + (?: # <SIMPLE_ENDING> + ; + ) + ) + # end simple statement + ''') +COMPOUND_STMT = textwrap.dedent(rf''' + # compound statement + (?: + \b + (?: + (?: + (?: # <COMPOUND_BARE> + else | do + ) + \b + ) + | + (?: + (?: # <COMPOUND_LABELED> + (?: + case \b + (?: + [^'":]* + {_ind(STRING_LITERAL, 7)} + )* + \s* [^'":]* + ) + | + default + | + {STRICT_IDENTIFIER} + ) + \s* [:] + ) + | + (?: + (?: # <COMPOUND_PAREN> + for | while | if | switch + ) + \s* (?= [(] ) # Note this lookahead. + ) + ) + \s* + ) + # end compound statement + ''') + + +####################################### +# function bodies + +LOCAL = textwrap.dedent(rf''' + (?: + # an empty statement + (?: # <EMPTY> + ; + ) + | + # inline type decl + (?: + (?: + (?: # <INLINE_LEADING> + [^;{{}}]+? + ) + \s* + )? + (?: # <INLINE_PRE> + (?: {STORAGE_CLASS} \s* )? + (?: {TYPE_QUALIFIER} \s* )? + )? # </INLINE_PRE> + (?: # <INLINE_KIND> + {COMPOUND_TYPE_KIND} + ) + (?: + \s+ + (?: # <INLINE_NAME> + {STRICT_IDENTIFIER} + ) + )? + \s* {{ + ) + | + # var decl + (?: + (?: # <STORAGE> + {STORAGE_CLASS} + )? # </STORAGE> + (?: + \s* + (?: # <VAR_DECL> + {_ind(VAR_DECL, 5)} + ) + ) + (?: + (?: + # initializer + # We expect only basic initializers. + \s* = \s* + (?: # <VAR_INIT> + {_ind(INITIALIZER, 6)} + ) + )? + (?: + \s* + (?: # <VAR_ENDING> + [,;] + ) + ) + ) + ) + | + {_ind(COMPOUND_STMT, 2)} + | + # start-of-block + (?: + (?: # <BLOCK_LEADING> + (?: + [^'"{{}};]* + {_ind(STRING_LITERAL, 5)} + )* + [^'"{{}};]* + # Presumably we will not see "== {{". + [^\s='"{{}});] + \s* + )? # </BLOCK_LEADING> + (?: # <BLOCK_OPEN> + {{ + ) + ) + | + {_ind(SIMPLE_STMT, 2)} + | + # end-of-block + (?: # <BLOCK_CLOSE> + }} + ) + ) + ''') + +LOCAL_STATICS = textwrap.dedent(rf''' + (?: + # inline type decl + (?: + (?: + (?: # <INLINE_LEADING> + [^;{{}}]+? + ) + \s* + )? + (?: # <INLINE_PRE> + (?: {STORAGE_CLASS} \s* )? + (?: {TYPE_QUALIFIER} \s* )? + )? 
+ (?: # <INLINE_KIND> + {COMPOUND_TYPE_KIND} + ) + (?: + \s+ + (?: # <INLINE_NAME> + {STRICT_IDENTIFIER} + ) + )? + \s* {{ + ) + | + # var decl + (?: + # We only look for static variables. + (?: # <STATIC_DECL> + static \b + (?: \s* {TYPE_QUALIFIER} )? + \s* {_ind(TYPE_SPEC, 4)} + \s* {_ind(DECLARATOR, 4)} + ) + \s* + (?: + (?: # <STATIC_INIT> + = \s* + {_ind(INITIALIZER, 4)} + \s* + [,;{{] + ) + | + (?: # <STATIC_ENDING> + [,;] + ) + ) + ) + | + # everything else + (?: + (?: # <DELIM_LEADING> + (?: + [^'"{{}};]* + {_ind(STRING_LITERAL, 4)} + )* + \s* [^'"{{}};]* + ) + (?: + (?: # <BLOCK_OPEN> + {{ + ) + | + (?: # <BLOCK_CLOSE> + }} + ) + | + (?: # <STMT_END> + ; + ) + ) + ) + ) + ''') + + +####################################### +# global declarations + +GLOBAL = textwrap.dedent(rf''' + (?: + # an empty statement + (?: # <EMPTY> + ; + ) + | + + # compound type decl (maybe inline) + (?: + (?: + (?: # <COMPOUND_LEADING> + [^;{{}}]+? + ) + \s* + )? + (?: # <COMPOUND_KIND> + {COMPOUND_TYPE_KIND} + ) + (?: + \s+ + (?: # <COMPOUND_NAME> + {STRICT_IDENTIFIER} + ) + )? + \s* {{ + ) + | + # bogus inline decl artifact + # This simplifies resolving the relative syntactic ambiguity of + # inline structs. + (?: + (?: # <FORWARD_KIND> + {COMPOUND_TYPE_KIND} + ) + \s* + (?: # <FORWARD_NAME> + {ANON_IDENTIFIER} + ) + (?: # <MAYBE_INLINE_ACTUAL> + [^=,;({{[*\]]* + [=,;({{] + ) + ) + | + + # typedef + (?: + \b typedef \b \s* + (?: # <TYPEDEF_DECL> + {_ind(VAR_DECL, 4)} + ) + (?: + # We expect no inline type definitions in the parameters. + \s* [(] \s* + (?: # <TYPEDEF_FUNC_PARAMS> + [^{{;]* + ) + \s* [)] + )? + \s* ; + ) + | + + # func decl/definition & var decls + # XXX dedicated pattern for funcs (more restricted)? + (?: + (?: + (?: # <VAR_STORAGE> + {STORAGE_CLASS} + ) + \s* + )? + (?: + (?: # <FUNC_INLINE> + \b inline \b + ) + \s* + )? 
+ (?: # <VAR_DECL> + {_ind(VAR_DECL, 4)} + ) + (?: + # func decl / definition + (?: + (?: + # We expect no inline type definitions in the parameters. + \s* [(] \s* + (?: # <FUNC_PARAMS> + [^{{;]* + ) + \s* [)] \s* + (?: # <FUNC_DELIM> + [{{;] + ) + ) + | + (?: + # This is some old-school syntax! + \s* [(] \s* + # We throw away the bare names: + {STRICT_IDENTIFIER} + (?: \s* , \s* {STRICT_IDENTIFIER} )* + \s* [)] \s* + + # We keep the trailing param declarations: + (?: # <FUNC_LEGACY_PARAMS> + # There's at least one! + (?: {TYPE_QUALIFIER} \s* )? + {_ind(TYPE_SPEC, 7)} + \s* + {_ind(DECLARATOR, 7)} + \s* ; + (?: + \s* + (?: {TYPE_QUALIFIER} \s* )? + {_ind(TYPE_SPEC, 8)} + \s* + {_ind(DECLARATOR, 8)} + \s* ; + )* + ) + \s* {{ + ) + ) + | + # var / typedef + (?: + (?: + # initializer + # We expect only basic initializers. + \s* = \s* + (?: # <VAR_INIT> + {_ind(INITIALIZER, 6)} + ) + )? + \s* + (?: # <VAR_ENDING> + [,;] + ) + ) + ) + ) + ) + ''') |