9 files changed, 1966 insertions, 0 deletions
diff --git a/Tools/c-analyzer/c_parser/parser/__init__.py b/Tools/c-analyzer/c_parser/parser/__init__.py
new file mode 100644
index 00000000000..7cb34caf09e
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/__init__.py
@@ -0,0 +1,212 @@
+"""A simple non-validating parser for C99.
+
+The functions and regex patterns here are not entirely suitable for
+validating C syntax.  Please rely on a proper compiler for that.
+Instead our goal here is merely matching and extracting information from
+valid C code.
+
+Furthermore, the grammar rules for the C syntax (particularly as
+described in the K&R book) actually describe a superset, of which the
+full C language is a proper subset.  Here are some of the extra
+conditions that must be applied when parsing C code:
+
+* ...
+
+(see: http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf)
+
+We have taken advantage of the elements of the C grammar that are used
+only in a few limited contexts, mostly as delimiters.  They allow us to
+focus the regex patterns confidently.  Here are the relevant tokens and
+in which grammar rules they are used:
+
+separators:
+* ";"
+   + (decl) struct/union:  at end of each member decl
+   + (decl) declaration:  at end of each (non-compound) decl
+   + (stmt) expr stmt:  at end of each stmt
+   + (stmt) for:  between exprs in "header"
+   + (stmt) goto:  at end
+   + (stmt) continue:  at end
+   + (stmt) break:  at end
+   + (stmt) return:  at end
+* ","
+   + (decl) struct/union:  between member declators
+   + (decl) param-list:  between params
+   + (decl) enum: between enumerators
+   + (decl) initializer (compound):  between initializers
+   + (expr) postfix:  between func call args
+   + (expr) expression:  between "assignment" exprs
+* ":"
+   + (decl) struct/union:  in member declators
+   + (stmt) label:  between label and stmt
+   + (stmt) case:  between expression and stmt
+   + (stmt) default:  between "default" and stmt
+* "="
+   + (decl) declaration:  between decl and initializer
+   + (decl) enumerator:  between identifier and "initializer"
+   + (expr) assignment:  between "var" and expr
+
+wrappers:
+* "(...)"
+   + (decl) declarator (func ptr):  to wrap ptr/name
+   + (decl) declarator (func ptr):  around params
+   + (decl) declarator:  around sub-declarator (for readability)
+   + (expr) postfix (func call):  around args
+   + (expr) primary:  around sub-expr
+   + (stmt) if:  around condition
+   + (stmt) switch:  around source expr
+   + (stmt) while:  around condition
+   + (stmt) do-while:  around condition
+   + (stmt) for:  around "header"
+* "{...}"
+   + (decl) enum:  around enumerators
+   + (decl) func:  around body
+   + (stmt) compound:  around stmts
+* "[...]"
+   + (decl) declarator:  for arrays
+   + (expr) postfix:  array access
+
+other:
+* "*"
+   + (decl) declarator:  for pointer types
+   + (expr) unary:  for pointer deref
+
+
+To simplify the regular expressions used here, we've taken some
+shortcuts and made certain assumptions about the code we are parsing.
+Some of these allow us to skip context-sensitive matching (e.g. braces)
+or otherwise still match arbitrary C code unambiguously.  However, in
+some cases there are certain corner cases where the patterns are
+ambiguous relative to arbitrary C code.  However, they are still
+unambiguous in the specific code we are parsing.
+
+Here are the cases where we've taken shortcuts or made assumptions:
+
+* there is no overlap syntactically between the local context (func
+  bodies) and the global context (other than variable decls), so we
+  do not need to worry about ambiguity due to the overlap:
+   + the global context has no expressions or statements
+   + the local context has no function definitions or type decls
+* no "inline" type declarations (struct, union, enum) in function
+  parameters ~(including function pointers)~
+* no "inline" type decls in function return types
+* no superfluous parentheses in declarators
+* var decls in for loops are always "simple" (e.g. no inline types)
+* only inline struct/union/enum decls may be anonymous (without a name)
+* no function pointers in function pointer parameters
+* for loop "headers" do not have curly braces (e.g. compound init)
+* syntactically, variable decls do not overlap with stmts/exprs, except
+  in the following case:
+    spam (*eggs) (...)
+  This could be either a function pointer variable named "eggs"
+  or a call to a function named "spam", which returns a function
+  pointer that gets called.  The only differentiator is the
+  syntax used in the "..." part.  It will be comma-separated
+  parameters for the former and comma-separated expressions for
+  the latter.  Thus, if we expect such decls or calls then we must
+  parse the decl params.
+"""
+
+"""
+TODO:
+* extract CPython-specific code
+* drop include injection (or only add when needed)
+* track position instead of slicing "text"
+* Parser class instead of the _iter_source() mess
+* alt impl using a state machine (& tokenizer or split on delimiters)
+"""
+
+from ..info import ParsedItem
+from ._info import SourceInfo
+
+
+def parse(srclines):
+    """Yield a ParsedItem for each raw result parsed from "srclines"."""
+    if isinstance(srclines, str):
+        # A filename was given, which is not supported yet.
+        raise NotImplementedError
+    for raw in _parse(srclines, anonymous_names()):
+        yield ParsedItem.from_raw(raw)
+
+
+# XXX Later: Add a separate function to deal with preprocessor directives
+# parsed out of raw source.
+
+
+def anonymous_names():
+    """Return a func that makes up names numbered from 1 (one shared count)."""
+    next_id = [1]  # a one-item list standing in for a "nonlocal" counter
+    def anon_name(prefix='anon-'):
+        name = f'{prefix}{next_id[0]}'
+        next_id[0] += 1
+        return name
+    return anon_name
+
+
+#############################
+# internal impl
+
+import logging
+
+
+_logger = logging.getLogger(__name__)
+
+
+def _parse(srclines, anon_name):
+    """Yield each raw result that parse_globals() finds in "srclines"."""
+    # This import is done at call time rather than at the top of the module.
+    from ._global import parse_globals
+    # For debugging, pass showtext=True to _iter_source().
+    # XXX Handle blocks here instead of in parse_globals().
+    for item in parse_globals(_iter_source(srclines), anon_name):
+        yield item
+
+
+def _iter_source(lines, *, maxtext=20_000, maxlines=700, showtext=False):
+    filestack = []  # filenames, with the most recently started file last
+    allinfo = {}  # filename -> SourceInfo, for the files in filestack
+    # "lines" should be (fileinfo, data), as produced by the preprocessor code.
+    for fileinfo, line in lines:
+        if fileinfo.filename in filestack:  # back in a file already on the stack
+            while fileinfo.filename != filestack[-1]:  # drop the files above it
+                filename = filestack.pop()
+                del allinfo[filename]
+            filename = fileinfo.filename
+            srcinfo = allinfo[filename]
+        else:  # this file is not on the stack, so start tracking it
+            filename = fileinfo.filename
+            srcinfo = SourceInfo(filename)
+            filestack.append(filename)
+            allinfo[filename] = srcinfo
+
+        _logger.debug(f'-> {line}')
+        srcinfo._add_line(line, fileinfo.lno)
+        if len(srcinfo.text) > maxtext:  # too much pending text: fail below
+            break
+        if srcinfo.end - srcinfo.start > maxlines:  # too many lines: fail below
+            break
+        while srcinfo._used():  # keep yielding it for as long as _used() says so
+            yield srcinfo
+            if showtext:
+                _logger.debug(f'=> {srcinfo.text}')
+    else:  # no "break" happened, so the input lines simply ran out
+        if not filestack:  # there were no lines at all
+            srcinfo = SourceInfo('???')
+        else:
+            filename = filestack[-1]
+            srcinfo = allinfo[filename]
+            while srcinfo._used():
+                yield srcinfo
+                if showtext:
+                    _logger.debug(f'=> {srcinfo.text}')
+        yield srcinfo  # one more time, after the lines have run out
+        if showtext:
+            _logger.debug(f'=> {srcinfo.text}')
+        if not srcinfo._ready:  # otherwise fall through to the error below
+            return
+    # At this point either the file ended prematurely
+    # or there's "too much" text.
+    filename, lno, text = srcinfo.filename, srcinfo._start, srcinfo.text
+    if len(text) > 500:  # keep the error message to a readable size
+        text = text[:500] + '...'
+    raise Exception(f'unmatched text ({filename} starting at line {lno}):\n{text}')
diff --git a/Tools/c-analyzer/c_parser/parser/_alt.py b/Tools/c-analyzer/c_parser/parser/_alt.py
new file mode 100644
index 00000000000..05a9101b4f5
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_alt.py
@@ -0,0 +1,6 @@
+
+def _parse(srclines, anon_name):
+    """Join all the lines (with spaces) and parse them via _delim.parse()."""
+    joined = ' '.join(line for _fileinfo, line in srclines)
+    from ._delim import parse
+    yield from parse(joined, anon_name)
diff --git a/Tools/c-analyzer/c_parser/parser/_common.py b/Tools/c-analyzer/c_parser/parser/_common.py
new file mode 100644
index 00000000000..40c36039f3f
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_common.py
@@ -0,0 +1,115 @@
+import re
+
+from ._regexes import (
+    _ind,
+    STRING_LITERAL,
+    VAR_DECL as _VAR_DECL,
+)
+
+
+def log_match(group, m):  # debug-log the text that "m" matched for "group"
+    from . import _logger as logger
+    logger.debug(f'matched <{group}> ({m.group()})')
+
+
+#############################
+# regex utils
+
+def set_capture_group(pattern, group, *, strict=True):  # capture a tagged group
+    noncapturing, capturing = f'(?:  # <{group}>', f'(  # <{group}>'
+    if strict and noncapturing not in pattern:
+        raise ValueError(f'{noncapturing!r} not found in pattern')
+    return pattern.replace(noncapturing, capturing, 1)
+
+
+def set_capture_groups(pattern, groups, *, strict=True):  # one pass per group
+    for name in groups:
+        pattern = set_capture_group(pattern, name, strict=strict)
+    return pattern
+
+
+#############################
+# syntax-related utils
+
+_PAREN_RE = re.compile(rf'''
+    (?:
+        (?:
+            [^'"()]*
+            {_ind(STRING_LITERAL, 3)}
+         )*
+        [^'"()]*
+        (?:
+            ( [(] )
+            |
+            ( [)] )
+         )
+     )
+    ''', re.VERBOSE)  # the two groups: an open paren or a close paren
+
+
+def match_paren(text, depth=0):
+    """Return the position just past the ")" that brings depth back to 0.
+
+    Raise ValueError if the text runs out of parens before that happens.
+    """
+    end = 0
+    while (found := _PAREN_RE.match(text, end)) is not None:
+        end = found.end()
+        opened, _closed = found.groups()
+        depth += 1 if opened else -1
+        if not opened and depth == 0:
+            return end
+    raise ValueError(f'could not find matching parens for {text!r}')
+
+
+VAR_DECL = set_capture_groups(_VAR_DECL, (  # the tagged groups to capture
+    'STORAGE',
+    'TYPE_QUAL',
+    'TYPE_SPEC',
+    'DECLARATOR',
+    'IDENTIFIER',
+    'WRAPPED_IDENTIFIER',
+    'FUNC_IDENTIFIER',
+))  # still a pattern string; parse_var_decl() passes it to re.match()
+
+
+def parse_var_decl(decl):
+    """Split a variable declaration into (kind, name, vartype).
+
+    "kind" is 'simple', 'wrapped', or 'funcptr'.  "vartype" maps
+    'storage', 'typequal', 'typespec', and 'abstract' to the matched text.
+    """
+    m = re.match(VAR_DECL, decl, re.VERBOSE)
+    (storage, typequal, typespec, declarator,
+     simple, wrapped, funcptr) = m.groups()
+    if simple:
+        kind, name = 'simple', simple
+    elif wrapped:
+        kind, name = 'wrapped', wrapped
+    elif funcptr:
+        kind, name = 'funcptr', funcptr
+    else:
+        raise NotImplementedError
+    vartype = {
+        'storage': storage,
+        'typequal': typequal,
+        'typespec': typespec,
+        # the declarator with every occurrence of the name removed
+        'abstract': declarator.replace(name, ''),
+    }
+    return (kind, name, vartype)
+
+
+#############################
+# parser state utils
+
+# XXX Drop this or use it!
+def iter_results(results):
+    """Yield the (result, text) pairs in "results" that have a result."""
+    # Unwrap callables until we have the pairs themselves.  (Before, a
+    # plain iterable got called too, which raised TypeError.)
+    while callable(results):
+        results = results()
+    for result, text in results or ():
+        if result:
+            yield result, text
diff --git a/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py b/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py
new file mode 100644
index 00000000000..eb5bc67607b
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py
@@ -0,0 +1,158 @@
+import re
+
+from ._regexes import (
+    STRUCT_MEMBER_DECL as _STRUCT_MEMBER_DECL,
+    ENUM_MEMBER_DECL as _ENUM_MEMBER_DECL,
+)
+from ._common import (
+    log_match,
+    parse_var_decl,
+    set_capture_groups,
+)
+
+
+#############################
+# struct / union
+
+STRUCT_MEMBER_DECL = set_capture_groups(_STRUCT_MEMBER_DECL, (  # groups to capture
+    'COMPOUND_TYPE_KIND',
+    'COMPOUND_TYPE_NAME',
+    'SPECIFIER_QUALIFIER',
+    'DECLARATOR',
+    'SIZE',
+    'ENDING',
+    'CLOSE',
+))
+STRUCT_MEMBER_RE = re.compile(rf'^ \s* {STRUCT_MEMBER_DECL}', re.VERBOSE)  # anchored; leading whitespace allowed
+
+
+def parse_struct_body(source, anon_name, parent):
+    """Yield the items parsed from a struct/union body, match by match."""
+    srcinfo, done = None, False  # "srcinfo" stays None for an empty source
+    while not done:
+        done = True
+        for srcinfo in source:
+            m = STRUCT_MEMBER_RE.match(srcinfo.text)
+            if m:
+                break
+        else:
+            # We ran out of lines.
+            if srcinfo is not None:
+                srcinfo.done()
+            return
+        for item in _parse_struct_next(m, srcinfo, anon_name, parent):
+            if callable(item):
+                yield from item(source)  # it parses an inline type's body
+            else:
+                yield item
+            done = False  # keep looping as long as something was produced
+
+
+def _parse_struct_next(m, srcinfo, anon_name, parent):
+    """Handle one STRUCT_MEMBER_RE match, yielding whatever it produces.
+
+    That may include a callable, which is meant to be called with the source.
+    """
+    (inline_kind, inline_name,
+     qualspec, declarator,
+     size,
+     ending,
+     close,
+     ) = m.groups()
+    remainder = srcinfo.text[m.end():]
+
+    if close:
+        log_match('compound close', m)
+        srcinfo.advance(remainder)
+
+    elif inline_kind:
+        log_match('compound inline', m)
+        kind = inline_kind
+        name = inline_name or anon_name('inline-')
+        # Immediately emit a forward declaration.
+        yield srcinfo.resolve(kind, name=name, data=None)
+
+        # un-inline the decl.  Note that it might not actually be inline.
+        # We handle the case in the "maybe_inline_actual" branch.
+        srcinfo.nest(
+            remainder,
+            f'{kind} {name}',
+        )
+        def parse_body(source):
+            _parse_body = DECL_BODY_PARSERS[kind]
+
+            data = []  # members
+            ident = f'{kind} {name}'
+            for item in _parse_body(source, anon_name, ident):
+                if item.kind == 'field':
+                    data.append(item)
+                else:
+                    yield item
+            # XXX Should "parent" really be None for inline type decls?
+            yield srcinfo.resolve(kind, data, name, parent=None)
+
+            srcinfo.resume()
+        yield parse_body
+
+    else:
+        # not inline (member)
+        log_match('compound member', m)
+        if not qualspec:
+            # This shouldn't happen (we expect each field to have a name).
+            raise NotImplementedError
+        _, name, data = parse_var_decl(f'{qualspec} {declarator}')
+        if not name:
+            name = anon_name('struct-field-')
+        if size:
+            data['size'] = int(size)
+
+        yield srcinfo.resolve('field', data, name, parent)  # XXX Restart?
+        if ending == ',':
+            remainder = rf'{qualspec} {remainder}'
+        srcinfo.advance(remainder)
+
+
+#############################
+# enum
+
+ENUM_MEMBER_DECL = set_capture_groups(_ENUM_MEMBER_DECL, (  # groups to capture
+    'CLOSE',
+    'NAME',
+    'INIT',
+    'ENDING',
+))
+ENUM_MEMBER_RE = re.compile(rf'{ENUM_MEMBER_DECL}', re.VERBOSE)  # NOTE(review): no "^ \s*" prefix, unlike STRUCT_MEMBER_RE
+
+
+def parse_enum_body(source, _anon_name, _parent):
+    """Yield a "field" item for each enum member until the body ends."""
+    srcinfo = None  # stays None if the source never yields anything
+    ending = None
+    while ending != '}':
+        for srcinfo in source:
+            m = ENUM_MEMBER_RE.match(srcinfo.text)
+            if m:
+                break
+        else:
+            # We ran out of lines.
+            if srcinfo is not None:
+                srcinfo.done()
+            return
+        remainder = srcinfo.text[m.end():]
+
+        close, name, init, ending = m.groups()
+        if close:
+            ending = '}'
+        else:
+            # The initializer text (if any) is the field's data.
+            yield srcinfo.resolve('field', init, name, _parent)
+        srcinfo.advance(remainder)
+
+
+#############################
+
+DECL_BODY_PARSERS = {  # compound type kind -> the parser for its body
+    'struct': parse_struct_body,
+    'union': parse_struct_body,  # unions use the same parser as structs
+    'enum': parse_enum_body,
+}
diff --git a/Tools/c-analyzer/c_parser/parser/_delim.py b/Tools/c-analyzer/c_parser/parser/_delim.py
new file mode 100644
index 00000000000..51433a629d3
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_delim.py
@@ -0,0 +1,54 @@
+import re
+import textwrap
+
+from ._regexes import _ind, STRING_LITERAL
+
+
+def parse(text, anon_name):
+    """Yield each result the delimiter handlers produce for the text."""
+    context = data = None
+    # Compiled patterns provide finditer(); there is no "find_iter".
+    for m in DELIMITER_RE.finditer(text):
+        before, opened, closed = m.groups()
+        delim = opened or closed
+        handle_segment = _HANDLERS[context][delim]  # the table defined below
+        result, context, data = handle_segment(before, delim, data)
+        if result:
+            yield result
+
+
+DELIMITER = textwrap.dedent(rf'''
+    (
+        (?:
+            [^'"()\[\]{{}};]*
+            {_ind(STRING_LITERAL, 3)}
+         )*
+        [^'"()\[\]{{}};]+
+     )?  # <before>
+    (?:
+        (
+            [(\[{{]
+         )  # <open>
+        |
+        (
+            [)\]}};]
+         )  # <close>
+     )?
+    ''')  # NOTE: literal braces are doubled above because this is an f-string
+DELIMITER_RE = re.compile(DELIMITER, re.VERBOSE)
+
+_HANDLERS = {  # context -> delimiter -> handler (only placeholders so far)
+    None: {  # global
+        # opened
+        '{': ...,
+        '[': None,
+        '(': None,
+        # closed
+        '}': None,
+        ']': None,
+        ')': None,
+        ';': ...,
+    },
+    '': {  # NOTE(review): empty table; which context '' stands for is not shown
+    },
+}
diff --git a/Tools/c-analyzer/c_parser/parser/_func_body.py b/Tools/c-analyzer/c_parser/parser/_func_body.py
new file mode 100644
index 00000000000..42fd459e111
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_func_body.py
@@ -0,0 +1,278 @@
+import re
+
+from ._regexes import (
+    LOCAL as _LOCAL,
+    LOCAL_STATICS as _LOCAL_STATICS,
+)
+from ._common import (
+    log_match,
+    parse_var_decl,
+    set_capture_groups,
+    match_paren,
+)
+from ._compound_decl_body import DECL_BODY_PARSERS
+
+
+LOCAL = set_capture_groups(_LOCAL, (  # the tagged groups to capture
+    'EMPTY',
+    'INLINE_LEADING',
+    'INLINE_PRE',
+    'INLINE_KIND',
+    'INLINE_NAME',
+    'STORAGE',
+    'VAR_DECL',
+    'VAR_INIT',
+    'VAR_ENDING',
+    'COMPOUND_BARE',
+    'COMPOUND_LABELED',
+    'COMPOUND_PAREN',
+    'BLOCK_LEADING',
+    'BLOCK_OPEN',
+    'SIMPLE_STMT',
+    'SIMPLE_ENDING',
+    'BLOCK_CLOSE',
+))
+LOCAL_RE = re.compile(rf'^ \s* {LOCAL}', re.VERBOSE)  # anchored; leading whitespace allowed
+
+
+# Note that parse_function_body() still has trouble with a few files
+# in the CPython codebase.
+
+def parse_function_body(source, name, anon_name):
+    # XXX Not implemented.  (The same-named def that follows rebinds the name.)
+    raise NotImplementedError
+
+
+def parse_function_body(name, text, resolve, source, anon_name, parent):
+    raise NotImplementedError  # so everything below is currently unreachable
+    # For now we do not worry about locals declared in for loop "headers".
+    depth = 1;
+    while depth > 0:
+        m = LOCAL_RE.match(text)
+        while not m:
+            text, resolve = continue_text(source, text or '{', resolve)  # NOTE(review): continue_text is not defined or imported in this module
+            m = LOCAL_RE.match(text)
+        text = text[m.end():]
+        (
+         empty,
+         inline_leading, inline_pre, inline_kind, inline_name,
+         storage, decl,
+         var_init, var_ending,
+         compound_bare, compound_labeled, compound_paren,
+         block_leading, block_open,
+         simple_stmt, simple_ending,
+         block_close,
+         ) = m.groups()
+
+        if empty:
+            log_match('', m)
+            resolve(None, None, None, text)
+            yield None, text
+        elif inline_kind:
+            log_match('', m)
+            kind = inline_kind
+            name = inline_name or anon_name('inline-')
+            data = []  # members
+            # We must set the internal "text" from _iter_source() to the
+            # start of the inline compound body,
+            # Note that this is effectively like a forward reference that
+            # we do not emit.
+            resolve(kind, None, name, text, None)
+            _parse_body = DECL_BODY_PARSERS[kind]
+            before = []
+            ident = f'{kind} {name}'
+            for member, inline, text in _parse_body(text, resolve, source, anon_name, ident):
+                if member:
+                    data.append(member)
+                if inline:
+                    yield from inline
+            # un-inline the decl.  Note that it might not actually be inline.
+            # We handle the case in the "maybe_inline_actual" branch.
+            text = f'{inline_leading or ""} {inline_pre or ""} {kind} {name} {text}'
+            # XXX Should "parent" really be None for inline type decls?
+            yield resolve(kind, data, name, text, None), text
+        elif block_close:
+            log_match('', m)
+            depth -= 1
+            resolve(None, None, None, text)
+            # XXX This isn't great.  Calling resolve() should have
+            # cleared the closing bracket.  However, some code relies
+            # on the yielded value instead of the resolved one.  That
+            # needs to be fixed.
+            yield None, text
+        elif compound_bare:
+            log_match('', m)
+            yield resolve('statement', compound_bare, None, text, parent), text
+        elif compound_labeled:
+            log_match('', m)
+            yield resolve('statement', compound_labeled, None, text, parent), text
+        elif compound_paren:
+            log_match('', m)
+            try:
+                pos = match_paren(text)
+            except ValueError:
+                text = f'{compound_paren} {text}'
+                #resolve(None, None, None, text)
+                text, resolve = continue_text(source, text, resolve)
+                yield None, text
+            else:
+                head = text[:pos]
+                text = text[pos:]
+                if compound_paren == 'for':
+                    # XXX Parse "head" as a compound statement.
+                    stmt1, stmt2, stmt3 = head.split(';', 2)
+                    data = {
+                        'compound': compound_paren,
+                        'statements': (stmt1, stmt2, stmt3),
+                    }
+                else:
+                    data = {
+                        'compound': compound_paren,
+                        'statement': head,
+                    }
+                yield resolve('statement', data, None, text, parent), text
+        elif block_open:
+            log_match('', m)
+            depth += 1
+            if block_leading:
+                # An inline block: the last evaluated expression is used
+                # in place of the block.
+                # XXX Combine it with the remainder after the block close.
+                stmt = f'{block_open}{{<expr>}}...;'
+                yield resolve('statement', stmt, None, text, parent), text
+            else:
+                resolve(None, None, None, text)
+                yield None, text
+        elif simple_ending:
+            log_match('', m)
+            yield resolve('statement', simple_stmt, None, text, parent), text
+        elif var_ending:
+            log_match('', m)
+            kind = 'variable'
+            _, name, vartype = parse_var_decl(decl)
+            data = {
+                'storage': storage,
+                'vartype': vartype,
+            }
+            after = ()
+            if var_ending == ',':
+                # It was a multi-declaration, so queue up the next one.
+                _, qual, typespec, _ = vartype.values()
+                text = f'{storage or ""} {qual or ""} {typespec} {text}'
+            yield resolve(kind, data, name, text, parent), text
+            if var_init:
+                _data = f'{name} = {var_init.strip()}'
+                yield resolve('statement', _data, None, text, parent), text
+        else:
+            # This should be unreachable.
+            raise NotImplementedError
+
+
+#############################
+# static local variables
+
+LOCAL_STATICS = set_capture_groups(_LOCAL_STATICS, (  # the tagged groups to capture
+    'INLINE_LEADING',
+    'INLINE_PRE',
+    'INLINE_KIND',
+    'INLINE_NAME',
+    'STATIC_DECL',
+    'STATIC_INIT',
+    'STATIC_ENDING',
+    'DELIM_LEADING',
+    'BLOCK_OPEN',
+    'BLOCK_CLOSE',
+    'STMT_END',
+))
+LOCAL_STATICS_RE = re.compile(rf'^ \s* {LOCAL_STATICS}', re.VERBOSE)  # anchored; leading whitespace allowed
+
+
+def parse_function_statics(source, func, anon_name):
+    """Yield the items found in a function body until its block is closed."""
+    # For now we do not worry about locals declared in for loop "headers".
+    srcinfo, depth = None, 1  # "srcinfo" stays None for an empty source
+    while depth > 0:
+        for srcinfo in source:
+            m = LOCAL_STATICS_RE.match(srcinfo.text)
+            if m:
+                break
+        else:
+            # We ran out of lines.
+            if srcinfo is not None:
+                srcinfo.done()
+            return
+        for item, depth in _parse_next_local_static(m, srcinfo,
+                                                    anon_name, func, depth):
+            if callable(item):
+                yield from item(source)  # it parses an inline type's body
+            elif item is not None:
+                yield item
+
+
+def _parse_next_local_static(m, srcinfo, anon_name, func, depth):  # yields (item, depth) pairs for one match
+    (inline_leading, inline_pre, inline_kind, inline_name,
+     static_decl, static_init, static_ending,
+     _delim_leading,
+     block_open,
+     block_close,
+     stmt_end,
+     ) = m.groups()
+    remainder = srcinfo.text[m.end():]
+
+    if inline_kind:
+        log_match('func inline', m)
+        kind = inline_kind
+        name = inline_name or anon_name('inline-')
+        # Immediately emit a forward declaration.
+        yield srcinfo.resolve(kind, name=name, data=None), depth
+
+        # un-inline the decl.  Note that it might not actually be inline.
+        # We handle the case in the "maybe_inline_actual" branch.
+        srcinfo.nest(
+            remainder,
+            f'{inline_leading or ""} {inline_pre or ""} {kind} {name}'
+        )
+        def parse_body(source):
+            _parse_body = DECL_BODY_PARSERS[kind]
+
+            data = []  # members
+            ident = f'{kind} {name}'
+            for item in _parse_body(source, anon_name, ident):
+                if item.kind == 'field':
+                    data.append(item)
+                else:
+                    yield item
+            # XXX Should "parent" really be None for inline type decls?
+            yield srcinfo.resolve(kind, data, name, parent=None)
+
+            srcinfo.resume()
+        yield parse_body, depth  # parse_function_statics() calls it with the source
+
+    elif static_decl:
+        log_match('local variable', m)
+        _, name, data = parse_var_decl(static_decl)
+
+        yield srcinfo.resolve('variable', data, name, parent=func), depth
+
+        if static_init:
+            srcinfo.advance(f'{name} {static_init} {remainder}')
+        elif static_ending == ',':
+            # It was a multi-declaration, so queue up the next one.
+            _, qual, typespec, _ = data.values()
+            srcinfo.advance(f'static {qual or ""} {typespec} {remainder}')
+        else:
+            srcinfo.advance('')  # NOTE(review): "remainder" is dropped here; confirm that is intended
+
+    else:
+        log_match('func other', m)
+        if block_open:
+            depth += 1
+        elif block_close:
+            depth -= 1
+        elif stmt_end:
+            pass
+        else:
+            # This should be unreachable.
+            raise NotImplementedError
+        srcinfo.advance(remainder)
+        yield None, depth  # no item, but the caller still needs the new depth
diff --git a/Tools/c-analyzer/c_parser/parser/_global.py b/Tools/c-analyzer/c_parser/parser/_global.py
new file mode 100644
index 00000000000..35947c12998
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_global.py
@@ -0,0 +1,179 @@
+import re
+
+from ._regexes import (
+    GLOBAL as _GLOBAL,
+)
+from ._common import (
+    log_match,
+    parse_var_decl,
+    set_capture_groups,
+)
+from ._compound_decl_body import DECL_BODY_PARSERS
+#from ._func_body import parse_function_body
+from ._func_body import parse_function_statics as parse_function_body
+
+
+GLOBAL = set_capture_groups(_GLOBAL, (  # the tagged groups to capture
+    'EMPTY',
+    'COMPOUND_LEADING',
+    'COMPOUND_KIND',
+    'COMPOUND_NAME',
+    'FORWARD_KIND',
+    'FORWARD_NAME',
+    'MAYBE_INLINE_ACTUAL',
+    'TYPEDEF_DECL',
+    'TYPEDEF_FUNC_PARAMS',
+    'VAR_STORAGE',
+    'FUNC_INLINE',
+    'VAR_DECL',
+    'FUNC_PARAMS',
+    'FUNC_DELIM',
+    'FUNC_LEGACY_PARAMS',
+    'VAR_INIT',
+    'VAR_ENDING',
+))
+GLOBAL_RE = re.compile(rf'^ \s* {GLOBAL}', re.VERBOSE)  # anchored; leading whitespace allowed
+
+
+def parse_globals(source, anon_name):
+    """Yield each item parsed from the source at the global level."""
+    srcinfo = None  # stays None if the source never yields anything
+    for srcinfo in source:
+        m = GLOBAL_RE.match(srcinfo.text)
+        if not m:
+            # We need more text.
+            continue
+        for item in _parse_next(m, srcinfo, anon_name):
+            if callable(item):
+                yield from item(source)  # it parses the body that follows
+            else:
+                yield item
+    # We ran out of lines.  (The loop never uses "break", so this runs
+    # whenever the loop finishes, with or without an "else" clause.)
+    if srcinfo is not None:
+        srcinfo.done()
+
+
+def _parse_next(m, srcinfo, anon_name):  # yields the items (and body parsers) for one GLOBAL_RE match
+    (
+     empty,
+     # compound type decl (maybe inline)
+     compound_leading, compound_kind, compound_name,
+     forward_kind, forward_name, maybe_inline_actual,
+     # typedef
+     typedef_decl, typedef_func_params,
+     # vars and funcs
+     storage, func_inline, decl,
+     func_params, func_delim, func_legacy_params,
+     var_init, var_ending,
+     ) = m.groups()
+    remainder = srcinfo.text[m.end():]
+
+    if empty:
+        log_match('global empty', m)
+        srcinfo.advance(remainder)
+
+    elif maybe_inline_actual:
+        log_match('maybe_inline_actual', m)
+        # Ignore forward declarations.
+        # XXX Maybe return them too (with an "isforward" flag)?
+        if not maybe_inline_actual.strip().endswith(';'):
+            remainder = maybe_inline_actual + remainder  # put it back to be parsed next
+        yield srcinfo.resolve(forward_kind, None, forward_name)
+        if maybe_inline_actual.strip().endswith('='):
+            # We use a dummy prefix for a fake typedef.
+            # XXX Ideally this case would not be caught by MAYBE_INLINE_ACTUAL.
+            _, name, data = parse_var_decl(f'{forward_kind} {forward_name} fake_typedef_{forward_name}')
+            yield srcinfo.resolve('typedef', data, name, parent=None)
+            remainder = f'{name} {remainder}'
+        srcinfo.advance(remainder)
+
+    elif compound_kind:  # NOTE(review): the only branch here without a log_match() call
+        kind = compound_kind
+        name = compound_name or anon_name('inline-')
+        # Immediately emit a forward declaration.
+        yield srcinfo.resolve(kind, name=name, data=None)
+
+        # un-inline the decl.  Note that it might not actually be inline.
+        # We handle the case in the "maybe_inline_actual" branch.
+        srcinfo.nest(
+            remainder,
+            f'{compound_leading or ""} {compound_kind} {name}',
+        )
+        def parse_body(source):
+            _parse_body = DECL_BODY_PARSERS[compound_kind]
+
+            data = []  # members
+            ident = f'{kind} {name}'
+            for item in _parse_body(source, anon_name, ident):
+                if item.kind == 'field':
+                    data.append(item)
+                else:
+                    yield item
+            # XXX Should "parent" really be None for inline type decls?
+            yield srcinfo.resolve(kind, data, name, parent=None)
+
+            srcinfo.resume()
+        yield parse_body  # parse_globals() calls it with the source
+
+    elif typedef_decl:
+        log_match('typedef', m)
+        kind = 'typedef'
+        _, name, data = parse_var_decl(typedef_decl)
+        if typedef_func_params:
+            return_type = data
+            # This matches the data for func declarations.
+            data = {
+                'storage': None,
+                'inline': None,
+                'params': f'({typedef_func_params})',
+                'returntype': return_type,
+                'isforward': True,
+            }
+        yield srcinfo.resolve(kind, data, name, parent=None)
+        srcinfo.advance(remainder)
+
+    elif func_delim or func_legacy_params:
+        log_match('function', m)
+        kind = 'function'
+        _, name, return_type = parse_var_decl(decl)
+        func_params = func_params or func_legacy_params
+        data = {
+            'storage': storage,
+            'inline': func_inline,
+            'params': f'({func_params})',
+            'returntype': return_type,
+            'isforward': func_delim == ';',
+        }
+
+        yield srcinfo.resolve(kind, data, name, parent=None)
+        srcinfo.advance(remainder)
+
+        if func_delim == '{' or func_legacy_params:  # a body follows, so have it parsed too
+            def parse_body(source):
+                yield from parse_function_body(source, name, anon_name)
+            yield parse_body
+
+    elif var_ending:
+        log_match('global variable', m)
+        kind = 'variable'
+        _, name, vartype = parse_var_decl(decl)
+        data = {
+            'storage': storage,
+            'vartype': vartype,
+        }
+        yield srcinfo.resolve(kind, data, name, parent=None)
+
+        if var_ending == ',':
+            # It was a multi-declaration, so queue up the next one.
+            _, qual, typespec, _ = vartype.values()  # relies on the key order used by parse_var_decl()
+            remainder = f'{storage or ""} {qual or ""} {typespec} {remainder}'
+        srcinfo.advance(remainder)
+
+        if var_init:
+            _data = f'{name} = {var_init.strip()}'
+            yield srcinfo.resolve('statement', _data, name=None)
+
+    else:
+        # This should be unreachable.
+        raise NotImplementedError
diff --git a/Tools/c-analyzer/c_parser/parser/_info.py b/Tools/c-analyzer/c_parser/parser/_info.py
new file mode 100644
index 00000000000..2dcd5e5e760
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_info.py
@@ -0,0 +1,168 @@
+from ..info import KIND, ParsedItem, FileInfo
+
+
+class TextInfo:
+    """A chunk of source text and the line numbers (start/end) it spans."""
+
+    def __init__(self, text, start=None, end=None):
+        # immutable:
+        self.start = start or 1
+
+        # mutable:
+        lines = text.splitlines() or ['']
+        self.text = text.strip()
+        # Without an explicit end, infer it from the number of lines.
+        self.end = end or self.start + len(lines) - 1
+        self.line = lines[-1]
+
+    def __repr__(self):
+        args = (f'{a}={getattr(self, a)!r}'
+                for a in ['text', 'start', 'end'])
+        return f'{type(self).__name__}({", ".join(args)})'
+
+    def add_line(self, line, lno=None):
+        """Append "line"; "lno" (an int or FileInfo) becomes the new end."""
+        if lno is None:
+            lno = self.end + 1
+        elif isinstance(lno, FileInfo):
+            fileinfo = lno
+            # TextInfo never sets "filename" itself, so only compare
+            # when a caller has attached one (avoids AttributeError).
+            filename = getattr(self, 'filename', None)
+            if filename is not None and fileinfo.filename != filename:
+                raise NotImplementedError((fileinfo, filename))
+            lno = fileinfo.lno
+        # XXX Should this fail if lno < self.end?
+        line = line.lstrip()
+        # The text is kept as a single space-joined line.
+        self.text += ' ' + line
+        self.line = line
+        self.end = lno
+
+
+class SourceInfo:
+    """Tracks the pending (not yet consumed) text of a single source file."""
+
+    _ready = False
+
+    def __init__(self, filename, _current=None):
+        # immutable:
+        self.filename = filename
+        # mutable:
+        if isinstance(_current, str):
+            _current = TextInfo(_current)
+        self._current = _current
+        self._start = _current.start if _current else -1
+        self._nested = []
+        self._set_ready()
+
+    def __repr__(self):
+        args = (f'{a}={getattr(self, a)!r}'
+                for a in ['filename', '_current'])
+        return f'{type(self).__name__}({", ".join(args)})'
+
+    @property
+    def start(self):
+        """The current text's first line number (else the saved start)."""
+        return self._start if self._current is None else self._current.start
+
+    @property
+    def end(self):
+        """The current text's last line number (else the saved start)."""
+        return self._start if self._current is None else self._current.end
+
+    @property
+    def text(self):
+        """The current text, or an empty string if there is none."""
+        return '' if self._current is None else self._current.text
+
+    def nest(self, text, before, start=None):
+        """Set aside the current text (as "before") and switch to "text"."""
+        if self._current is None:
+            raise Exception('nesting requires active source text')
+        current = self._current
+        current.text = before
+        self._nested.append(current)
+        self._replace(text, start)
+
+    def resume(self, remainder=None):
+        """Return to the text set aside by nest(), appending "remainder"."""
+        if not self._nested:
+            raise Exception('no nested text to resume')
+        if self._current is None:
+            raise Exception('un-nesting requires active source text')
+        if remainder is None:
+            remainder = self._current.text
+        self._clear()
+        self._current = self._nested.pop()
+        self._current.text += ' ' + remainder
+        self._set_ready()
+
+    def advance(self, remainder, start=None):
+        """Replace the current text with "remainder" (what is left of it)."""
+        if self._current is None:
+            raise Exception('advancing requires active source text')
+        if remainder.strip():
+            self._replace(remainder, start, fixnested=True)
+        elif self._nested:
+            # XXX Fail here instead ("cannot advance while nesting")?
+            self._replace('', start, fixnested=True)
+        else:
+            self._clear(start)
+
+    def resolve(self, kind, data, name, parent=None):
+        """Build the ParsedItem for a declaration found at the saved start."""
+        # "field" isn't a top-level kind, so we leave it as-is.
+        if kind and kind != 'field':
+            kind = KIND._from_raw(kind)
+        fileinfo = FileInfo(self.filename, self._start)
+        return ParsedItem(fileinfo, kind, parent, name, data)
+
+    def done(self):
+        self._set_ready()
+
+    def _set_ready(self):
+        # There is something to parse only if the text is non-blank.
+        current = self._current
+        self._ready = current is not None and current.text.strip() != ''
+
+    def _used(self):
+        # Report whether there was unparsed text and mark it as consumed.
+        ready = self._ready
+        self._ready = False
+        return ready
+
+    def _clear(self, start=None):
+        # Drop the current text; the saved start defaults to its last line.
+        old = self._current
+        if self._current is not None:
+            # XXX Fail if self._current wasn't used up?
+            if start is None:
+                start = self._current.end
+            self._current = None
+        if start is not None:
+            self._start = start
+        self._set_ready()
+        return old
+
+    def _replace(self, text, start=None, *, fixnested=False):
+        # Swap in new text, keeping the old text's last line number.
+        end = self._current.end
+        old = self._clear(start)
+        self._current = TextInfo(text, self._start, end)
+        if fixnested and self._nested and self._nested[-1] is old:
+            self._nested[-1] = self._current
+        self._set_ready()
+
+    def _add_line(self, line, lno=None):
+        # Blank lines are skipped; anything else marks the text as ready.
+        if not line.strip():
+            # We don't worry about multi-line string literals.
+            return
+        if self._current is None:
+            self._start = lno
+            self._current = TextInfo(line, lno)
+        else:
+            # XXX Fail if lno < self._current.end (a circular include?)?
+            self._current.add_line(line, lno)
+        self._ready = True
diff --git a/Tools/c-analyzer/c_parser/parser/_regexes.py b/Tools/c-analyzer/c_parser/parser/_regexes.py
new file mode 100644
index 00000000000..e9bc31d335a
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_regexes.py
@@ -0,0 +1,796 @@
+# Regular expression patterns for C syntax.
+#
+# None of these patterns has any capturing.  However, a number of them
+# have capturing markers compatible with utils.set_capture_groups().
+
+import textwrap
+
+
+def _ind(text, level=1, edges='both'):
+    """Indent "text" by "level"; "edges" picks which ends get a newline."""
+    pad = '    ' * level
+    body = textwrap.indent(text, pad)
+    if edges in ('pre', 'both'):
+        body = '\n' + pad + body.lstrip()
+    tail = '\n' + '    ' * (level - 1)
+    return body.rstrip() + tail if edges in ('post', 'both') else body
+
+
+#######################################
+# general
+
+HEX = r'(?: [0-9a-fA-F] )'  # a single hexadecimal digit
+
+STRING_LITERAL = textwrap.dedent(rf'''
+    (?:
+        # character literal
+        (?:
+            ['] [^'] [']
+            |
+            ['] \\ . [']
+            |
+            ['] \\x{HEX}{HEX} [']
+            |
+            ['] \\0\d\d [']
+            |
+            (?:
+                ['] \\o[01]\d\d [']
+                |
+                ['] \\o2[0-4]\d [']
+                |
+                ['] \\o25[0-5] [']
+             )
+         )
+        |
+        # string literal
+        (?:
+            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
+         )
+        # end string literal
+     )
+    ''')  # a character literal or a double-quoted string literal
+
+_KEYWORD = textwrap.dedent(r'''
+    (?:
+        \b
+        (?:
+            auto |
+            extern |
+            register |
+            static |
+            typedef |
+
+            const |
+            volatile |
+
+            signed |
+            unsigned |
+            char |
+            short |
+            int |
+            long |
+            float |
+            double |
+            void |
+
+            struct |
+            union |
+            enum |
+
+            goto |
+            return |
+            sizeof |
+            break |
+            continue |
+            if |
+            else |
+            for |
+            do |
+            while |
+            switch |
+            case |
+            default |
+            entry
+         )
+        \b
+     )
+    ''')  # any one of the listed keywords, as a whole word
+KEYWORD = rf'''
+    # keyword
+    {_KEYWORD}
+    # end keyword
+    '''
+_KEYWORD = ''.join(_KEYWORD.split())  # same pattern, all whitespace removed
+
+IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'  # may also match a keyword
+# We use a negative lookahead to filter out keywords.
+STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
+ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
+
+
+#######################################
+# types
+
+SIMPLE_TYPE = textwrap.dedent(rf'''
+    # simple type
+    (?:
+        \b
+        (?:
+            void
+            |
+            (?: signed | unsigned )  # implies int
+            |
+            (?:
+                (?: (?: signed | unsigned ) \s+ )?
+                (?: (?: long | short ) \s+ )?
+                (?: char | short | int | long | float | double )
+             )
+         )
+        \b
+     )
+    # end simple type
+    ''')  # "void" or a builtin numeric type name
+
+COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'  # keyword only
+
+
+#######################################
+# variable declarations
+
+STORAGE_CLASS = r'(?: \b (?: auto | register | static | extern ) \b )'
+TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
+PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'  # "*" or "* const"
+
+TYPE_SPEC = textwrap.dedent(rf'''
+    # type spec
+    (?:
+        {_ind(SIMPLE_TYPE, 2)}
+        |
+        (?:
+            [_]*typeof[_]*
+            \s* [(]
+            (?: \s* [*&] )*
+            \s* {STRICT_IDENTIFIER}
+            \s* [)]
+         )
+        |
+        # reference to a compound type
+        (?:
+            {COMPOUND_TYPE_KIND}
+            (?: \s* {ANON_IDENTIFIER} )?
+         )
+        |
+        # reference to a typedef
+        {STRICT_IDENTIFIER}
+     )
+    # end type spec
+    ''')  # a simple type, typeof(...), compound type ref, or typedef name
+
+DECLARATOR = textwrap.dedent(rf'''
+    # declarator  (possibly abstract)
+    (?:
+        (?: {PTR_QUALIFIER} \s* )*
+        (?:
+            (?:
+                (?:  # <IDENTIFIER>
+                    {STRICT_IDENTIFIER}
+                )
+                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
+             )
+            |
+            (?:
+                [(] \s*
+                (?:  # <WRAPPED_IDENTIFIER>
+                    {STRICT_IDENTIFIER}
+                )
+                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
+                \s* [)]
+             )
+            |
+            # func ptr
+            (?:
+                [(] (?: \s* {PTR_QUALIFIER} )? \s*
+                (?:  # <FUNC_IDENTIFIER>
+                    {STRICT_IDENTIFIER}
+                )
+                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
+                \s* [)]
+                # We allow for a single level of paren nesting in parameters.
+                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
+             )
+         )
+     )
+    # end declarator
+    ''')  # optional "*"s, then a (maybe parenthesized) name or a func ptr
+
+VAR_DECL = textwrap.dedent(rf'''
+    # var decl (and typedef and func return type)
+    (?:
+        (?:
+            (?:  # <STORAGE>
+                {STORAGE_CLASS}
+            )
+            \s*
+        )?
+        (?:
+            (?:  # <TYPE_QUAL>
+                {TYPE_QUALIFIER}
+            )
+            \s*
+         )?
+        (?:
+            (?:  # <TYPE_SPEC>
+                {_ind(TYPE_SPEC, 4)}
+            )
+         )
+        \s*
+        (?:
+            (?:  # <DECLARATOR>
+                {_ind(DECLARATOR, 4)}
+            )
+         )
+     )
+    # end var decl
+    ''')  # [storage] [qualifier] type-spec declarator
+
+INITIALIZER = textwrap.dedent(rf'''
+    # initializer
+    (?:
+        (?:
+            [(]
+            # no nested parens (e.g. func ptr)
+            [^)]*
+            [)]
+            \s*
+         )?
+        (?:
+            # a string literal
+            (?:
+                (?: {_ind(STRING_LITERAL, 4)} \s* )*
+                {_ind(STRING_LITERAL, 4)}
+             )
+            |
+
+            # a simple initializer
+            (?:
+                (?:
+                    [^'",;{{]*
+                    {_ind(STRING_LITERAL, 4)}
+                 )*
+                [^'",;{{]*
+             )
+            |
+
+            # a struct/array literal
+            (?:
+                # We only expect compound initializers with
+                # single-variable declarations.
+                {{
+                (?:
+                    [^'";]*?
+                    {_ind(STRING_LITERAL, 5)}
+                 )*
+                [^'";]*?
+                }}
+                (?= \s* ; )  # Note this lookahead.
+             )
+         )
+     )
+    # end initializer
+    ''')  # optional "(...)" prefix, then strings, a simple value, or "{...}"
+
+
+#######################################
+# compound type declarations
+
+STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
+    (?:
+        # inline compound type decl
+        (?:
+            (?:  # <COMPOUND_TYPE_KIND>
+                {COMPOUND_TYPE_KIND}
+             )
+            (?:
+                \s+
+                (?:  # <COMPOUND_TYPE_NAME>
+                    {STRICT_IDENTIFIER}
+                 )
+             )?
+            \s* {{
+         )
+        |
+        (?:
+            # typed member
+            (?:
+                # Technically it doesn't have to have a type...
+                (?:  # <SPECIFIER_QUALIFIER>
+                    (?: {TYPE_QUALIFIER} \s* )?
+                    {_ind(TYPE_SPEC, 5)}
+                 )
+                (?:
+                    # If it doesn't have a declarator then it will have
+                    # a size and vice versa.
+                    \s*
+                    (?:  # <DECLARATOR>
+                        {_ind(DECLARATOR, 6)}
+                     )
+                 )?
+            )
+
+            # sized member
+            (?:
+                \s* [:] \s*
+                (?:  # <SIZE>
+                    \d+
+                 )
+             )?
+            \s*
+            (?:  # <ENDING>
+                [,;]
+             )
+         )
+        |
+        (?:
+            \s*
+            (?:  # <CLOSE>
+                }}
+             )
+         )
+     )
+    ''')  # an inline type opening, one member, or the closing "}"
+
+ENUM_MEMBER_DECL = textwrap.dedent(rf'''
+    (?:
+        (?:
+            \s*
+            (?:  # <CLOSE>
+                }}
+             )
+         )
+        |
+        (?:
+            \s*
+            (?:  # <NAME>
+                {IDENTIFIER}
+             )
+            (?:
+                \s* = \s*
+                (?:  # <INIT>
+                    {_ind(STRING_LITERAL, 4)}
+                    |
+                    [^'",}}]+
+                 )
+             )?
+            \s*
+            (?:  # <ENDING>
+                , | }}
+             )
+         )
+     )
+    ''')  # the closing "}" or one enumerator (with optional "= value")
+
+
+#######################################
+# statements
+
+SIMPLE_STMT_BODY = textwrap.dedent(rf'''
+    # simple statement body
+    (?:
+        (?:
+            [^'"{{}};]*
+            {_ind(STRING_LITERAL, 3)}
+         )*
+        [^'"{{}};]*
+        #(?= [;{{] )  # Note this lookahead.
+     )
+    # end simple statement body
+    ''')  # text (with any literals) up to the next ";", "{" or "}"
+SIMPLE_STMT = textwrap.dedent(rf'''
+    # simple statement
+    (?:
+        (?:  # <SIMPLE_STMT>
+            # stmt-inline "initializer"
+            (?:
+                return \b
+                (?:
+                    \s*
+                    {_ind(INITIALIZER, 5)}
+                )?
+             )
+            |
+            # variable assignment
+            (?:
+                (?: [*] \s* )?
+                (?:
+                    {STRICT_IDENTIFIER} \s*
+                    (?: [.] | -> ) \s*
+                 )*
+                {STRICT_IDENTIFIER}
+                (?: \s* \[ \s* \d+ \s* \] )?
+                \s* = \s*
+                {_ind(INITIALIZER, 4)}
+             )
+            |
+            # catchall return statement
+            (?:
+                return \b
+                (?:
+                    (?:
+                        [^'";]*
+                        {_ind(STRING_LITERAL, 6)}
+                     )*
+                    \s* [^'";]*
+                 )?
+             )
+            |
+            # simple statement
+            (?:
+                {_ind(SIMPLE_STMT_BODY, 4)}
+             )
+         )
+        \s*
+        (?:  # <SIMPLE_ENDING>
+            ;
+         )
+     )
+    # end simple statement
+    ''')  # one ";"-terminated statement (return, assignment, or other)
+COMPOUND_STMT = textwrap.dedent(rf'''
+    # compound statement
+    (?:
+        \b
+        (?:
+            (?:
+                (?:  # <COMPOUND_BARE>
+                    else | do
+                 )
+                \b
+             )
+            |
+            (?:
+                (?:  # <COMPOUND_LABELED>
+                    (?:
+                        case \b
+                        (?:
+                            [^'":]*
+                            {_ind(STRING_LITERAL, 7)}
+                         )*
+                        \s* [^'":]*
+                     )
+                    |
+                    default
+                    |
+                    {STRICT_IDENTIFIER}
+                 )
+                \s* [:]
+             )
+            |
+            (?:
+                (?:  # <COMPOUND_PAREN>
+                    for | while | if | switch
+                 )
+                \s* (?= [(] )  # Note this lookahead.
+             )
+         )
+        \s*
+     )
+    # end compound statement
+    ''')  # "else"/"do", a label/"case"/"default" + ":", or for/while/if/switch
+
+
+#######################################
+# function bodies
+
+LOCAL = textwrap.dedent(rf'''
+    (?:
+        # an empty statement
+        (?:  # <EMPTY>
+            ;
+         )
+        |
+        # inline type decl
+        (?:
+            (?:
+                (?:  # <INLINE_LEADING>
+                    [^;{{}}]+?
+                 )
+                \s*
+             )?
+            (?:  # <INLINE_PRE>
+                (?: {STORAGE_CLASS} \s* )?
+                (?: {TYPE_QUALIFIER} \s* )?
+             )?  # </INLINE_PRE>
+            (?:  # <INLINE_KIND>
+                {COMPOUND_TYPE_KIND}
+             )
+            (?:
+                \s+
+                (?:  # <INLINE_NAME>
+                    {STRICT_IDENTIFIER}
+                 )
+             )?
+            \s* {{
+         )
+        |
+        # var decl
+        (?:
+            (?:  # <STORAGE>
+                {STORAGE_CLASS}
+             )?  # </STORAGE>
+            (?:
+                \s*
+                (?:  # <VAR_DECL>
+                    {_ind(VAR_DECL, 5)}
+                 )
+             )
+            (?:
+                (?:
+                    # initializer
+                    # We expect only basic initializers.
+                    \s* = \s*
+                    (?:  # <VAR_INIT>
+                        {_ind(INITIALIZER, 6)}
+                     )
+                 )?
+                (?:
+                    \s*
+                    (?:  # <VAR_ENDING>
+                        [,;]
+                     )
+                 )
+             )
+         )
+        |
+        {_ind(COMPOUND_STMT, 2)}
+        |
+        # start-of-block
+        (?:
+            (?:  # <BLOCK_LEADING>
+                (?:
+                    [^'"{{}};]*
+                    {_ind(STRING_LITERAL, 5)}
+                 )*
+                [^'"{{}};]*
+                # Presumably we will not see "== {{".
+                [^\s='"{{}});]
+                \s*
+             )?  # </BLOCK_LEADING>
+            (?:  # <BLOCK_OPEN>
+                {{
+             )
+         )
+        |
+        {_ind(SIMPLE_STMT, 2)}
+        |
+        # end-of-block
+        (?:  # <BLOCK_CLOSE>
+            }}
+         )
+     )
+    ''')  # one statement, declaration, or block delimiter
+
+LOCAL_STATICS = textwrap.dedent(rf'''
+    (?:
+        # inline type decl
+        (?:
+            (?:
+                (?:  # <INLINE_LEADING>
+                    [^;{{}}]+?
+                 )
+                \s*
+             )?
+            (?:  # <INLINE_PRE>
+                (?: {STORAGE_CLASS} \s* )?
+                (?: {TYPE_QUALIFIER} \s* )?
+             )?
+            (?:  # <INLINE_KIND>
+                {COMPOUND_TYPE_KIND}
+             )
+            (?:
+                \s+
+                (?:  # <INLINE_NAME>
+                    {STRICT_IDENTIFIER}
+                 )
+             )?
+            \s* {{
+         )
+        |
+        # var decl
+        (?:
+            # We only look for static variables.
+            (?:  # <STATIC_DECL>
+                static \b
+                (?: \s* {TYPE_QUALIFIER} )?
+                \s* {_ind(TYPE_SPEC, 4)}
+                \s* {_ind(DECLARATOR, 4)}
+             )
+            \s*
+            (?:
+                (?:  # <STATIC_INIT>
+                    = \s*
+                    {_ind(INITIALIZER, 4)}
+                    \s*
+                    [,;{{]
+                 )
+                |
+                (?:  # <STATIC_ENDING>
+                    [,;]
+                 )
+             )
+         )
+        |
+        # everything else
+        (?:
+            (?:  # <DELIM_LEADING>
+                (?:
+                    [^'"{{}};]*
+                    {_ind(STRING_LITERAL, 4)}
+                 )*
+                \s* [^'"{{}};]*
+             )
+            (?:
+                (?:  # <BLOCK_OPEN>
+                    {{
+                 )
+                |
+                (?:  # <BLOCK_CLOSE>
+                    }}
+                 )
+                |
+                (?:  # <STMT_END>
+                    ;
+                 )
+             )
+         )
+     )
+    ''')  # an inline type, a static var decl, or anything up to a delimiter
+
+
+#######################################
+# global declarations
+
+GLOBAL = textwrap.dedent(rf'''
+    (?:
+        # an empty statement
+        (?:  # <EMPTY>
+            ;
+         )
+        |
+
+        # compound type decl (maybe inline)
+        (?:
+            (?:
+                (?:  # <COMPOUND_LEADING>
+                    [^;{{}}]+?
+                 )
+                 \s*
+             )?
+            (?:  # <COMPOUND_KIND>
+                {COMPOUND_TYPE_KIND}
+             )
+            (?:
+                \s+
+                (?:  # <COMPOUND_NAME>
+                    {STRICT_IDENTIFIER}
+                 )
+             )?
+            \s* {{
+         )
+        |
+        # bogus inline decl artifact
+        # This simplifies resolving the relative syntactic ambiguity of
+        # inline structs.
+        (?:
+            (?:  # <FORWARD_KIND>
+                {COMPOUND_TYPE_KIND}
+             )
+            \s*
+            (?:  # <FORWARD_NAME>
+                {ANON_IDENTIFIER}
+             )
+            (?:  # <MAYBE_INLINE_ACTUAL>
+                [^=,;({{[*\]]*
+                [=,;({{]
+             )
+         )
+        |
+
+        # typedef
+        (?:
+            \b typedef \b \s*
+            (?:  # <TYPEDEF_DECL>
+                {_ind(VAR_DECL, 4)}
+             )
+            (?:
+                # We expect no inline type definitions in the parameters.
+                \s* [(] \s*
+                (?:  # <TYPEDEF_FUNC_PARAMS>
+                    [^{{;]*
+                 )
+                \s* [)]
+             )?
+            \s* ;
+         )
+        |
+
+        # func decl/definition & var decls
+        # XXX dedicated pattern for funcs (more restricted)?
+        (?:
+            (?:
+                (?:  # <VAR_STORAGE>
+                    {STORAGE_CLASS}
+                 )
+                \s*
+             )?
+            (?:
+                (?:  # <FUNC_INLINE>
+                    \b inline \b
+                 )
+                \s*
+             )?
+            (?:  # <VAR_DECL>
+                {_ind(VAR_DECL, 4)}
+             )
+            (?:
+                # func decl / definition
+                (?:
+                    (?:
+                        # We expect no inline type definitions in the parameters.
+                        \s* [(] \s*
+                        (?:  # <FUNC_PARAMS>
+                            [^{{;]*
+                         )
+                        \s* [)] \s*
+                        (?:  # <FUNC_DELIM>
+                            [{{;]
+                         )
+                     )
+                    |
+                    (?:
+                        # This is some old-school syntax!
+                        \s* [(] \s*
+                        # We throw away the bare names:
+                        {STRICT_IDENTIFIER}
+                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
+                        \s* [)] \s*
+
+                        # We keep the trailing param declarations:
+                        (?:  # <FUNC_LEGACY_PARAMS>
+                            # There's at least one!
+                            (?: {TYPE_QUALIFIER} \s* )?
+                            {_ind(TYPE_SPEC, 7)}
+                            \s*
+                            {_ind(DECLARATOR, 7)}
+                            \s* ;
+                            (?:
+                                \s*
+                                (?: {TYPE_QUALIFIER} \s* )?
+                                {_ind(TYPE_SPEC, 8)}
+                                \s*
+                                {_ind(DECLARATOR, 8)}
+                                \s* ;
+                             )*
+                         )
+                        \s* {{
+                     )
+                 )
+                |
+                # var / typedef
+                (?:
+                    (?:
+                        # initializer
+                        # We expect only basic initializers.
+                        \s* = \s*
+                        (?:  # <VAR_INIT>
+                            {_ind(INITIALIZER, 6)}
+                         )
+                     )?
+                    \s*
+                    (?:  # <VAR_ENDING>
+                        [,;]
+                     )
+                 )
+             )
+         )
+     )
+    ''')  # one file-scope declaration/definition (or just its opening)