| """Parser for the Python metagrammar""" |
| |
| import io |
| import tokenize # from stdlib |
| |
| from .automata import NFA, NFAState |
| |
| |
| class GrammarParser: |
| """Parser for Python grammar files.""" |
| |
| _translation_table = { |
| tokenize.NAME: "NAME", |
| tokenize.STRING: "STRING", |
| tokenize.NEWLINE: "NEWLINE", |
| tokenize.NL: "NL", |
| tokenize.OP: "OP", |
| tokenize.ENDMARKER: "ENDMARKER", |
| tokenize.COMMENT: "COMMENT", |
| } |
| |
| def __init__(self, grammar): |
| self.grammar = grammar |
| grammar_adaptor = io.StringIO(grammar) |
| self.generator = tokenize.generate_tokens(grammar_adaptor.readline) |
| self._gettoken() # Initialize lookahead |
| self._current_rule_name = None |
| |
| def parse(self): |
| """Turn the grammar into a collection of NFAs""" |
        # grammar: (NEWLINE | rule)* ENDMARKER
        while self.type != tokenize.ENDMARKER:
            while self.type == tokenize.NEWLINE:
                self._gettoken()
            # rule: NAME ':' rhs NEWLINE
            self._current_rule_name = self._expect(tokenize.NAME)
            self._expect(tokenize.OP, ":")
            a, z = self._parse_rhs()
            self._expect(tokenize.NEWLINE)

            yield NFA(a, z)

    def _parse_rhs(self):
        # rhs: items ('|' items)*
        a, z = self._parse_items()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState(self._current_rule_name)
            zz = NFAState(self._current_rule_name)
            while True:
                # Add an epsilon arc from the shared start state to the start
                # of this alternative, and from the end of this alternative to
                # the shared end state, so the automaton can choose any one of
                # the alternatives.
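                # The fragment built for `A | B` (epsilon arcs unlabelled):
                #
                #          +--> [A.start] ... [A.end] --+
                #   (aa) --+                            +--> (zz)
                #          +--> [B.start] ... [B.end] --+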
                aa.add_arc(a)
                z.add_arc(zz)
                if self.value != "|":
                    break

                self._gettoken()
                a, z = self._parse_items()
            return aa, zz

    def _parse_items(self):
        # items: item+
        a, b = self._parse_item()
        while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
            c, d = self._parse_item()
            # Add an epsilon arc from the end of the previous item to the
            # beginning of the new one, chaining the items together. This way
            # the end is reachable only after visiting every item.
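            # For `A B` the two fragments end up chained like this:
            #
            #   (a) ... (b) --> (c) ... (d)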
            b.add_arc(c)
            b = d
        return a, b

    def _parse_item(self):
        # item: '[' rhs ']' | atom ['+' | '*']
        if self.value == "[":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, "]")
            # Add an epsilon arc from the beginning straight to the end, so it
            # is possible to skip the whole optional rhs without consuming
            # anything from it.
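            # For `[A]` the fragment is:
            #
            #   (a) ... (z)  plus a direct epsilon arc (a) --> (z)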
            a.add_arc(z)
            return a, z
        else:
            a, z = self._parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self._gettoken()
            z.add_arc(a)
            if value == "+":
                # Keep the distinct end state: the back arc lets us cycle to
                # the beginning and repeat, but the atom must be consumed at
                # least once before the end is reachable.
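                # For `A+`:
                #
                #   (a) ... (z)  with an epsilon back arc (z) --> (a)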
                return a, z
            else:
                # Make the end state the same as the beginning, so we can
                # cycle arbitrarily many times and stop at the beginning,
                # i.e. consume the atom zero or more times.
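                # For `A*`:
                #
                #   (a) ... (z) --> (a)  with (a) doubling as the end state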
                return a, a

    def _parse_atom(self):
        # atom: '(' rhs ')' | NAME | STRING
        if self.value == "(":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, ")")
            return a, z
        elif self.type in (tokenize.NAME, tokenize.STRING):
            a = NFAState(self._current_rule_name)
            z = NFAState(self._current_rule_name)
            # The arc is labelled: we can move to the next state only by
            # consuming the value.
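            #
            #   (a) --value--> (z)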
            a.add_arc(z, self.value)
            self._gettoken()
            return a, z
        else:
            self._raise_error(
                "expected (...) or NAME or STRING, got {} ({})",
                self._translation_table.get(self.type, self.type),
                self.value,
            )

    def _expect(self, type_, value=None):
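        # Check that the lookahead matches, consume it, and return its string
        # value; e.g. `self._expect(tokenize.OP, ":")` consumes a ":" token.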
        if self.type != type_:
            self._raise_error(
                "expected {}, got {} ({})",
                self._translation_table.get(type_, type_),
                self._translation_table.get(self.type, self.type),
                self.value,
            )
        if value is not None and self.value != value:
            self._raise_error("expected {}, got {}", value, self.value)
        value = self.value
        self._gettoken()
        return value

    def _gettoken(self):
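        # Advance the lookahead by one token, transparently skipping comments
        # and non-logical newlines (NL) so the rule parsers never see them.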
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup

    def _raise_error(self, msg, *args):
        if args:
            try:
                msg = msg.format(*args)
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        # Point the SyntaxError at the offending line of the grammar source
        # (self.begin is a (row, column) pair with a 1-indexed row).
        line = self.grammar.splitlines()[self.begin[0] - 1]
        raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))
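

# A minimal smoke-test sketch, left commented out because it is not part of
# the module proper; it assumes `automata.NFA` records the rule name on a
# `name` attribute and that this file is run from within its package:
#
#     if __name__ == "__main__":
#         parser = GrammarParser("start: NAME NEWLINE\n")
#         for nfa in parser.parse():
#             print(nfa.name)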