"""Parser for the Python metagrammar"""
import io
import tokenize # from stdlib
from .automata import NFA, NFAState


class GrammarParser:
    """Parser for Python grammar files."""

    _translation_table = {
        tokenize.NAME: "NAME",
        tokenize.STRING: "STRING",
        tokenize.NEWLINE: "NEWLINE",
        tokenize.NL: "NL",
        tokenize.OP: "OP",
        tokenize.ENDMARKER: "ENDMARKER",
        tokenize.COMMENT: "COMMENT",
    }

    def __init__(self, grammar):
        self.grammar = grammar
        grammar_adaptor = io.StringIO(grammar)
        self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
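        # tokenize.generate_tokens yields 5-tuples of
        # (type, string, start, end, line) for each token in the input.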
        self._gettoken()  # Initialize lookahead
        self._current_rule_name = None

    def parse(self):
        """Turn the grammar into a collection of NFAs"""
        # grammar: (NEWLINE | rule)* ENDMARKER
        while self.type != tokenize.ENDMARKER:
            while self.type == tokenize.NEWLINE:
                self._gettoken()
            # rule: NAME ':' rhs NEWLINE
            self._current_rule_name = self._expect(tokenize.NAME)
            self._expect(tokenize.OP, ":")
            a, z = self._parse_rhs()
            self._expect(tokenize.NEWLINE)
            yield NFA(a, z)

    def _parse_rhs(self):
        # rhs: items ('|' items)*
        a, z = self._parse_items()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState(self._current_rule_name)
            zz = NFAState(self._current_rule_name)
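            # Shape built below for an alternation like "A | B":
            #
            #        +--> start(A) ... end(A) --+
            #   aa --+                          +--> zz
            #        +--> start(B) ... end(B) --+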
            while True:
                # Add an epsilon transition from the shared start state to the
                # start of this alternative, and from its end to the shared
                # end state, so that any single alternative can be taken.
                aa.add_arc(a)
                z.add_arc(zz)
                if self.value != "|":
                    break
                self._gettoken()
                a, z = self._parse_items()
            return aa, zz

    def _parse_items(self):
        # items: item+
        a, b = self._parse_item()
        while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
            c, d = self._parse_item()
            # Add a transition from the end of the previous item to the
            # beginning of the new one, chaining all the items together. This
            # way the end state is reachable only after visiting every item.
            b.add_arc(c)
            b = d
        return a, b

    def _parse_item(self):
        # item: '[' rhs ']' | atom ['+' | '*']
        if self.value == "[":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, "]")
            # Add a transition from the beginning straight to the end, making
            # it possible to advance for free to the next state of this item
            # without consuming anything from the rhs.
            a.add_arc(z)
            return a, z
        else:
            a, z = self._parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self._gettoken()
            z.add_arc(a)
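            # Both "+" and "*" add a back-arc from the item's end to its
            # start, so the atom can be matched again:
            #
            #   a ... z
            #   ^     |
            #   +-----+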
            if value == "+":
                # Keep distinct start and end states: the cycle lets the item
                # repeat, but it still has to be traversed at least once.
                return a, z
            else:
                # The end state is the same as the beginning, so the item can
                # repeat arbitrarily and can be skipped entirely.
                return a, a

    def _parse_atom(self):
        # atom: '(' rhs ')' | NAME | STRING
        if self.value == "(":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, ")")
            return a, z
        elif self.type in (tokenize.NAME, tokenize.STRING):
            a = NFAState(self._current_rule_name)
            z = NFAState(self._current_rule_name)
            # We can move to the next state only by consuming the token's value.
            a.add_arc(z, self.value)
            self._gettoken()
            return a, z
        else:
            self._raise_error(
                "expected (...) or NAME or STRING, got {} ({})",
                self._translation_table.get(self.type, self.type),
                self.value,
            )

    def _expect(self, type_, value=None):
        if self.type != type_:
            self._raise_error(
                "expected {}, got {} ({})",
                self._translation_table.get(type_, type_),
                self._translation_table.get(self.type, self.type),
                self.value,
            )
        if value is not None and self.value != value:
            self._raise_error("expected {}, got {}", value, self.value)
        value = self.value
        self._gettoken()
        return value

    def _gettoken(self):
        tup = next(self.generator)
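        # NL marks non-logical line breaks (e.g. blank lines) and COMMENT
        # carries '#' comments; neither is significant in the metagrammar,
        # so skip both and keep only meaningful tokens in the lookahead.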
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup

    def _raise_error(self, msg, *args):
        if args:
            try:
                msg = msg.format(*args)
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        line = self.grammar.splitlines()[self.begin[0] - 1]
        raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))
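

# A minimal usage sketch (illustrative, not part of the original module):
# feed a one-rule grammar string to GrammarParser and collect the resulting
# NFAs. The example grammar below is hypothetical. Because of the relative
# import above, run this as a module inside its package (e.g.
# "python -m pgen.metaparser", package name assumed), not as a standalone
# script.
if __name__ == "__main__":
    example_grammar = "start: NAME ('+' NAME)* NEWLINE\n"
    nfas = list(GrammarParser(example_grammar).parse())
    print(f"parsed {len(nfas)} rule(s)")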