| # Copyright 2015 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| import parser |
| import symbol |
| import sys |
| import token |
| import tokenize |
| |
| from py_utils.refactor import offset_token |
| |
| |
class Snippet(object):
  """A node in the Python parse tree.

  The Python grammar is defined at:
  https://docs.python.org/2/reference/grammar.html

  There are two types of Snippets:
    TokenSnippets are leaf nodes containing actual text.
    Symbols are internal nodes representing higher-level groupings, and are
        defined by the left-hand sides of the BNFs in the above link.

  Subclasses must implement the type, type_name, children and tokens
  properties and the PrintTree method; the search helpers below are written
  purely in terms of those.
  """

  @property
  def type(self):
    """Return the type code of this node (compared against ints in Find*)."""
    raise NotImplementedError()

  @property
  def type_name(self):
    """Return the printable name corresponding to this node's type."""
    raise NotImplementedError()

  @property
  def children(self):
    """Return a list of this node's children."""
    raise NotImplementedError()

  @property
  def tokens(self):
    """Return a tuple of the tokens this Snippet contains."""
    raise NotImplementedError()

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
    raise NotImplementedError()

  def __str__(self):
    return offset_token.Untokenize(self.tokens)

  def FindAll(self, snippet_type):
    """Yield this node and every descendant matching snippet_type.

    Args:
      snippet_type: Either an int, matched against each node's type, or a
          class (or tuple of classes), matched with isinstance().

    Yields:
      Matching Snippets in pre-order (a node before its children).
    """
    if isinstance(snippet_type, int):
      if self.type == snippet_type:
        yield self
    else:
      if isinstance(self, snippet_type):
        yield self

    for child in self.children:
      for snippet in child.FindAll(snippet_type):
        yield snippet

  def FindChild(self, snippet_type, **kwargs):
    """Return the first direct child matching snippet_type and kwargs.

    Args:
      snippet_type: Either an int, matched against each child's type, or a
          class (or tuple of classes), matched with isinstance().
      **kwargs: Attribute name/value pairs. A child only matches if
          getattr(child, name) equals the value for every pair.

    Returns:
      The first matching child.

    Raises:
      ValueError: If no direct child matches.
    """
    for child in self.children:
      if isinstance(snippet_type, int):
        if child.type != snippet_type:
          continue
      else:
        if not isinstance(child, snippet_type):
          continue

      # Iterate over items(): iterating the dict itself only yields the
      # keys, which cannot be unpacked into (attribute, value) pairs.
      for attribute, value in kwargs.items():
        if getattr(child, attribute) != value:
          break
      else:
        return child
    raise ValueError('%s is not in %s. Children are: %s' %
                     (snippet_type, self, self.children))

  def FindChildren(self, snippet_type):
    """Yield every direct child matching snippet_type.

    Args:
      snippet_type: Either an int, matched against each child's type, or a
          class (or tuple of classes), matched with isinstance().

    Yields:
      Matching direct children, in order. Descendants are not searched.
    """
    if isinstance(snippet_type, int):
      for child in self.children:
        if child.type == snippet_type:
          yield child
    else:
      for child in self.children:
        if isinstance(child, snippet_type):
          yield child
| |
| |
class TokenSnippet(Snippet):
  """A Snippet containing a list of tokens.

  A list of tokens may start with any number of comments and non-terminating
  newlines, but must end with a syntactically meaningful token.
  """

  def __init__(self, token_type, tokens):
    """Initialize the leaf node.

    Args:
      token_type: The type code reported by this snippet's type property.
      tokens: A list of token objects. The last one carries this snippet's
          value; each must expose a writable `string` attribute.
    """
    # For operators and delimiters, the TokenSnippet's type may be more specific
    # than the type of the constituent token. E.g. the TokenSnippet type is
    # token.DOT, but the token type is token.OP. This is because the parser
    # has more context than the tokenizer.
    self._type = token_type
    self._tokens = tokens
    # Set to True once the value setter has been used.
    self._modified = False

  @classmethod
  def Create(cls, token_type, string, offset=(0, 0)):
    """Build a TokenSnippet wrapping a single new OffsetToken."""
    return cls(token_type,
               [offset_token.OffsetToken(token_type, string, offset)])

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return token.tok_name[self.type]

  @property
  def value(self):
    """The string of the final (syntactically meaningful) token."""
    return self._tokens[-1].string

  @value.setter
  def value(self, value):
    self._tokens[-1].string = value
    self._modified = True

  @property
  def children(self):
    # Leaf node: never has children.
    return []

  @property
  def tokens(self):
    return tuple(self._tokens)

  @property
  def modified(self):
    """Whether the value has been assigned since construction."""
    return self._modified

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Write this token's type name and token strings to stream.

    The first line holds the type name followed by the repr of the first
    token's string. Each further token gets its own line, padded so the
    token text lines up under the first one.

    Args:
      indent: Number of spaces to prefix every line with.
      stream: A file-like object with a write() method.
    """
    # stream.write is used instead of the Python 2-only `print >> stream`
    # statement so the output is identical under Python 2 and Python 3.
    stream.write(' ' * indent)
    if not self.tokens:
      stream.write('%s\n' % self.type_name)
      return

    stream.write('%-4s %s\n' % (self.type_name, repr(self.tokens[0].string)))
    for tok in self.tokens[1:]:
      stream.write(' ' * indent)
      stream.write('%s %s\n' % (' ' * max(len(self.type_name), 4),
                                repr(tok.string)))
| |
| |
class Symbol(Snippet):
  """A Snippet containing sub-Snippets.

  The possible types and type_names are defined in Python's symbol module."""

  def __init__(self, symbol_type, children):
    """Initialize the internal node.

    Args:
      symbol_type: The type code reported by this snippet's type property.
      children: The sequence of child Snippets, in source order.
    """
    self._type = symbol_type
    self._children = children

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return symbol.sym_name[self.type]

  @property
  def children(self):
    return self._children

  @children.setter
  def children(self, value):  # pylint: disable=arguments-differ
    self._children = value

  @property
  def tokens(self):
    """Return the concatenation of every child's tokens, as a tuple."""
    tokens = []
    for child in self.children:
      tokens.extend(child.tokens)
    return tuple(tokens)

  @property
  def modified(self):
    """Whether any child reports itself as modified."""
    return any(child.modified for child in self.children)

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Write this subtree to stream, one node per line.

    Chains of single-child nodes are collapsed onto one line, with their
    type names separated by spaces. Children are printed two spaces deeper.

    Args:
      indent: Number of spaces to prefix this node's line with.
      stream: A file-like object with a write() method.
    """
    # stream.write is used instead of the Python 2-only `print >> stream`
    # statement so the output is identical under Python 2 and Python 3.
    stream.write(' ' * indent)

    # If there's only one child, collapse it onto the same line.
    node = self
    while len(node.children) == 1 and len(node.children[0].children) == 1:
      stream.write('%s ' % node.type_name)
      node = node.children[0]

    stream.write('%s\n' % node.type_name)
    for child in node.children:
      child.PrintTree(indent + 2, stream)
| |
| |
def Snippetize(f):
  """Return the syntax tree of the given file.

  Args:
    f: A seekable, readable file-like object containing Python source.

  Returns:
    The root Snippet produced by _SnippetizeNode for the whole file.
  """
  f.seek(0)
  source = f.read()
  parse_tree = parser.st2list(parser.suite(source))
  token_queue = offset_token.Tokenize(f)

  root = _SnippetizeNode(parse_tree, token_queue)
  # _SnippetizeNode pops tokens as it walks the tree; anything left over
  # means the parse tree and the token stream disagree.
  assert not token_queue
  return root
| |
| |
def _SnippetizeNode(node, tokens):
  """Convert one parser node, and everything under it, into a Snippet.

  The parser module gives a syntax tree that discards comments,
  non-terminating newlines, and whitespace information. The tokens given by
  the tokenize module are used to annotate the syntax tree with the
  information needed to exactly reproduce the original source code.

  Args:
    node: A parse-tree node in list form; node[0] is its type code and, for
        symbols, node[1:] are its child nodes.
    tokens: The queue of remaining tokens. It is consumed from the left as
        leaf nodes are created, so it must support popleft().

  Returns:
    A Symbol for internal nodes, or a TokenSnippet for leaf nodes.
  """
  kind = node[0]

  if kind >= token.NT_OFFSET:
    # Symbol: recurse into the children left to right so tokens are
    # consumed in source order.
    sub_snippets = []
    for sub_node in node[1:]:
      sub_snippets.append(_SnippetizeNode(sub_node, tokens))
    return Symbol(kind, tuple(sub_snippets))

  # Token: first soak up any leading comments and non-terminating newlines.
  collected = []
  while tokens and tokens[0].type in (tokenize.COMMENT, tokenize.NL):
    collected.append(tokens.popleft())

  # parser has 2 NEWLINEs right before the end.
  # tokenize has 0 or 1 depending on if the file has one.
  # Create extra nodes without consuming tokens to account for this.
  if kind == token.NEWLINE:
    for pending in tokens:
      if pending.type == token.ENDMARKER:
        return TokenSnippet(kind, collected)
      if pending.type != token.DEDENT:
        break

  # The tokenizer reports every operator as OP, while the parser uses the
  # specific operator type; otherwise the two types must agree.
  assert tokens[0].type == token.OP or kind == tokens[0].type

  collected.append(tokens.popleft())
  return TokenSnippet(kind, collected)