| """ANTLR3 runtime package""" |
| |
| # begin[licence] |
| # |
| # [The "BSD licence"] |
| # Copyright (c) 2005-2008 Terence Parr |
| # All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions |
| # are met: |
| # 1. Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # 2. Redistributions in binary form must reproduce the above copyright |
| # notice, this list of conditions and the following disclaimer in the |
| # documentation and/or other materials provided with the distribution. |
| # 3. The name of the author may not be used to endorse or promote products |
| # derived from this software without specific prior written permission. |
| # |
| # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
| # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
| # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| # |
| # end[licence] |
| |
| import sys |
| import inspect |
| |
| from antlr3 import compatible_api_versions |
| from antlr3.constants import DEFAULT_CHANNEL, HIDDEN_CHANNEL, EOF, \ |
| EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE |
| from antlr3.exceptions import RecognitionException, MismatchedTokenException, \ |
| MismatchedRangeException, MismatchedTreeNodeException, \ |
| NoViableAltException, EarlyExitException, MismatchedSetException, \ |
| MismatchedNotSetException, FailedPredicateException, \ |
| BacktrackingFailed, UnwantedTokenException, MissingTokenException |
| from antlr3.tokens import CommonToken, SKIP_TOKEN |
| from antlr3.compat import set, frozenset, reversed |
| |
| |
| class RecognizerSharedState(object): |
| """ |
| The set of fields needed by an abstract recognizer to recognize input |
| and recover from errors etc... As a separate state object, it can be |
| shared among multiple grammars; e.g., when one grammar imports another. |
| |
    These fields are publicly visible, but the actual state pointer per
    parser is protected.
| """ |
| |
| def __init__(self): |
| # Track the set of token types that can follow any rule invocation. |
| # Stack grows upwards. |
| self.following = [] |
| |
| # This is true when we see an error and before having successfully |
| # matched a token. Prevents generation of more than one error message |
| # per error. |
| self.errorRecovery = False |
| |
| # The index into the input stream where the last error occurred. |
| # This is used to prevent infinite loops where an error is found |
| # but no token is consumed during recovery...another error is found, |
        # ad nauseam. This is a failsafe mechanism to guarantee that at least
| # one token/tree node is consumed for two errors. |
| self.lastErrorIndex = -1 |
| |
| # If 0, no backtracking is going on. Safe to exec actions etc... |
| # If >0 then it's the level of backtracking. |
| self.backtracking = 0 |
| |
        # A mapping from rule index to a per-rule memoization dict that
        # tracks the stop token index. ruleMemo[ruleIndex] is the
        # memoization table for ruleIndex. For key ruleStartIndex, you
        # get back the stop token index for that rule or MEMO_RULE_FAILED.
| # |
| # This is only used if rule memoization is on (which it is by default). |
| self.ruleMemo = None |
| |
| ## Did the recognizer encounter a syntax error? Track how many. |
| self.syntaxErrors = 0 |
| |
| |
| # LEXER FIELDS (must be in same state object to avoid casting |
| # constantly in generated code and Lexer object) :( |
| |
| |
| ## The goal of all lexer rules/methods is to create a token object. |
| # This is an instance variable as multiple rules may collaborate to |
| # create a single token. nextToken will return this object after |
| # matching lexer rule(s). If you subclass to allow multiple token |
| # emissions, then set this to the last token to be matched or |
        # something other than None so that the auto token emit mechanism
        # will not emit another token.
| self.token = None |
| |
| ## What character index in the stream did the current token start at? |
| # Needed, for example, to get the text for current token. Set at |
| # the start of nextToken. |
| self.tokenStartCharIndex = -1 |
| |
| ## The line on which the first character of the token resides |
| self.tokenStartLine = None |
| |
| ## The character position of first character within the line |
| self.tokenStartCharPositionInLine = None |
| |
| ## The channel number for the current token |
| self.channel = None |
| |
| ## The token type for the current token |
| self.type = None |
| |
| ## You can set the text for the current token to override what is in |
        # the input char buffer. Use setText() or set this instance
        # variable directly.
| self.text = None |
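
# How the shared state enables composite grammars, as a rough sketch (the
# class names below are hypothetical; real generated composite recognizers
# wire the state object up automatically):
#
#   state = RecognizerSharedState()
#   root = MyParser(tokens, state=state)          # root grammar
#   delegate = MyParser_MyImport(tokens, state)   # imported grammar
#   # both now share one follow stack, error flags, and memoization table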
| |
| |
| class BaseRecognizer(object): |
| """ |
| @brief Common recognizer functionality. |
| |
| A generic recognizer that can handle recognizers generated from |
| lexer, parser, and tree grammars. This is all the parsing |
| support code essentially; most of it is error recovery stuff and |
| backtracking. |
| """ |
| |
| MEMO_RULE_FAILED = -2 |
| MEMO_RULE_UNKNOWN = -1 |
| |
| # copies from Token object for convenience in actions |
| DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL |
| |
| # for convenience in actions |
| HIDDEN = HIDDEN_CHANNEL |
| |
| # overridden by generated subclasses |
| tokenNames = None |
| |
    # The api_version attribute was introduced in 3.3. If it is not
    # overridden in the generated recognizer, we assume a default of v0.
| api_version = 0 |
| |
| def __init__(self, state=None): |
| # Input stream of the recognizer. Must be initialized by a subclass. |
| self.input = None |
| |
        ## The state of a lexer, parser, or tree parser is collected into a
        # state object so the state can be shared. This sharing is needed to
        # have one grammar import others and share the same error variables
| # and other state variables. It's a kind of explicit multiple |
| # inheritance via delegation of methods and shared state. |
| if state is None: |
| state = RecognizerSharedState() |
| self._state = state |
| |
| if self.api_version not in compatible_api_versions: |
| raise RuntimeError( |
| ("ANTLR version mismatch: " |
| "The recognizer has been generated with API V%s, " |
| "but this runtime does not support this.") |
| % self.api_version) |
| |
| # this one only exists to shut up pylint :( |
| def setInput(self, input): |
| self.input = input |
| |
| |
| def reset(self): |
| """ |
        reset the parser's state; subclasses must rewind the input stream
| """ |
| |
| # wack everything related to error recovery |
| if self._state is None: |
| # no shared state work to do |
| return |
| |
| self._state.following = [] |
| self._state.errorRecovery = False |
| self._state.lastErrorIndex = -1 |
| self._state.syntaxErrors = 0 |
| # wack everything related to backtracking and memoization |
| self._state.backtracking = 0 |
| if self._state.ruleMemo is not None: |
| self._state.ruleMemo = {} |
| |
| |
| def match(self, input, ttype, follow): |
| """ |
| Match current input symbol against ttype. Attempt |
| single token insertion or deletion error recovery. If |
| that fails, throw MismatchedTokenException. |
| |
| To turn off single token insertion or deletion error |
| recovery, override recoverFromMismatchedToken() and have it |
| throw an exception. See TreeParser.recoverFromMismatchedToken(). |
| This way any error in a rule will cause an exception and |
| immediate exit from rule. Rule would recover by resynchronizing |
| to the set of symbols that can follow rule ref. |
| """ |
| |
| matchedSymbol = self.getCurrentInputSymbol(input) |
| if self.input.LA(1) == ttype: |
| self.input.consume() |
| self._state.errorRecovery = False |
| return matchedSymbol |
| |
| if self._state.backtracking > 0: |
| # FIXME: need to return matchedSymbol here as well. damn!! |
| raise BacktrackingFailed |
| |
| matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow) |
| return matchedSymbol |
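
    # A hedged sketch of how generated rule code typically invokes match()
    # (the token type ID and the bitset FOLLOW_ID_in_decl are hypothetical
    # names; real generated parsers define such follow sets per reference):
    #
    #   def decl(self):
    #       id_token = self.match(self.input, ID, FOLLOW_ID_in_decl)
    #       # id_token is the matched (or single-token-recovered) symbol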
| |
| |
    def matchAny(self, input):
        """Match the wildcard ('.' in a grammar); consumes one input symbol."""
| |
| self._state.errorRecovery = False |
| self.input.consume() |
| |
| |
| def mismatchIsUnwantedToken(self, input, ttype): |
| return input.LA(2) == ttype |
| |
| |
| def mismatchIsMissingToken(self, input, follow): |
| if follow is None: |
| # we have no information about the follow; we can only consume |
| # a single token and hope for the best |
| return False |
| |
| # compute what can follow this grammar element reference |
| if EOR_TOKEN_TYPE in follow: |
| viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW() |
| follow = follow | viableTokensFollowingThisRule |
| |
| if len(self._state.following) > 0: |
| # remove EOR if we're not the start symbol |
| follow = follow - set([EOR_TOKEN_TYPE]) |
| |
| # if current token is consistent with what could come after set |
| # then we know we're missing a token; error recovery is free to |
| # "insert" the missing token |
| if input.LA(1) in follow or EOR_TOKEN_TYPE in follow: |
| return True |
| |
| return False |
| |
| |
| def reportError(self, e): |
| """Report a recognition problem. |
| |
        This method sets errorRecovery to indicate the parser is recovering,
        not parsing. Once in recovery mode, no errors are generated.
| To get out of recovery mode, the parser must successfully match |
| a token (after a resync). So it will go: |
| |
| 1. error occurs |
| 2. enter recovery mode, report error |
| 3. consume until token found in resynch set |
| 4. try to resume parsing |
| 5. next match() will reset errorRecovery mode |
| |
| If you override, make sure to update syntaxErrors if you care about |
| that. |
| |
| """ |
| |
| # if we've already reported an error and have not matched a token |
| # yet successfully, don't report any errors. |
| if self._state.errorRecovery: |
| return |
| |
| self._state.syntaxErrors += 1 # don't count spurious |
| self._state.errorRecovery = True |
| |
| self.displayRecognitionError(self.tokenNames, e) |
| |
| |
| def displayRecognitionError(self, tokenNames, e): |
| hdr = self.getErrorHeader(e) |
| msg = self.getErrorMessage(e, tokenNames) |
| self.emitErrorMessage(hdr+" "+msg) |
| |
| |
| def getErrorMessage(self, e, tokenNames): |
| """ |
| What error message should be generated for the various |
| exception types? |
| |
| Not very object-oriented code, but I like having all error message |
| generation within one method rather than spread among all of the |
| exception classes. This also makes it much easier for the exception |
| handling because the exception classes do not have to have pointers back |
| to this object to access utility routines and so on. Also, changing |
| the message for an exception type would be difficult because you |
        would have to subclass the exception, but then somehow get ANTLR
| to make those kinds of exception objects instead of the default. |
| This looks weird, but trust me--it makes the most sense in terms |
| of flexibility. |
| |
| For grammar debugging, you will want to override this to add |
| more information such as the stack frame with |
| getRuleInvocationStack(e, this.getClass().getName()) and, |
| for no viable alts, the decision description and state etc... |
| |
| Override this to change the message generated for one or more |
| exception types. |
| """ |
| |
| if isinstance(e, UnwantedTokenException): |
| tokenName = "<unknown>" |
| if e.expecting == EOF: |
| tokenName = "EOF" |
| |
| else: |
| tokenName = self.tokenNames[e.expecting] |
| |
| msg = "extraneous input %s expecting %s" % ( |
| self.getTokenErrorDisplay(e.getUnexpectedToken()), |
| tokenName |
| ) |
| |
| elif isinstance(e, MissingTokenException): |
| tokenName = "<unknown>" |
| if e.expecting == EOF: |
| tokenName = "EOF" |
| |
| else: |
| tokenName = self.tokenNames[e.expecting] |
| |
| msg = "missing %s at %s" % ( |
| tokenName, self.getTokenErrorDisplay(e.token) |
| ) |
| |
| elif isinstance(e, MismatchedTokenException): |
| tokenName = "<unknown>" |
| if e.expecting == EOF: |
| tokenName = "EOF" |
| else: |
| tokenName = self.tokenNames[e.expecting] |
| |
| msg = "mismatched input " \ |
| + self.getTokenErrorDisplay(e.token) \ |
| + " expecting " \ |
| + tokenName |
| |
| elif isinstance(e, MismatchedTreeNodeException): |
| tokenName = "<unknown>" |
| if e.expecting == EOF: |
| tokenName = "EOF" |
| else: |
| tokenName = self.tokenNames[e.expecting] |
| |
| msg = "mismatched tree node: %s expecting %s" \ |
| % (e.node, tokenName) |
| |
| elif isinstance(e, NoViableAltException): |
| msg = "no viable alternative at input " \ |
| + self.getTokenErrorDisplay(e.token) |
| |
| elif isinstance(e, EarlyExitException): |
| msg = "required (...)+ loop did not match anything at input " \ |
| + self.getTokenErrorDisplay(e.token) |
| |
| elif isinstance(e, MismatchedSetException): |
| msg = "mismatched input " \ |
| + self.getTokenErrorDisplay(e.token) \ |
| + " expecting set " \ |
| + repr(e.expecting) |
| |
| elif isinstance(e, MismatchedNotSetException): |
| msg = "mismatched input " \ |
| + self.getTokenErrorDisplay(e.token) \ |
| + " expecting set " \ |
| + repr(e.expecting) |
| |
| elif isinstance(e, FailedPredicateException): |
| msg = "rule " \ |
| + e.ruleName \ |
| + " failed predicate: {" \ |
| + e.predicateText \ |
| + "}?" |
| |
| else: |
| msg = str(e) |
| |
| return msg |
| |
| |
| def getNumberOfSyntaxErrors(self): |
| """ |
| Get number of recognition errors (lexer, parser, tree parser). Each |
| recognizer tracks its own number. So parser and lexer each have |
        a separate count. Does not count the spurious errors found between
        an error and the next valid token match.
| |
| See also reportError() |
| """ |
| return self._state.syntaxErrors |
| |
| |
| def getErrorHeader(self, e): |
| """ |
| What is the error header, normally line/character position information? |
| """ |
| |
| source_name = self.getSourceName() |
| if source_name is not None: |
| return "%s line %d:%d" % (source_name, e.line, e.charPositionInLine) |
| return "line %d:%d" % (e.line, e.charPositionInLine) |
| |
| |
| def getTokenErrorDisplay(self, t): |
| """ |
| How should a token be displayed in an error message? The default |
| is to display just the text, but during development you might |
| want to have a lot of information spit out. Override in that case |
| to use t.toString() (which, for CommonToken, dumps everything about |
| the token). This is better than forcing you to override a method in |
| your token objects because you don't have to go modify your lexer |
        so that it creates a new token type.
| """ |
| |
| s = t.text |
| if s is None: |
| if t.type == EOF: |
| s = "<EOF>" |
| else: |
| s = "<"+t.type+">" |
| |
| return repr(s) |
| |
| |
| def emitErrorMessage(self, msg): |
| """Override this method to change where error messages go""" |
| sys.stderr.write(msg + '\n') |
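
    # A common customization (a sketch, not part of the runtime itself) is
    # to override emitErrorMessage in a subclass to collect messages rather
    # than write to stderr (MyParser is a hypothetical generated class):
    #
    #   class QuietParser(MyParser):
    #       def __init__(self, *args, **kwargs):
    #           MyParser.__init__(self, *args, **kwargs)
    #           self.messages = []
    #
    #       def emitErrorMessage(self, msg):
    #           self.messages.append(msg)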
| |
| |
| def recover(self, input, re): |
| """ |
| Recover from an error found on the input stream. This is |
| for NoViableAlt and mismatched symbol exceptions. If you enable |
| single token insertion and deletion, this will usually not |
| handle mismatched symbol exceptions but there could be a mismatched |
| token that the match() routine could not recover from. |
| """ |
| |
| # PROBLEM? what if input stream is not the same as last time |
| # perhaps make lastErrorIndex a member of input |
| if self._state.lastErrorIndex == input.index(): |
| # uh oh, another error at same token index; must be a case |
| # where LT(1) is in the recovery token set so nothing is |
| # consumed; consume a single token so at least to prevent |
| # an infinite loop; this is a failsafe. |
| input.consume() |
| |
| self._state.lastErrorIndex = input.index() |
| followSet = self.computeErrorRecoverySet() |
| |
| self.beginResync() |
| self.consumeUntil(input, followSet) |
| self.endResync() |
| |
| |
| def beginResync(self): |
| """ |
| A hook to listen in on the token consumption during error recovery. |
        The DebugParser subclasses this to fire events to the listener.
| """ |
| |
| pass |
| |
| |
| def endResync(self): |
| """ |
| A hook to listen in on the token consumption during error recovery. |
        The DebugParser subclasses this to fire events to the listener.
| """ |
| |
| pass |
| |
| |
| def computeErrorRecoverySet(self): |
| """ |
| Compute the error recovery set for the current rule. During |
| rule invocation, the parser pushes the set of tokens that can |
| follow that rule reference on the stack; this amounts to |
| computing FIRST of what follows the rule reference in the |
| enclosing rule. This local follow set only includes tokens |
| from within the rule; i.e., the FIRST computation done by |
| ANTLR stops at the end of a rule. |
| |
| EXAMPLE |
| |
| When you find a "no viable alt exception", the input is not |
| consistent with any of the alternatives for rule r. The best |
| thing to do is to consume tokens until you see something that |
| can legally follow a call to r *or* any rule that called r. |
| You don't want the exact set of viable next tokens because the |
| input might just be missing a token--you might consume the |
| rest of the input looking for one of the missing tokens. |
| |
| Consider grammar: |
| |
| a : '[' b ']' |
| | '(' b ')' |
| ; |
| b : c '^' INT ; |
| c : ID |
| | INT |
| ; |
| |
| At each rule invocation, the set of tokens that could follow |
| that rule is pushed on a stack. Here are the various "local" |
| follow sets: |
| |
| FOLLOW(b1_in_a) = FIRST(']') = ']' |
| FOLLOW(b2_in_a) = FIRST(')') = ')' |
| FOLLOW(c_in_b) = FIRST('^') = '^' |
| |
| Upon erroneous input "[]", the call chain is |
| |
| a -> b -> c |
| |
| and, hence, the follow context stack is: |
| |
        depth  local follow set     after call to rule
          0         <EOF>                    a (from main())
          1          ']'                     b
          2          '^'                     c
| |
| Notice that ')' is not included, because b would have to have |
| been called from a different context in rule a for ')' to be |
| included. |
| |
| For error recovery, we cannot consider FOLLOW(c) |
| (context-sensitive or otherwise). We need the combined set of |
| all context-sensitive FOLLOW sets--the set of all tokens that |
| could follow any reference in the call chain. We need to |
| resync to one of those tokens. Note that FOLLOW(c)='^' and if |
| we resync'd to that token, we'd consume until EOF. We need to |
| sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. |
        In this case, for input "[]", LA(1) is in this set, so we would
        not consume anything. After printing an error, rule c would
        return normally. It would not find the required '^' though.
| At this point, it gets a mismatched token error and throws an |
| exception (since LA(1) is not in the viable following token |
| set). The rule exception handler tries to recover, but finds |
| the same recovery set and doesn't consume anything. Rule b |
| exits normally returning to rule a. Now it finds the ']' (and |
| with the successful match exits errorRecovery mode). |
| |
        So, you can see that the parser walks up the call chain looking
        for the token that was a member of the recovery set.
| |
| Errors are not generated in errorRecovery mode. |
| |
| ANTLR's error recovery mechanism is based upon original ideas: |
| |
| "Algorithms + Data Structures = Programs" by Niklaus Wirth |
| |
| and |
| |
| "A note on error recovery in recursive descent parsers": |
| http://portal.acm.org/citation.cfm?id=947902.947905 |
| |
| Later, Josef Grosch had some good ideas: |
| |
| "Efficient and Comfortable Error Recovery in Recursive Descent |
| Parsers": |
| ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip |
| |
| Like Grosch I implemented local FOLLOW sets that are combined |
| at run-time upon error to avoid overhead during parsing. |
| """ |
| |
| return self.combineFollows(False) |
| |
| |
| def computeContextSensitiveRuleFOLLOW(self): |
| """ |
| Compute the context-sensitive FOLLOW set for current rule. |
| This is set of token types that can follow a specific rule |
| reference given a specific call chain. You get the set of |
| viable tokens that can possibly come next (lookahead depth 1) |
| given the current call chain. Contrast this with the |
| definition of plain FOLLOW for rule r: |
| |
| FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} |
| |
        where x in T and alpha, beta in V*; T is the set of terminals and
        V is the set of terminals and nonterminals. In other words,
| FOLLOW(r) is the set of all tokens that can possibly follow |
| references to r in *any* sentential form (context). At |
| runtime, however, we know precisely which context applies as |
| we have the call chain. We may compute the exact (rather |
| than covering superset) set of following tokens. |
| |
| For example, consider grammar: |
| |
| stat : ID '=' expr ';' // FOLLOW(stat)=={EOF} |
| | "return" expr '.' |
| ; |
| expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'} |
| atom : INT // FOLLOW(atom)=={'+',')',';','.'} |
| | '(' expr ')' |
| ; |
| |
| The FOLLOW sets are all inclusive whereas context-sensitive |
| FOLLOW sets are precisely what could follow a rule reference. |
        For input "i=(3);", here is the derivation:
| |
| stat => ID '=' expr ';' |
| => ID '=' atom ('+' atom)* ';' |
| => ID '=' '(' expr ')' ('+' atom)* ';' |
| => ID '=' '(' atom ')' ('+' atom)* ';' |
| => ID '=' '(' INT ')' ('+' atom)* ';' |
| => ID '=' '(' INT ')' ';' |
| |
| At the "3" token, you'd have a call chain of |
| |
| stat -> expr -> atom -> expr -> atom |
| |
| What can follow that specific nested ref to atom? Exactly ')' |
| as you can see by looking at the derivation of this specific |
| input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. |
| |
| You want the exact viable token set when recovering from a |
| token mismatch. Upon token mismatch, if LA(1) is member of |
| the viable next token set, then you know there is most likely |
| a missing token in the input stream. "Insert" one by just not |
| throwing an exception. |
| """ |
| |
| return self.combineFollows(True) |
| |
| |
| def combineFollows(self, exact): |
| followSet = set() |
| for idx, localFollowSet in reversed(list(enumerate(self._state.following))): |
| followSet |= localFollowSet |
| if exact: |
| # can we see end of rule? |
| if EOR_TOKEN_TYPE in localFollowSet: |
| # Only leave EOR in set if at top (start rule); this lets |
| # us know if have to include follow(start rule); i.e., EOF |
| if idx > 0: |
| followSet.remove(EOR_TOKEN_TYPE) |
| |
| else: |
| # can't see end of rule, quit |
| break |
| |
| return followSet |
| |
| |
| def recoverFromMismatchedToken(self, input, ttype, follow): |
| """Attempt to recover from a single missing or extra token. |
| |
| EXTRA TOKEN |
| |
| LA(1) is not what we are looking for. If LA(2) has the right token, |
| however, then assume LA(1) is some extra spurious token. Delete it |
        and use LA(2) as if we were doing a normal match(), which advances
        the input.
| |
| MISSING TOKEN |
| |
        If the current token is consistent with what could come after
        ttype, then it is ok to 'insert' the missing token, else throw an
        exception. For example, input 'i=(3;' is clearly missing the
        ')'. When the parser returns from the nested call to expr, it
| will have call chain: |
| |
| stat -> expr -> atom |
| |
| and it will be trying to match the ')' at this point in the |
| derivation: |
| |
| => ID '=' '(' INT ')' ('+' atom)* ';' |
| ^ |
| match() will see that ';' doesn't match ')' and report a |
| mismatched token error. To recover, it sees that LA(1)==';' |
| is in the set of tokens that can follow the ')' token |
| reference in rule atom. It can assume that you forgot the ')'. |
| """ |
| |
| e = None |
| |
| # if next token is what we are looking for then "delete" this token |
| if self.mismatchIsUnwantedToken(input, ttype): |
| e = UnwantedTokenException(ttype, input) |
| |
| self.beginResync() |
| input.consume() # simply delete extra token |
| self.endResync() |
| |
| # report after consuming so AW sees the token in the exception |
| self.reportError(e) |
| |
| # we want to return the token we're actually matching |
| matchedSymbol = self.getCurrentInputSymbol(input) |
| |
| # move past ttype token as if all were ok |
| input.consume() |
| return matchedSymbol |
| |
| # can't recover with single token deletion, try insertion |
| if self.mismatchIsMissingToken(input, follow): |
| inserted = self.getMissingSymbol(input, e, ttype, follow) |
| e = MissingTokenException(ttype, input, inserted) |
| |
| # report after inserting so AW sees the token in the exception |
| self.reportError(e) |
| return inserted |
| |
| # even that didn't work; must throw the exception |
| e = MismatchedTokenException(ttype, input) |
| raise e |
| |
| |
| def recoverFromMismatchedSet(self, input, e, follow): |
| """Not currently used""" |
| |
| if self.mismatchIsMissingToken(input, follow): |
| self.reportError(e) |
| # we don't know how to conjure up a token for sets yet |
| return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow) |
| |
| # TODO do single token deletion like above for Token mismatch |
| raise e |
| |
| |
| def getCurrentInputSymbol(self, input): |
| """ |
| Match needs to return the current input symbol, which gets put |
| into the label for the associated token ref; e.g., x=ID. Token |
| and tree parsers need to return different objects. Rather than test |
| for input stream type or change the IntStream interface, I use |
| a simple method to ask the recognizer to tell me what the current |
| input symbol is. |
| |
| This is ignored for lexers. |
| """ |
| |
| return None |
| |
| |
| def getMissingSymbol(self, input, e, expectedTokenType, follow): |
| """Conjure up a missing token during error recovery. |
| |
| The recognizer attempts to recover from single missing |
| symbols. But, actions might refer to that missing symbol. |
| For example, x=ID {f($x);}. The action clearly assumes |
| that there has been an identifier matched previously and that |
| $x points at that token. If that token is missing, but |
| the next token in the stream is what we want we assume that |
| this token is missing and we keep going. Because we |
| have to return some token to replace the missing token, |
| we have to conjure one up. This method gives the user control |
| over the tokens returned for missing tokens. Mostly, |
| you will want to create something special for identifier |
| tokens. For literals such as '{' and ',', the default |
| action in the parser or tree parser works. It simply creates |
        a CommonToken of the appropriate type, with text derived from the
        token type. If you change what tokens must be created by the lexer,
| override this method to create the appropriate tokens. |
| """ |
| |
| return None |
| |
| |
| ## def recoverFromMissingElement(self, input, e, follow): |
| ## """ |
| ## This code is factored out from mismatched token and mismatched set |
| ## recovery. It handles "single token insertion" error recovery for |
| ## both. No tokens are consumed to recover from insertions. Return |
| ## true if recovery was possible else return false. |
| ## """ |
| |
| ## if self.mismatchIsMissingToken(input, follow): |
| ## self.reportError(e) |
| ## return True |
| |
| ## # nothing to do; throw exception |
| ## return False |
| |
| |
| def consumeUntil(self, input, tokenTypes): |
| """ |
| Consume tokens until one matches the given token or token set |
| |
| tokenTypes can be a single token type or a set of token types |
| |
| """ |
| |
| if not isinstance(tokenTypes, (set, frozenset)): |
| tokenTypes = frozenset([tokenTypes]) |
| |
| ttype = input.LA(1) |
| while ttype != EOF and ttype not in tokenTypes: |
| input.consume() |
| ttype = input.LA(1) |
| |
| |
| def getRuleInvocationStack(self): |
| """ |
        Return a list of the rule names in your parser instance
        leading up to a call to this method. You could override if
        you want more details such as the file/line info of where
        in the parser code a rule is invoked.
| |
| This is very useful for error messages and for context-sensitive |
| error recovery. |
| |
        You must be careful if you subclass a generated recognizer.
| The default implementation will only search the module of self |
| for rules, but the subclass will not contain any rules. |
| You probably want to override this method to look like |
| |
| def getRuleInvocationStack(self): |
| return self._getRuleInvocationStack(<class>.__module__) |
| |
| where <class> is the class of the generated recognizer, e.g. |
| the superclass of self. |
| """ |
| |
| return self._getRuleInvocationStack(self.__module__) |
| |
| |
| def _getRuleInvocationStack(cls, module): |
| """ |
        A more general version of getRuleInvocationStack where you can
        pass in, for example, a RecognitionException to get its rule
        stack trace. This routine is shared with all recognizers; hence
        it is a classmethod.
| |
| TODO: move to a utility class or something; weird having lexer call |
| this |
| """ |
| |
| # mmmhhh,... perhaps look at the first argument |
| # (f_locals[co_varnames[0]]?) and test if it's a (sub)class of |
| # requested recognizer... |
| |
| rules = [] |
| for frame in reversed(inspect.stack()): |
| code = frame[0].f_code |
| codeMod = inspect.getmodule(code) |
| if codeMod is None: |
| continue |
| |
| # skip frames not in requested module |
| if codeMod.__name__ != module: |
| continue |
| |
| # skip some unwanted names |
| if code.co_name in ('nextToken', '<module>'): |
| continue |
| |
| rules.append(code.co_name) |
| |
| return rules |
| |
| _getRuleInvocationStack = classmethod(_getRuleInvocationStack) |
| |
| |
| def getBacktrackingLevel(self): |
| return self._state.backtracking |
| |
| def setBacktrackingLevel(self, n): |
| self._state.backtracking = n |
| |
| |
| def getGrammarFileName(self): |
| """For debugging and other purposes, might want the grammar name. |
| |
| Have ANTLR generate an implementation for this method. |
| """ |
| |
| return self.grammarFileName |
| |
| |
| def getSourceName(self): |
| raise NotImplementedError |
| |
| |
| def toStrings(self, tokens): |
| """A convenience method for use most often with template rewrites. |
| |
        Convert a list of tokens to a list of their text strings.
| """ |
| |
| if tokens is None: |
| return None |
| |
| return [token.text for token in tokens] |
| |
| |
| def getRuleMemoization(self, ruleIndex, ruleStartIndex): |
| """ |
| Given a rule number and a start token index number, return |
| MEMO_RULE_UNKNOWN if the rule has not parsed input starting from |
| start index. If this rule has parsed input starting from the |
| start index before, then return where the rule stopped parsing. |
| It returns the index of the last token matched by the rule. |
| """ |
| |
| if ruleIndex not in self._state.ruleMemo: |
| self._state.ruleMemo[ruleIndex] = {} |
| |
| return self._state.ruleMemo[ruleIndex].get( |
| ruleStartIndex, self.MEMO_RULE_UNKNOWN |
| ) |
| |
| |
| def alreadyParsedRule(self, input, ruleIndex): |
| """ |
| Has this rule already parsed input at the current index in the |
| input stream? Return the stop token index or MEMO_RULE_UNKNOWN. |
| If we attempted but failed to parse properly before, return |
| MEMO_RULE_FAILED. |
| |
| This method has a side-effect: if we have seen this input for |
| this rule and successfully parsed before, then seek ahead to |
| 1 past the stop token matched for this rule last time. |
| """ |
| |
| stopIndex = self.getRuleMemoization(ruleIndex, input.index()) |
| if stopIndex == self.MEMO_RULE_UNKNOWN: |
| return False |
| |
| if stopIndex == self.MEMO_RULE_FAILED: |
| raise BacktrackingFailed |
| |
| else: |
| input.seek(stopIndex + 1) |
| |
| return True |
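
    # Roughly how generated rule methods use these memoization hooks when
    # backtracking (a simplified sketch; RULE_INDEX and the surrounding
    # structure are illustrative, not verbatim generated output):
    #
    #   def my_rule(self):
    #       ruleStart = self.input.index()
    #       if self._state.backtracking > 0 \
    #               and self.alreadyParsedRule(self.input, RULE_INDEX):
    #           return
    #       success = True
    #       try:
    #           try:
    #               pass  # ...match the rule body here...
    #           except BacktrackingFailed:
    #               success = False
    #               raise
    #       finally:
    #           if self._state.backtracking > 0:
    #               self.memoize(self.input, RULE_INDEX, ruleStart, success)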
| |
| |
| def memoize(self, input, ruleIndex, ruleStartIndex, success): |
| """ |
| Record whether or not this rule parsed the input at this position |
| successfully. |
| """ |
| |
| if success: |
| stopTokenIndex = input.index() - 1 |
| else: |
| stopTokenIndex = self.MEMO_RULE_FAILED |
| |
| if ruleIndex in self._state.ruleMemo: |
| self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex |
| |
| |
| def traceIn(self, ruleName, ruleIndex, inputSymbol): |
| sys.stdout.write("enter %s %s" % (ruleName, inputSymbol)) |
| |
| if self._state.backtracking > 0: |
| sys.stdout.write(" backtracking=%s" % self._state.backtracking) |
| |
| sys.stdout.write('\n') |
| |
| |
| def traceOut(self, ruleName, ruleIndex, inputSymbol): |
| sys.stdout.write("exit %s %s" % (ruleName, inputSymbol)) |
| |
| if self._state.backtracking > 0: |
| sys.stdout.write(" backtracking=%s" % self._state.backtracking) |
| |
| # mmmm... we use BacktrackingFailed exceptions now. So how could we |
| # get that information here? |
| #if self._state.failed: |
| # sys.stdout.write(" failed") |
| #else: |
| # sys.stdout.write(" succeeded") |
| |
| sys.stdout.write('\n') |
| |
| |
| class TokenSource(object): |
| """ |
| @brief Abstract baseclass for token producers. |
| |
| A source of tokens must provide a sequence of tokens via nextToken() |
    and also must reveal its source of characters; CommonToken's text is
    computed from a CharStream; it only stores indices into the char stream.
| |
| Errors from the lexer are never passed to the parser. Either you want |
| to keep going or you do not upon token recognition error. If you do not |
| want to continue lexing then you do not want to continue parsing. Just |
    throw an exception not derived from RecognitionException and it will
    naturally propagate all the way out of the recognizers. If you want to
    continue lexing then you should not throw an exception to the parser--it
    has already requested a token. Keep lexing until you get a valid one.
    Just report errors and keep going, looking for a valid token.
| """ |
| |
| def nextToken(self): |
| """Return a Token object from your input stream (usually a CharStream). |
| |
| Do not fail/return upon lexing error; keep chewing on the characters |
| until you get a good one; errors are not passed through to the parser. |
| """ |
| |
| raise NotImplementedError |
| |
| |
    def __iter__(self):
        """The TokenSource is an iterator.
| |
| The iteration will not include the final EOF token, see also the note |
| for the next() method. |
| |
| """ |
| |
| return self |
| |
| |
| def next(self): |
| """Return next token or raise StopIteration. |
| |
| Note that this will raise StopIteration when hitting the EOF token, |
| so EOF will not be part of the iteration. |
| |
| """ |
| |
| token = self.nextToken() |
| if token is None or token.type == EOF: |
| raise StopIteration |
| return token |
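
    # Because of __iter__/next above, any TokenSource (such as a generated
    # lexer) can be iterated directly; a sketch with a hypothetical MyLexer:
    #
    #   lexer = MyLexer(ANTLRStringStream("a b c"))
    #   for token in lexer:       # stops before the EOF token
    #       print token.type, token.text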
| |
| |
| class Lexer(BaseRecognizer, TokenSource): |
| """ |
| @brief Baseclass for generated lexer classes. |
| |
    A lexer is a recognizer that draws input symbols from a character stream.
    Lexer grammars result in a subclass of this object. A Lexer object
| uses simplified match() and error recovery mechanisms in the interest |
| of speed. |
| """ |
| |
| def __init__(self, input, state=None): |
| BaseRecognizer.__init__(self, state) |
| TokenSource.__init__(self) |
| |
| # Where is the lexer drawing characters from? |
| self.input = input |
| |
| |
| def reset(self): |
| BaseRecognizer.reset(self) # reset all recognizer state variables |
| |
| if self.input is not None: |
| # rewind the input |
| self.input.seek(0) |
| |
| if self._state is None: |
| # no shared state work to do |
| return |
| |
| # wack Lexer state variables |
| self._state.token = None |
| self._state.type = INVALID_TOKEN_TYPE |
| self._state.channel = DEFAULT_CHANNEL |
| self._state.tokenStartCharIndex = -1 |
| self._state.tokenStartLine = -1 |
| self._state.tokenStartCharPositionInLine = -1 |
| self._state.text = None |
| |
| |
| def makeEOFToken(self): |
| eof = CommonToken( |
| type=EOF, channel=DEFAULT_CHANNEL, |
| input=self.input, |
| start=self.input.index(), stop=self.input.index()) |
| eof.line = self.input.line |
| eof.charPositionInLine = self.input.charPositionInLine |
| return eof |
| |
| def nextToken(self): |
| """ |
| Return a token from this source; i.e., match a token on the char |
| stream. |
| """ |
| |
| while 1: |
| self._state.token = None |
| self._state.channel = DEFAULT_CHANNEL |
| self._state.tokenStartCharIndex = self.input.index() |
| self._state.tokenStartCharPositionInLine = self.input.charPositionInLine |
| self._state.tokenStartLine = self.input.line |
| self._state.text = None |
| if self.input.LA(1) == EOF: |
| return self.makeEOFToken() |
| |
| try: |
| self.mTokens() |
| |
| if self._state.token is None: |
| self.emit() |
| |
| elif self._state.token == SKIP_TOKEN: |
| continue |
| |
| return self._state.token |
| |
| except NoViableAltException, re: |
| self.reportError(re) |
| self.recover(re) # throw out current char and try again |
| |
| except RecognitionException, re: |
| self.reportError(re) |
| # match() routine has already called recover() |
| |
| |
| def skip(self): |
| """ |
| Instruct the lexer to skip creating a token for current lexer rule |
| and look for another token. nextToken() knows to keep looking when |
| a lexer rule finishes with token set to SKIP_TOKEN. Recall that |
        if token is None at the end of any token rule, it creates one for
        you and emits it.
| """ |
| |
| self._state.token = SKIP_TOKEN |
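
    # skip() is normally called from a lexer rule action in the grammar;
    # e.g. a typical whitespace rule (illustrative grammar snippet, with a
    # Python-target action):
    #
    #   WS : (' ' | '\t' | '\r' | '\n')+ { self.skip() } ;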
| |
| |
| def mTokens(self): |
| """This is the lexer entry point that sets instance var 'token'""" |
| |
| # abstract method |
| raise NotImplementedError |
| |
| |
| def setCharStream(self, input): |
| """Set the char stream and reset the lexer""" |
| self.input = None |
| self.reset() |
| self.input = input |
| |
| |
| def getSourceName(self): |
| return self.input.getSourceName() |
| |
| |
| def emit(self, token=None): |
| """ |
| The standard method called to automatically emit a token at the |
| outermost lexical rule. The token object should point into the |
| char buffer start..stop. If there is a text override in 'text', |
| use that to set the token's text. Override this method to emit |
| custom Token objects. |
| |
| If you are building trees, then you should also override |
| Parser or TreeParser.getMissingSymbol(). |
| """ |
| |
| if token is None: |
| token = CommonToken( |
| input=self.input, |
| type=self._state.type, |
| channel=self._state.channel, |
| start=self._state.tokenStartCharIndex, |
| stop=self.getCharIndex()-1 |
| ) |
| token.line = self._state.tokenStartLine |
| token.text = self._state.text |
| token.charPositionInLine = self._state.tokenStartCharPositionInLine |
| |
| self._state.token = token |
| |
| return token |
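
    # A sketch of overriding emit() to decorate every token; Lexer.emit
    # still builds the CommonToken, while MyLexer and the 'source_file'
    # attribute are hypothetical names added for illustration:
    #
    #   class AnnotatingLexer(MyLexer):
    #       def emit(self, token=None):
    #           token = Lexer.emit(self, token)
    #           token.source_file = self.getSourceName()
    #           return token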
| |
| |
| def match(self, s): |
| if isinstance(s, basestring): |
| for c in s: |
| if self.input.LA(1) != ord(c): |
| if self._state.backtracking > 0: |
| raise BacktrackingFailed |
| |
| mte = MismatchedTokenException(c, self.input) |
| self.recover(mte) |
| raise mte |
| |
| self.input.consume() |
| |
| else: |
| if self.input.LA(1) != s: |
| if self._state.backtracking > 0: |
| raise BacktrackingFailed |
| |
| mte = MismatchedTokenException(unichr(s), self.input) |
| self.recover(mte) # don't really recover; just consume in lexer |
| raise mte |
| |
| self.input.consume() |
| |
| |
| def matchAny(self): |
| self.input.consume() |
| |
| |
| def matchRange(self, a, b): |
| if self.input.LA(1) < a or self.input.LA(1) > b: |
| if self._state.backtracking > 0: |
| raise BacktrackingFailed |
| |
| mre = MismatchedRangeException(unichr(a), unichr(b), self.input) |
| self.recover(mre) |
| raise mre |
| |
| self.input.consume() |
| |
| |
| def getLine(self): |
| return self.input.line |
| |
| |
| def getCharPositionInLine(self): |
| return self.input.charPositionInLine |
| |
| |
| def getCharIndex(self): |
| """What is the index of the current character of lookahead?""" |
| |
| return self.input.index() |
| |
| |
| def getText(self): |
| """ |
| Return the text matched so far for the current token or any |
| text override. |
| """ |
| if self._state.text is not None: |
| return self._state.text |
| |
| return self.input.substring( |
| self._state.tokenStartCharIndex, |
| self.getCharIndex()-1 |
| ) |
| |
| |
| def setText(self, text): |
| """ |
| Set the complete text of this token; it wipes any previous |
| changes to the text. |
| """ |
| self._state.text = text |
| |
| |
| text = property(getText, setText) |
| |
| |
| def reportError(self, e): |
| ## TODO: not thought about recovery in lexer yet. |
| |
| ## # if we've already reported an error and have not matched a token |
| ## # yet successfully, don't report any errors. |
| ## if self.errorRecovery: |
| ## #System.err.print("[SPURIOUS] "); |
| ## return; |
| ## |
| ## self.errorRecovery = True |
| |
| self.displayRecognitionError(self.tokenNames, e) |
| |
| |
| def getErrorMessage(self, e, tokenNames): |
| msg = None |
| |
| if isinstance(e, MismatchedTokenException): |
| msg = "mismatched character " \ |
| + self.getCharErrorDisplay(e.c) \ |
| + " expecting " \ |
| + self.getCharErrorDisplay(e.expecting) |
| |
| elif isinstance(e, NoViableAltException): |
| msg = "no viable alternative at character " \ |
| + self.getCharErrorDisplay(e.c) |
| |
| elif isinstance(e, EarlyExitException): |
| msg = "required (...)+ loop did not match anything at character " \ |
| + self.getCharErrorDisplay(e.c) |
| |
| elif isinstance(e, MismatchedNotSetException): |
| msg = "mismatched character " \ |
| + self.getCharErrorDisplay(e.c) \ |
| + " expecting set " \ |
| + repr(e.expecting) |
| |
| elif isinstance(e, MismatchedSetException): |
| msg = "mismatched character " \ |
| + self.getCharErrorDisplay(e.c) \ |
| + " expecting set " \ |
| + repr(e.expecting) |
| |
| elif isinstance(e, MismatchedRangeException): |
| msg = "mismatched character " \ |
| + self.getCharErrorDisplay(e.c) \ |
| + " expecting set " \ |
| + self.getCharErrorDisplay(e.a) \ |
| + ".." \ |
| + self.getCharErrorDisplay(e.b) |
| |
| else: |
| msg = BaseRecognizer.getErrorMessage(self, e, tokenNames) |
| |
| return msg |
| |
| |
| def getCharErrorDisplay(self, c): |
| if c == EOF: |
| c = '<EOF>' |
| return repr(c) |
| |
| |
| def recover(self, re): |
| """ |
        Lexers can normally match any char in their vocabulary after matching
| a token, so do the easy thing and just kill a character and hope |
| it all works out. You can instead use the rule invocation stack |
| to do sophisticated error recovery if you are in a fragment rule. |
| """ |
| |
| self.input.consume() |
| |
| |
| def traceIn(self, ruleName, ruleIndex): |
| inputSymbol = "%s line=%d:%s" % (self.input.LT(1), |
| self.getLine(), |
| self.getCharPositionInLine() |
| ) |
| |
| BaseRecognizer.traceIn(self, ruleName, ruleIndex, inputSymbol) |
| |
| |
| def traceOut(self, ruleName, ruleIndex): |
| inputSymbol = "%s line=%d:%s" % (self.input.LT(1), |
| self.getLine(), |
| self.getCharPositionInLine() |
| ) |
| |
| BaseRecognizer.traceOut(self, ruleName, ruleIndex, inputSymbol) |
| |
| |
| |
| class Parser(BaseRecognizer): |
| """ |
| @brief Baseclass for generated parser classes. |
| """ |
| |
| def __init__(self, lexer, state=None): |
| BaseRecognizer.__init__(self, state) |
| |
| self.input = lexer |
| |
| |
| def reset(self): |
| BaseRecognizer.reset(self) # reset all recognizer state variables |
| if self.input is not None: |
| self.input.seek(0) # rewind the input |
| |
| |
| def getCurrentInputSymbol(self, input): |
| return input.LT(1) |
| |
| |
| def getMissingSymbol(self, input, e, expectedTokenType, follow): |
| if expectedTokenType == EOF: |
| tokenText = "<missing EOF>" |
| else: |
| tokenText = "<missing " + self.tokenNames[expectedTokenType] + ">" |
| t = CommonToken(type=expectedTokenType, text=tokenText) |
| current = input.LT(1) |
| if current.type == EOF: |
| current = input.LT(-1) |
| |
| if current is not None: |
| t.line = current.line |
| t.charPositionInLine = current.charPositionInLine |
| t.channel = DEFAULT_CHANNEL |
| return t |
| |
| |
| def setTokenStream(self, input): |
| """Set the token stream and reset the parser""" |
| |
| self.input = None |
| self.reset() |
| self.input = input |
| |
| |
| def getTokenStream(self): |
| return self.input |
| |
| |
| def getSourceName(self): |
| return self.input.getSourceName() |
| |
| |
| def traceIn(self, ruleName, ruleIndex): |
| BaseRecognizer.traceIn(self, ruleName, ruleIndex, self.input.LT(1)) |
| |
| |
| def traceOut(self, ruleName, ruleIndex): |
| BaseRecognizer.traceOut(self, ruleName, ruleIndex, self.input.LT(1)) |
| |
| |
| class RuleReturnScope(object): |
| """ |
| Rules can return start/stop info as well as possible trees and templates. |
| """ |
| |
| def getStart(self): |
| """Return the start token or tree.""" |
| return None |
| |
| |
| def getStop(self): |
| """Return the stop token or tree.""" |
| return None |
| |
| |
| def getTree(self): |
| """Has a value potentially if output=AST.""" |
| return None |
| |
| |
| def getTemplate(self): |
| """Has a value potentially if output=template.""" |
| return None |
| |
| |
| class ParserRuleReturnScope(RuleReturnScope): |
| """ |
| Rules that return more than a single value must return an object |
| containing all the values. Besides the properties defined in |
| RuleLabelScope.predefinedRulePropertiesScope there may be user-defined |
| return values. This class simply defines the minimum properties that |
| are always defined and methods to access the others that might be |
| available depending on output option such as template and tree. |
| |
    Note that text is not an actual property of the return value; it is
    computed from start and stop using the input stream's toString() method. I
| could add a ctor to this so that we can pass in and store the input |
| stream, but I'm not sure we want to do that. It would seem to be undefined |
| to get the .text property anyway if the rule matches tokens from multiple |
| input streams. |
| |
| I do not use getters for fields of objects that are used simply to |
| group values such as this aggregate. The getters/setters are there to |
| satisfy the superclass interface. |
| """ |
| |
| def __init__(self): |
| self.start = None |
| self.stop = None |
| self.tree = None # only used when output=AST |
| |
| |
| def getStart(self): |
| return self.start |
| |
| |
| def getStop(self): |
| return self.stop |
| |
| |
| def getTree(self): |
| return self.tree |
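
# Typical driver code receives a ParserRuleReturnScope (or subclass) from
# each rule method; a hedged sketch with hypothetical grammar names:
#
#   lexer = MyLexer(ANTLRStringStream(source_text))
#   parser = MyParser(CommonTokenStream(lexer))
#   ret = parser.prog()            # 'prog' is a hypothetical start rule
#   print ret.start, ret.stop      # first and last matched tokens
#   print ret.tree                 # AST root when output=AST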