| #!/usr/bin/env python |
| # |
| # Copyright 2007 The Closure Linter Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS-IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """Regular expression based JavaScript parsing classes.""" |
| |
# Module authorship metadata: one 'email (Full Name)' entry per author.
__author__ = (
    'robbyw@google.com (Robert Walker)',
    'ajp@google.com (Andy Perelson)',
)
| |
| import copy |
| import re |
| |
| from closure_linter import javascripttokens |
| from closure_linter.common import matcher |
| from closure_linter.common import tokenizer |
| |
# Short aliases for the matcher class and the JavaScript token type
# enumeration, so the matcher tables in this module stay readable.
Matcher = matcher.Matcher
Type = javascripttokens.JavaScriptTokenType
| |
| |
class JavaScriptModes(object):
  """Names of the matcher modes the JavaScript tokenizer can be in.

  Each constant is the string key under which the matchers for that mode are
  registered.
  """
  # Plain code outside of strings and comments.
  TEXT_MODE = 'text'

  # Inside a string literal.
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'

  # Inside a comment.
  LINE_COMMENT_MODE = 'line_comment'
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'

  # After the function keyword and inside its parameter list.
  FUNCTION_MODE = 'function'
  PARAMETER_MODE = 'parameter'
| |
| |
class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Converts JavaScript code into an array of tokens.

  The class-level attributes are the regular expressions and the per-mode
  matcher tables that drive the tokenizer. Within one mode the matchers are
  tried in list order and the first one to match wins.
  """

  # Useful patterns for JavaScript parsing.
  IDENTIFIER_CHAR = r'A-Za-z0-9_$.'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  # MANTISSA is only usable inside a pattern compiled with re.VERBOSE, since it
  # contains layout whitespace and '#' comments.
  MANTISSA = r"""
             (\d+(?!\.)) |                # Matches '10'
             (\d+\.(?!\d)) |              # Matches '10.'
             (\d*\.\d+)                   # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  # A '//' with nothing after it on the line.
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. Expression [^*{\s]@ must come first, or the other options will
  # match everything before @, and we won't match @'s that aren't part of flags
  # like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  REGEX_CHARACTER_CLASS = r"""
                          \[               # Opening bracket
                          ([^\]\\]|\\.)*   # Anything but a ] or \,
                                           # or a backslash followed by anything
                          \]               # Closing bracket
                          """
  # We ensure the regex is followed by one of the tokens listed below to avoid
  # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = ['break', 'case', 'catch', 'continue', 'default', 'do', 'else',
                  'finally', 'for', 'if', 'return', 'switch', 'throw', 'try',
                  'var', 'while', 'with']
  # Match a keyword string followed by a non-identifier character in order to
  # not match something like doSomething as do + Something.
  KEYWORD = re.compile('(%s)((?=[^%s])|$)' % (
      '|'.join(KEYWORD_LIST), IDENTIFIER_CHAR))

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  # Every entry containing a backslash is a raw string so that the backslash
  # reaches the regex engine without relying on Python's handling of
  # unrecognized string escapes.
  OPERATOR_LIST = [
      ',', r'\+\+', '===', '!==', '>>>=', '>>>', '==', '>=', '<=', '!=',
      '<<=', '>>=', '<<', '>>', '>', '<', r'\+=', r'\+', '--', r'\^=', '-=',
      '-', '/=', '/', r'\*=', r'\*', '%=', '%', '&&', r'\|\|', '&=', '&',
      r'\|=', r'\|', '=', '!', ':', r'\?',
      r'\bdelete\b', r'\bin\b', r'\binstanceof\b', r'\bnew\b',
      r'\btypeof\b', r'\bvoid\b']
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier.
  NESTED_IDENTIFIER = r'[a-zA-Z_$][%s.]*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  # An identifier that is directly assigned to: it is followed by a single '='
  # that is not part of '=='.
  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)      # a valid identifier
                             (?=\s*                  # optional whitespace
                             \=                      # look ahead to equal sign
                             (?!=))                  # not follwed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by letters that appears at the beginning of
  # the line or after whitespace.  The look-behind check is necessary to not
  # match someone@google.com as a flag.  Flags that directly follow a '{' are
  # handled separately by DOC_INLINE_FLAG below.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names, we need to tokenize whitespace into a
  # token.
  DOC_FLAG_LEX_SPACES = re.compile(r'(^|(?<=\s))@(?P<name>%s)\b' %
                                   '|'.join(['param']))

  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  # Star followed by non-slash, i.e a star that does not end a comment.
  # This is used for TYPE_GROUP below.
  SAFE_STAR = r'(\*(?!/))'

  # Matchers shared by both doc comment modes.
  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),
      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # The token matcher groups work as follows: each is a list of Matcher objects.
  # The matchers will be tried in this order, and the first to match will be
  # returned.  Hence the order is important because the matchers that come first
  # overrule the matchers that come later.
  JAVASCRIPT_MATCHERS = {
      # Matchers for basic text mode.
      JavaScriptModes.TEXT_MODE: [
          # Check a big group - strings, starting comments, and regexes - all
          # of which could be intertwined.  'string with /regex/',
          # /regex with 'string'/, /* comment with /regex/ and string */ (and so
          # on)
          Matcher(START_DOC_COMMENT, Type.START_DOC_COMMENT,
                  JavaScriptModes.DOC_COMMENT_MODE),
          Matcher(START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                  JavaScriptModes.BLOCK_COMMENT_MODE),
          Matcher(END_OF_LINE_SINGLE_LINE_COMMENT,
                  Type.START_SINGLE_LINE_COMMENT),
          Matcher(START_SINGLE_LINE_COMMENT, Type.START_SINGLE_LINE_COMMENT,
                  JavaScriptModes.LINE_COMMENT_MODE),
          Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                  JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
          Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                  JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
          Matcher(REGEX, Type.REGEX),

          # Next we check for start blocks appearing outside any of the items
          # above.
          Matcher(START_BLOCK, Type.START_BLOCK),
          Matcher(END_BLOCK, Type.END_BLOCK),

          # Then we search for function declarations.
          Matcher(FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                  JavaScriptModes.FUNCTION_MODE),

          # Next, we convert non-function related parens to tokens.
          Matcher(OPENING_PAREN, Type.START_PAREN),
          Matcher(CLOSING_PAREN, Type.END_PAREN),

          # Next, we convert brackets to tokens.
          Matcher(OPENING_BRACKET, Type.START_BRACKET),
          Matcher(CLOSING_BRACKET, Type.END_BRACKET),

          # Find numbers.  This has to happen before operators because
          # scientific notation numbers can have + and - in them.
          Matcher(NUMBER, Type.NUMBER),

          # Find operators and simple assignments
          Matcher(SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
          Matcher(OPERATOR, Type.OPERATOR),

          # Find key words and whitespace.
          Matcher(KEYWORD, Type.KEYWORD),
          Matcher(WHITESPACE, Type.WHITESPACE),

          # Find identifiers.
          Matcher(IDENTIFIER, Type.IDENTIFIER),

          # Finally, we convert semicolons to tokens.
          Matcher(SEMICOLON, Type.SEMICOLON)],

      # Matchers for single quote strings.
      JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
          Matcher(SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
          Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                  JavaScriptModes.TEXT_MODE)],

      # Matchers for double quote strings.
      JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
          Matcher(DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
          Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                  JavaScriptModes.TEXT_MODE)],

      # Matchers for block comments.
      JavaScriptModes.BLOCK_COMMENT_MODE: [
          # First we check for exiting a block comment.
          Matcher(END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                  JavaScriptModes.TEXT_MODE),

          # Match non-comment-ending text..
          Matcher(BLOCK_COMMENT_TEXT, Type.COMMENT)],

      # Matchers for doc comments.
      JavaScriptModes.DOC_COMMENT_MODE: COMMON_DOC_MATCHERS + [
          Matcher(DOC_COMMENT_TEXT, Type.COMMENT)],

      # Matchers for doc comments where whitespace is tokenized separately.
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: COMMON_DOC_MATCHERS + [
          Matcher(WHITESPACE, Type.COMMENT),
          Matcher(DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

      # Matchers for single line comments.
      JavaScriptModes.LINE_COMMENT_MODE: [
          # We greedy match until the end of the line in line comment mode.
          Matcher(ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],

      # Matchers for code after the function keyword.
      JavaScriptModes.FUNCTION_MODE: [
          # Must match open paren before anything else and move into parameter
          # mode, otherwise everything inside the parameter list is parsed
          # incorrectly.
          Matcher(OPENING_PAREN, Type.START_PARAMETERS,
                  JavaScriptModes.PARAMETER_MODE),
          Matcher(WHITESPACE, Type.WHITESPACE),
          Matcher(IDENTIFIER, Type.FUNCTION_NAME)],

      # Matchers for function parameters
      JavaScriptModes.PARAMETER_MODE: [
          # When in function parameter mode, a closing paren is treated
          # specially.  Everything else is treated as lines of parameters.
          Matcher(CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                  JavaScriptModes.TEXT_MODE),
          Matcher(PARAMETERS, Type.PARAMETERS,
                  JavaScriptModes.PARAMETER_MODE)]}

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
    """
    matchers = self.JAVASCRIPT_MATCHERS
    if not parse_js_doc:
      # Copy the mapping so the shared class-level table is not modified.  A
      # shallow copy is sufficient because only the top-level entry for the
      # doc comment mode is rebound; it also avoids deep-copying compiled
      # regular expressions, which older Python versions cannot do.
      matchers = copy.copy(matchers)
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
        function declaration may have a value called 'name' which captures the
        name of the function.

    Returns:
      A javascripttokens.JavaScriptToken built from the given arguments.
    """
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values)