blob: 98f91849a1b33892ff58aa0ebbe68bd6183c9d87 [file] [log] [blame]
#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Regular expression based JavaScript parsing classes."""
# Module authorship metadata: one "email (name)" entry per author.
__author__ = (
    'robbyw@google.com (Robert Walker)',
    'ajp@google.com (Andy Perelson)',
)
import copy
import re
from closure_linter import javascripttokens
from closure_linter.common import matcher
from closure_linter.common import tokenizer
# Short module-level aliases for the matcher class and the token type
# enumeration, both of which are referenced many times in the tables below.
Matcher = matcher.Matcher
Type = javascripttokens.JavaScriptTokenType
class JavaScriptModes(object):
  """Names of the matcher modes used while tokenizing JavaScript.

  Each constant is a plain string that identifies one mode; the tokenizer's
  matcher table is keyed by these values.
  """

  # Ordinary source text.
  TEXT_MODE = 'text'

  # Function declarations: after the 'function' keyword, and inside the
  # parenthesized parameter list that follows it.
  FUNCTION_MODE = 'function'
  PARAMETER_MODE = 'parameter'

  # Bodies of string literals.
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'

  # Bodies of comments. The "lex spaces" variant of doc comment mode also
  # tokenizes whitespace separately.
  LINE_COMMENT_MODE = 'line_comment'
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Converts JavaScript code into an array of tokens.

  The class attributes below define the regular expressions used for lexing
  and JAVASCRIPT_MATCHERS, a table mapping each JavaScriptModes value to the
  ordered list of Matcher objects tried while in that mode.
  """

  # Useful patterns for JavaScript parsing.
  IDENTIFIER_CHAR = r'A-Za-z0-9_$.'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  # MANTISSA is a re.VERBOSE fragment: its whitespace and '#' comments are
  # only inert when the final pattern is compiled with re.VERBOSE.
  MANTISSA = r"""
             (\d+(?!\.)) | # Matches '10'
             (\d+\.(?!\d)) | # Matches '10.'
             (\d*\.\d+) # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end. The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. Expression [^*{}\s]@ must come first, or the other options will
  # match everything before @, and we won't match @'s that aren't part of flags
  # like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  # re.VERBOSE fragment matching a character class inside a regex literal.
  REGEX_CHARACTER_CLASS = r"""
                          \[ # Opening bracket
                          ([^\]\\]|\\.)* # Anything but a ] or \,
                          # or a backslash followed by anything
                          \] # Closing bracket
                          """
  # We ensure the regex is followed by one of the tokens in POST_REGEX_LIST to
  # avoid incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     / # opening slash
                     (?!\*) # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))* # a backslash followed by anything,
                     # or anything but a / or [ or \,
                     # or a character class
                     / # closing slash
                     [gimsx]* # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = ['break', 'case', 'catch', 'continue', 'default', 'do', 'else',
                  'finally', 'for', 'if', 'return', 'switch', 'throw', 'try',
                  'var', 'while', 'with']
  # Match a keyword string followed by a non-identifier character in order to
  # not match something like doSomething as do + Something.
  KEYWORD = re.compile('(%s)((?=[^%s])|$)' % (
      '|'.join(KEYWORD_LIST), IDENTIFIER_CHAR))

  # List of regular expressions to match as operators. Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here. r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  #
  # Every entry containing a backslash is a raw string so that the regex
  # escapes are not interpreted as (invalid) Python string escapes.
  #
  # NOTE(review): a bare '^' and '~' are not listed, so those characters are
  # not tokenized as operators here - confirm whether that is intentional.
  OPERATOR_LIST = [',', r'\+\+', '===', '!==', '>>>=', '>>>', '==', '>=', '<=',
                   '!=', '<<=', '>>=', '<<', '>>', '>', '<', r'\+=', r'\+',
                   '--', r'\^=', '-=', '-', '/=', '/', r'\*=', r'\*', '%=', '%',
                   '&&', r'\|\|', '&=', '&', r'\|=', r'\|', '=', '!', ':',
                   r'\?', r'\bdelete\b', r'\bin\b', r'\binstanceof\b',
                   r'\bnew\b', r'\btypeof\b', r'\bvoid\b']
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier.
  NESTED_IDENTIFIER = r'[a-zA-Z_$][%s.]*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s) # a valid identifier
                             (?=\s* # optional whitespace
                             \= # look ahead to equal sign
                             (?!=)) # not follwed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by one or more ASCII letters that appears
  # at the beginning of the line or after whitespace. The look-behind check is
  # necessary to not match someone@google.com as a flag. Flags that directly
  # follow a '{' are matched separately by DOC_INLINE_FLAG.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names, we need to tokenize whitespace into a
  # token.
  DOC_FLAG_LEX_SPACES = re.compile(r'(^|(?<=\s))@(?P<name>%s)\b' %
                                   '|'.join(['param']))

  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  # Star followed by non-slash, i.e a star that does not end a comment.
  # NOTE(review): the original comment says this is used for a TYPE_GROUP
  # pattern, but no such pattern is defined in this class and SAFE_STAR is not
  # referenced by it - confirm whether other modules still use it.
  SAFE_STAR = r'(\*(?!/))'

  # Matchers shared by both doc comment modes.
  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),
      # NOTE(review): the trailing positional arguments (None, True) presumably
      # mean "no mode change" and "only at the start of a line" - confirm
      # against matcher.Matcher.
      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # The token matcher groups work as follows: it is a list of Matcher objects.
  # The matchers will be tried in this order, and the first to match will be
  # returned. Hence the order is important because the matchers that come first
  # overrule the matchers that come later.
  JAVASCRIPT_MATCHERS = {
      # Matchers for basic text mode.
      JavaScriptModes.TEXT_MODE: [
          # Check a big group - strings, starting comments, and regexes - all
          # of which could be intertwined. 'string with /regex/',
          # /regex with 'string'/, /* comment with /regex/ and string */ (and so
          # on)
          Matcher(START_DOC_COMMENT, Type.START_DOC_COMMENT,
                  JavaScriptModes.DOC_COMMENT_MODE),
          Matcher(START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                  JavaScriptModes.BLOCK_COMMENT_MODE),
          # A '//' at the very end of a line is listed before the general
          # case and requests no mode switch.
          Matcher(END_OF_LINE_SINGLE_LINE_COMMENT,
                  Type.START_SINGLE_LINE_COMMENT),
          Matcher(START_SINGLE_LINE_COMMENT, Type.START_SINGLE_LINE_COMMENT,
                  JavaScriptModes.LINE_COMMENT_MODE),
          Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                  JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
          Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                  JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
          Matcher(REGEX, Type.REGEX),

          # Next we check for start blocks appearing outside any of the items
          # above.
          Matcher(START_BLOCK, Type.START_BLOCK),
          Matcher(END_BLOCK, Type.END_BLOCK),

          # Then we search for function declarations.
          Matcher(FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                  JavaScriptModes.FUNCTION_MODE),

          # Next, we convert non-function related parens to tokens.
          Matcher(OPENING_PAREN, Type.START_PAREN),
          Matcher(CLOSING_PAREN, Type.END_PAREN),

          # Next, we convert brackets to tokens.
          Matcher(OPENING_BRACKET, Type.START_BRACKET),
          Matcher(CLOSING_BRACKET, Type.END_BRACKET),

          # Find numbers.  This has to happen before operators because
          # scientific notation numbers can have + and - in them.
          Matcher(NUMBER, Type.NUMBER),

          # Find operators and simple assignments
          Matcher(SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
          Matcher(OPERATOR, Type.OPERATOR),

          # Find key words and whitespace.
          Matcher(KEYWORD, Type.KEYWORD),
          Matcher(WHITESPACE, Type.WHITESPACE),

          # Find identifiers.
          Matcher(IDENTIFIER, Type.IDENTIFIER),

          # Finally, we convert semicolons to tokens.
          Matcher(SEMICOLON, Type.SEMICOLON)],

      # Matchers for single quote strings.
      JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
          Matcher(SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
          Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                  JavaScriptModes.TEXT_MODE)],

      # Matchers for double quote strings.
      JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
          Matcher(DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
          Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                  JavaScriptModes.TEXT_MODE)],

      # Matchers for block comments.
      JavaScriptModes.BLOCK_COMMENT_MODE: [
          # First we check for exiting a block comment.
          Matcher(END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                  JavaScriptModes.TEXT_MODE),

          # Match non-comment-ending text..
          Matcher(BLOCK_COMMENT_TEXT, Type.COMMENT)],

      # Matchers for doc comments.
      JavaScriptModes.DOC_COMMENT_MODE: COMMON_DOC_MATCHERS + [
          Matcher(DOC_COMMENT_TEXT, Type.COMMENT)],

      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: COMMON_DOC_MATCHERS + [
          Matcher(WHITESPACE, Type.COMMENT),
          Matcher(DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

      # Matchers for single line comments.
      JavaScriptModes.LINE_COMMENT_MODE: [
          # We greedy match until the end of the line in line comment mode.
          Matcher(ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],

      # Matchers for code after the function keyword.
      JavaScriptModes.FUNCTION_MODE: [
          # Must match open paren before anything else and move into parameter
          # mode, otherwise everything inside the parameter list is parsed
          # incorrectly.
          Matcher(OPENING_PAREN, Type.START_PARAMETERS,
                  JavaScriptModes.PARAMETER_MODE),
          Matcher(WHITESPACE, Type.WHITESPACE),
          Matcher(IDENTIFIER, Type.FUNCTION_NAME)],

      # Matchers for function parameters
      JavaScriptModes.PARAMETER_MODE: [
          # When in function parameter mode, a closing paren is treated
          # specially. Everything else is treated as lines of parameters.
          Matcher(CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                  JavaScriptModes.TEXT_MODE),
          Matcher(PARAMETERS, Type.PARAMETERS,
                  JavaScriptModes.PARAMETER_MODE)]}

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
    """
    matchers = self.JAVASCRIPT_MATCHERS
    if not parse_js_doc:
      # Copy only the top-level table so the shared class attribute is not
      # modified. A shallow copy is sufficient because a single key is rebound
      # and no Matcher is changed here; it also avoids deep-copying compiled
      # patterns, which raises TypeError before Python 3.7.
      matchers = dict(matchers)
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
          function declaration may have a value called 'name' which captures
          the name of the function.

    Returns:
      A javascripttokens.JavaScriptToken built from the given arguments.
    """
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values)