| /* |
| * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. |
| * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| * |
| * This code is free software; you can redistribute it and/or modify it |
| * under the terms of the GNU General Public License version 2 only, as |
| * published by the Free Software Foundation. Oracle designates this |
| * particular file as subject to the "Classpath" exception as provided |
| * by Oracle in the LICENSE file that accompanied this code. |
| * |
| * This code is distributed in the hope that it will be useful, but WITHOUT |
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| * version 2 for more details (a copy is included in the LICENSE file that |
| * accompanied this code). |
| * |
| * You should have received a copy of the GNU General Public License version |
| * 2 along with this work; if not, write to the Free Software Foundation, |
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| * |
| * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| * or visit www.oracle.com if you need additional information or have any |
| * questions. |
| */ |
| |
| package jdk.nashorn.internal.parser; |
| |
| import static jdk.nashorn.internal.parser.TokenType.ADD; |
| import static jdk.nashorn.internal.parser.TokenType.COMMENT; |
| import static jdk.nashorn.internal.parser.TokenType.DECIMAL; |
| import static jdk.nashorn.internal.parser.TokenType.DIRECTIVE_COMMENT; |
| import static jdk.nashorn.internal.parser.TokenType.EOF; |
| import static jdk.nashorn.internal.parser.TokenType.EOL; |
| import static jdk.nashorn.internal.parser.TokenType.ERROR; |
| import static jdk.nashorn.internal.parser.TokenType.ESCSTRING; |
| import static jdk.nashorn.internal.parser.TokenType.EXECSTRING; |
| import static jdk.nashorn.internal.parser.TokenType.FLOATING; |
| import static jdk.nashorn.internal.parser.TokenType.HEXADECIMAL; |
| import static jdk.nashorn.internal.parser.TokenType.LBRACE; |
| import static jdk.nashorn.internal.parser.TokenType.LPAREN; |
| import static jdk.nashorn.internal.parser.TokenType.OCTAL; |
| import static jdk.nashorn.internal.parser.TokenType.RBRACE; |
| import static jdk.nashorn.internal.parser.TokenType.REGEX; |
| import static jdk.nashorn.internal.parser.TokenType.RPAREN; |
| import static jdk.nashorn.internal.parser.TokenType.STRING; |
| import static jdk.nashorn.internal.parser.TokenType.XML; |
| |
| import jdk.nashorn.internal.runtime.ECMAErrors; |
| import jdk.nashorn.internal.runtime.ErrorManager; |
| import jdk.nashorn.internal.runtime.JSErrorType; |
| import jdk.nashorn.internal.runtime.JSType; |
| import jdk.nashorn.internal.runtime.ParserException; |
| import jdk.nashorn.internal.runtime.Source; |
| import jdk.nashorn.internal.runtime.options.Options; |
| |
| /** |
| * Responsible for converting source content into a stream of tokens. |
| * |
| */ |
| @SuppressWarnings("fallthrough") |
| public class Lexer extends Scanner { |
/** {@code Integer.MIN_VALUE} widened to {@code long}; lower bound of the int-range check applied to parsed integer literals. */
private static final long MIN_INT_L = Integer.MIN_VALUE;
/** {@code Integer.MAX_VALUE} widened to {@code long}; upper bound of the int-range check applied to parsed integer literals. */
private static final long MAX_INT_L = Integer.MAX_VALUE;

/** Value of the boolean option {@code nashorn.lexer.xmlliterals}; XML literals are only scanned when it is true. */
private static final boolean XML_LITERALS = Options.getBooleanProperty("nashorn.lexer.xmlliterals");
| |
/** Content source; also used to extract substrings of the text being lexed. */
private final Source source;

/** Buffered stream that receives the token descriptors produced by this lexer. */
private final TokenStream stream;

/** True in scripting mode, i.e. when here strings, edit strings and back-quoted exec strings are supported. */
private final boolean scripting;

/** True if a nested scan. (scan to completion, no EOF.) Set only by the private copy constructor. */
private final boolean nested;

/** Line number carried by the pending (deferred) EOL token, or -1 when no EOL is pending. */
int pendingLine;

/** Position of last EOL + 1; used as the position of the pending EOL token. */
private int linePosition;

/** Type of last token added (including deferred EOL tokens). */
private TokenType last;
| |
/** Horizontal whitespace shared by the JSON and JavaScript whitespace sets. */
private static final String SPACETAB = " \t"; // ASCII space and tab
/** Line terminators shared by the JSON and JavaScript whitespace sets. */
private static final String LFCR = "\n\r"; // line feed and carriage return (ctrl-m)

/** Characters accepted by {@link #isJsonEOL(char)}. */
private static final String JSON_WHITESPACE_EOL = LFCR;
/** Characters accepted by {@link #isJsonWhitespace(char)}. */
private static final String JSON_WHITESPACE = SPACETAB + LFCR;

/** Characters accepted by {@link #isJSEOL(char)}. */
private static final String JAVASCRIPT_WHITESPACE_EOL =
    LFCR +
    "\u2028" + // line separator
    "\u2029" // paragraph separator
    ;
/** Characters accepted by {@link #isJSWhitespace(char)}; includes every end-of-line character. */
private static final String JAVASCRIPT_WHITESPACE =
    SPACETAB +
    JAVASCRIPT_WHITESPACE_EOL +
    "\u000b" + // tabulation line
    "\u000c" + // ff (ctrl-l)
    "\u00a0" + // Latin-1 space
    "\u1680" + // Ogham space mark
    "\u180e" + // separator, Mongolian vowel
    "\u2000" + // en quad
    "\u2001" + // em quad
    "\u2002" + // en space
    "\u2003" + // em space
    "\u2004" + // three-per-em space
    "\u2005" + // four-per-em space
    "\u2006" + // six-per-em space
    "\u2007" + // figure space
    "\u2008" + // punctuation space
    "\u2009" + // thin space
    "\u200a" + // hair space
    "\u202f" + // narrow no-break space
    "\u205f" + // medium mathematical space
    "\u3000" + // ideographic space
    "\ufeff" // byte order mark
    ;

/**
 * The same character set as {@link #JAVASCRIPT_WHITESPACE}, but spelled as
 * escaped text (a literal backslash, 'u' and four hex digits per character)
 * rather than as the characters themselves. Returned by
 * {@link #getWhitespaceRegExp()}.
 */
private static final String JAVASCRIPT_WHITESPACE_IN_REGEXP =
    "\\u000a" + // line feed
    "\\u000d" + // carriage return (ctrl-m)
    "\\u2028" + // line separator
    "\\u2029" + // paragraph separator
    "\\u0009" + // tab
    "\\u0020" + // ASCII space
    "\\u000b" + // tabulation line
    "\\u000c" + // ff (ctrl-l)
    "\\u00a0" + // Latin-1 space
    "\\u1680" + // Ogham space mark
    "\\u180e" + // separator, Mongolian vowel
    "\\u2000" + // en quad
    "\\u2001" + // em quad
    "\\u2002" + // en space
    "\\u2003" + // em space
    "\\u2004" + // three-per-em space
    "\\u2005" + // four-per-em space
    "\\u2006" + // six-per-em space
    "\\u2007" + // figure space
    "\\u2008" + // punctuation space
    "\\u2009" + // thin space
    "\\u200a" + // hair space
    "\\u202f" + // narrow no-break space
    "\\u205f" + // medium mathematical space
    "\\u3000" + // ideographic space
    "\\ufeff" // byte order mark
    ;
| |
| static String unicodeEscape(final char ch) { |
| final StringBuilder sb = new StringBuilder(); |
| |
| sb.append("\\u"); |
| |
| final String hex = Integer.toHexString(ch); |
| for (int i = hex.length(); i < 4; i++) { |
| sb.append('0'); |
| } |
| sb.append(hex); |
| |
| return sb.toString(); |
| } |
| |
| /** |
| * Constructor |
| * |
| * @param source the source |
| * @param stream the token stream to lex |
| */ |
public Lexer(final Source source, final TokenStream stream) {
    // Lex the whole source (offset 0, full length) with scripting mode off.
    this(source, 0, source.getLength(), stream, false);
}
| |
| /** |
| * Constructor |
| * |
| * @param source the source |
| * @param stream the token stream to lex |
| * @param scripting are we in scripting mode |
| */ |
public Lexer(final Source source, final TokenStream stream, final boolean scripting) {
    // Lex the whole source: start at offset 0 and cover source.getLength() characters.
    this(source, 0, source.getLength(), stream, scripting);
}
| |
| /** |
| * Constructor |
| * |
| * @param source the source |
| * @param start start position in source from which to start lexing |
| * @param len length of source segment to lex |
| * @param stream token stream to lex |
| * @param scripting are we in scripting mode |
| */ |
| |
public Lexer(final Source source, final int start, final int len, final TokenStream stream, final boolean scripting) {
    // The scanner starts on line 1 of the source content.
    super(source.getContent(), 1, start, len);

    // Collaborators.
    this.stream = stream;
    this.source = source;

    // Mode flags: a directly constructed lexer is never a nested scan.
    this.nested = false;
    this.scripting = scripting;

    // Initial token bookkeeping; linePosition keeps its default value of 0.
    this.last = EOL;
    this.pendingLine = 1;
}
| |
private Lexer(final Lexer lexer, final State state) {
    // Scanner part is rebuilt from the parent lexer and the captured state.
    super(lexer, state);

    // A lexer created this way is always a nested scan.
    this.nested = true;

    // Share the parent's collaborators and mode.
    this.scripting = lexer.scripting;
    this.stream = lexer.stream;
    this.source = lexer.source;

    // Line bookkeeping comes from the captured state; note that the last
    // token type is reset to EOL rather than copied from the state.
    this.linePosition = state.linePosition;
    this.pendingLine = state.pendingLine;
    this.last = EOL;
}
| |
| static class State extends Scanner.State { |
| /** Pending new line number and position. */ |
| public final int pendingLine; |
| |
| /** Position of last EOL + 1. */ |
| public final int linePosition; |
| |
| /** Type of last token added. */ |
| public final TokenType last; |
| |
| /* |
| * Constructor. |
| */ |
| |
| State(final int position, final int limit, final int line, final int pendingLine, final int linePosition, final TokenType last) { |
| super(position, limit, line); |
| |
| this.pendingLine = pendingLine; |
| this.linePosition = linePosition; |
| this.last = last; |
| } |
| } |
| |
| /** |
| * Save the state of the scan. |
| * |
| * @return Captured state. |
| */ |
| @Override |
| State saveState() { |
| return new State(position, limit, line, pendingLine, linePosition, last); |
| } |
| |
| /** |
| * Restore the state of the scan. |
| * |
| * @param state |
| * Captured state. |
| */ |
| void restoreState(final State state) { |
| super.restoreState(state); |
| |
| pendingLine = state.pendingLine; |
| linePosition = state.linePosition; |
| last = state.last; |
| } |
| |
| /** |
| * Add a new token to the stream. |
| * |
| * @param type |
| * Token type. |
| * @param start |
| * Start position. |
| * @param end |
| * End position. |
| */ |
| protected void add(final TokenType type, final int start, final int end) { |
| // Record last token. |
| last = type; |
| |
| // Only emit the last EOL in a cluster. |
| if (type == EOL) { |
| pendingLine = end; |
| linePosition = start; |
| } else { |
| // Write any pending EOL to stream. |
| if (pendingLine != -1) { |
| stream.put(Token.toDesc(EOL, linePosition, pendingLine)); |
| pendingLine = -1; |
| } |
| |
| // Write token to stream. |
| stream.put(Token.toDesc(type, start, end - start)); |
| } |
| } |
| |
| /** |
| * Add a new token to the stream. |
| * |
| * @param type |
| * Token type. |
| * @param start |
| * Start position. |
| */ |
| protected void add(final TokenType type, final int start) { |
| add(type, start, position); |
| } |
| |
| /** |
| * Return the String of valid whitespace characters for regular |
| * expressions in JavaScript |
| * @return regexp whitespace string |
| */ |
public static String getWhitespaceRegExp() {
    // The constant holds escaped text (a literal backslash, 'u' and four hex
    // digits per character), not the whitespace characters themselves.
    return JAVASCRIPT_WHITESPACE_IN_REGEXP;
}
| |
| /** |
| * Skip end of line. |
| * |
| * @param addEOL true if EOL token should be recorded. |
| */ |
| private void skipEOL(final boolean addEOL) { |
| |
| if (ch0 == '\r') { // detect \r\n pattern |
| skip(1); |
| if (ch0 == '\n') { |
| skip(1); |
| } |
| } else { // all other space, ch0 is guaranteed to be EOL or \0 |
| skip(1); |
| } |
| |
| // bump up line count |
| line++; |
| |
| if (addEOL) { |
| // Add an EOL token. |
| add(EOL, position, line); |
| } |
| } |
| |
| /** |
| * Skip over rest of line including end of line. |
| * |
| * @param addEOL true if EOL token should be recorded. |
| */ |
| private void skipLine(final boolean addEOL) { |
| // Ignore characters. |
| while (!isEOL(ch0) && !atEOF()) { |
| skip(1); |
| } |
| // Skip over end of line. |
| skipEOL(addEOL); |
| } |
| |
| /** |
| * Test whether a char is valid JavaScript whitespace |
| * @param ch a char |
| * @return true if valid JavaScript whitespace |
| */ |
| public static boolean isJSWhitespace(final char ch) { |
| return JAVASCRIPT_WHITESPACE.indexOf(ch) != -1; |
| } |
| |
| /** |
| * Test whether a char is valid JavaScript end of line |
| * @param ch a char |
| * @return true if valid JavaScript end of line |
| */ |
| public static boolean isJSEOL(final char ch) { |
| return JAVASCRIPT_WHITESPACE_EOL.indexOf(ch) != -1; |
| } |
| |
| /** |
| * Test whether a char is valid JSON whitespace |
| * @param ch a char |
| * @return true if valid JSON whitespace |
| */ |
| public static boolean isJsonWhitespace(final char ch) { |
| return JSON_WHITESPACE.indexOf(ch) != -1; |
| } |
| |
| /** |
| * Test whether a char is valid JSON end of line |
| * @param ch a char |
| * @return true if valid JSON end of line |
| */ |
| public static boolean isJsonEOL(final char ch) { |
| return JSON_WHITESPACE_EOL.indexOf(ch) != -1; |
| } |
| |
| /** |
| * Test if char is a string delimiter, e.g. '\'' or '"'. Also accepts exec |
| * string delimiters ('`') in scripting mode. |
| * @param ch a char |
| * @return true if string delimiter |
| */ |
| protected boolean isStringDelimiter(final char ch) { |
| return ch == '\'' || ch == '"' || (scripting && ch == '`'); |
| } |
| |
| /** |
| * Test whether a char is valid JavaScript whitespace |
| * @param ch a char |
| * @return true if valid JavaScript whitespace |
| */ |
| protected boolean isWhitespace(final char ch) { |
| return Lexer.isJSWhitespace(ch); |
| } |
| |
| /** |
| * Test whether a char is valid JavaScript end of line |
| * @param ch a char |
| * @return true if valid JavaScript end of line |
| */ |
| protected boolean isEOL(final char ch) { |
| return Lexer.isJSEOL(ch); |
| } |
| |
| /** |
| * Skip over whitespace and detect end of line, adding EOL tokens if |
| * encountered. |
| * |
| * @param addEOL true if EOL tokens should be recorded. |
| */ |
| private void skipWhitespace(final boolean addEOL) { |
| while (isWhitespace(ch0)) { |
| if (isEOL(ch0)) { |
| skipEOL(addEOL); |
| } else { |
| skip(1); |
| } |
| } |
| } |
| |
| /** |
| * Skip over comments. |
| * |
| * @return True if a comment. |
| */ |
| protected boolean skipComments() { |
| // Save the current position. |
| final int start = position; |
| |
| if (ch0 == '/') { |
| // Is it a // comment. |
| if (ch1 == '/') { |
| // Skip over //. |
| skip(2); |
| |
| boolean directiveComment = false; |
| if ((ch0 == '#' || ch0 == '@') && (ch1 == ' ')) { |
| directiveComment = true; |
| } |
| |
| // Scan for EOL. |
| while (!atEOF() && !isEOL(ch0)) { |
| skip(1); |
| } |
| // Did detect a comment. |
| add(directiveComment? DIRECTIVE_COMMENT : COMMENT, start); |
| return true; |
| } else if (ch1 == '*') { |
| // Skip over /*. |
| skip(2); |
| // Scan for */. |
| while (!atEOF() && !(ch0 == '*' && ch1 == '/')) { |
| // If end of line handle else skip character. |
| if (isEOL(ch0)) { |
| skipEOL(true); |
| } else { |
| skip(1); |
| } |
| } |
| |
| if (atEOF()) { |
| // TODO - Report closing */ missing in parser. |
| add(ERROR, start); |
| } else { |
| // Skip */. |
| skip(2); |
| } |
| |
| // Did detect a comment. |
| add(COMMENT, start); |
| return true; |
| } |
| } else if (ch0 == '#') { |
| assert scripting; |
| // shell style comment |
| // Skip over #. |
| skip(1); |
| // Scan for EOL. |
| while (!atEOF() && !isEOL(ch0)) { |
| skip(1); |
| } |
| // Did detect a comment. |
| add(COMMENT, start); |
| return true; |
| } |
| |
| // Not a comment. |
| return false; |
| } |
| |
| /** |
| * Convert a regex token to a token object. |
| * |
| * @param start Position in source content. |
| * @param length Length of regex token. |
| * @return Regex token object. |
| */ |
| public RegexToken valueOfPattern(final int start, final int length) { |
| // Save the current position. |
| final int savePosition = position; |
| // Reset to beginning of content. |
| reset(start); |
| // Buffer for recording characters. |
| final StringBuilder sb = new StringBuilder(length); |
| |
| // Skip /. |
| skip(1); |
| boolean inBrackets = false; |
| // Scan for closing /, stopping at end of line. |
| while (!atEOF() && ch0 != '/' && !isEOL(ch0) || inBrackets) { |
| // Skip over escaped character. |
| if (ch0 == '\\') { |
| sb.append(ch0); |
| sb.append(ch1); |
| skip(2); |
| } else { |
| if (ch0 == '[') { |
| inBrackets = true; |
| } else if (ch0 == ']') { |
| inBrackets = false; |
| } |
| |
| // Skip literal character. |
| sb.append(ch0); |
| skip(1); |
| } |
| } |
| |
| // Get pattern as string. |
| final String regex = sb.toString(); |
| |
| // Skip /. |
| skip(1); |
| |
| // Options as string. |
| final String options = source.getString(position, scanIdentifier()); |
| |
| reset(savePosition); |
| |
| // Compile the pattern. |
| return new RegexToken(regex, options); |
| } |
| |
| /** |
| * Return true if the given token can be the beginning of a literal. |
| * |
| * @param token a token |
| * @return true if token can start a literal. |
| */ |
| public boolean canStartLiteral(final TokenType token) { |
| return token.startsWith('/') || ((scripting || XML_LITERALS) && token.startsWith('<')); |
| } |
| |
| /** |
| * interface to receive line information for multi-line literals. |
| */ |
| protected interface LineInfoReceiver { |
| /** |
| * Receives line information |
| * @param line last line number |
| * @param linePosition position of last line |
| */ |
| public void lineInfo(int line, int linePosition); |
| } |
| |
| /** |
| * Check whether the given token represents the beginning of a literal. If so scan |
| * the literal and return <tt>true</tt>, otherwise return false. |
| * |
| * @param token the token. |
| * @param startTokenType the token type. |
| * @param lir LineInfoReceiver that receives line info for multi-line string literals. |
| * @return True if a literal beginning with startToken was found and scanned. |
| */ |
| protected boolean scanLiteral(final long token, final TokenType startTokenType, final LineInfoReceiver lir) { |
| // Check if it can be a literal. |
| if (!canStartLiteral(startTokenType)) { |
| return false; |
| } |
| // We break on ambiguous tokens so if we already moved on it can't be a literal. |
| if (stream.get(stream.last()) != token) { |
| return false; |
| } |
| // Rewind to token start position |
| reset(Token.descPosition(token)); |
| |
| if (ch0 == '/') { |
| return scanRegEx(); |
| } else if (ch0 == '<') { |
| if (ch1 == '<') { |
| return scanHereString(lir); |
| } else if (Character.isJavaIdentifierStart(ch1)) { |
| return scanXMLLiteral(); |
| } |
| } |
| |
| return false; |
| } |
| |
| /** |
| * Scan over regex literal. |
| * |
| * @return True if a regex literal. |
| */ |
| private boolean scanRegEx() { |
| assert ch0 == '/'; |
| // Make sure it's not a comment. |
| if (ch1 != '/' && ch1 != '*') { |
| // Record beginning of literal. |
| final int start = position; |
| // Skip /. |
| skip(1); |
| boolean inBrackets = false; |
| |
| // Scan for closing /, stopping at end of line. |
| while (!atEOF() && (ch0 != '/' || inBrackets) && !isEOL(ch0)) { |
| // Skip over escaped character. |
| if (ch0 == '\\') { |
| skip(1); |
| if (isEOL(ch0)) { |
| reset(start); |
| return false; |
| } |
| skip(1); |
| } else { |
| if (ch0 == '[') { |
| inBrackets = true; |
| } else if (ch0 == ']') { |
| inBrackets = false; |
| } |
| |
| // Skip literal character. |
| skip(1); |
| } |
| } |
| |
| // If regex literal. |
| if (ch0 == '/') { |
| // Skip /. |
| skip(1); |
| |
| // Skip over options. |
| while (!atEOF() && Character.isJavaIdentifierPart(ch0) || ch0 == '\\' && ch1 == 'u') { |
| skip(1); |
| } |
| |
| // Add regex token. |
| add(REGEX, start); |
| // Regex literal detected. |
| return true; |
| } |
| |
| // False start try again. |
| reset(start); |
| } |
| |
| // Regex literal not detected. |
| return false; |
| } |
| |
| /** |
| * Convert a digit to an integer. Can't use Character.digit since we are |
| * restricted to ASCII by the spec. |
| * |
| * @param ch Character to convert. |
| * @param base Numeric base. |
| * |
| * @return The converted digit or -1 if invalid. |
| */ |
| protected static int convertDigit(final char ch, final int base) { |
| int digit; |
| |
| if ('0' <= ch && ch <= '9') { |
| digit = ch - '0'; |
| } else if ('A' <= ch && ch <= 'Z') { |
| digit = ch - 'A' + 10; |
| } else if ('a' <= ch && ch <= 'z') { |
| digit = ch - 'a' + 10; |
| } else { |
| return -1; |
| } |
| |
| return digit < base ? digit : -1; |
| } |
| |
| |
| /** |
| * Get the value of a hexadecimal numeric sequence. |
| * |
| * @param length Number of digits. |
| * @param type Type of token to report against. |
| * @return Value of sequence or < 0 if no digits. |
| */ |
| private int hexSequence(final int length, final TokenType type) { |
| int value = 0; |
| |
| for (int i = 0; i < length; i++) { |
| final int digit = convertDigit(ch0, 16); |
| |
| if (digit == -1) { |
| error(Lexer.message("invalid.hex"), type, position, limit); |
| return i == 0 ? -1 : value; |
| } |
| |
| value = digit | value << 4; |
| skip(1); |
| } |
| |
| return value; |
| } |
| |
| /** |
| * Get the value of an octal numeric sequence. This parses up to 3 digits with a maximum value of 255. |
| * |
| * @return Value of sequence. |
| */ |
| private int octalSequence() { |
| int value = 0; |
| |
| for (int i = 0; i < 3; i++) { |
| final int digit = convertDigit(ch0, 8); |
| |
| if (digit == -1) { |
| break; |
| } |
| value = digit | value << 3; |
| skip(1); |
| |
| if (i == 1 && value >= 32) { |
| break; |
| } |
| } |
| return value; |
| } |
| |
| /** |
| * Convert a string to a JavaScript identifier. |
| * |
| * @param start Position in source content. |
| * @param length Length of token. |
| * @return Ident string or null if an error. |
| */ |
| private String valueOfIdent(final int start, final int length) throws RuntimeException { |
| // Save the current position. |
| final int savePosition = position; |
| // End of scan. |
| final int end = start + length; |
| // Reset to beginning of content. |
| reset(start); |
| // Buffer for recording characters. |
| final StringBuilder sb = new StringBuilder(length); |
| |
| // Scan until end of line or end of file. |
| while (!atEOF() && position < end && !isEOL(ch0)) { |
| // If escape character. |
| if (ch0 == '\\' && ch1 == 'u') { |
| skip(2); |
| final int ch = hexSequence(4, TokenType.IDENT); |
| if (isWhitespace((char)ch)) { |
| return null; |
| } |
| if (ch < 0) { |
| sb.append('\\'); |
| sb.append('u'); |
| } else { |
| sb.append((char)ch); |
| } |
| } else { |
| // Add regular character. |
| sb.append(ch0); |
| skip(1); |
| } |
| } |
| |
| // Restore position. |
| reset(savePosition); |
| |
| return sb.toString(); |
| } |
| |
| /** |
| * Scan over an identifier or keyword. Handles identifiers containing |
| * encoded Unicode chars. |
| * |
| * Example: |
| * |
| * var \u0042 = 44; |
| */ |
| private void scanIdentifierOrKeyword() { |
| // Record beginning of identifier. |
| final int start = position; |
| // Scan identifier. |
| final int length = scanIdentifier(); |
| // Check to see if it is a keyword. |
| final TokenType type = TokenLookup.lookupKeyword(content, start, length); |
| // Add keyword or identifier token. |
| add(type, start); |
| } |
| |
| /** |
| * Convert a string to a JavaScript string object. |
| * |
| * @param start Position in source content. |
| * @param length Length of token. |
| * @return JavaScript string object. |
| */ |
| private String valueOfString(final int start, final int length, final boolean strict) throws RuntimeException { |
| // Save the current position. |
| final int savePosition = position; |
| // Calculate the end position. |
| final int end = start + length; |
| // Reset to beginning of string. |
| reset(start); |
| |
| // Buffer for recording characters. |
| final StringBuilder sb = new StringBuilder(length); |
| |
| // Scan until end of string. |
| while (position < end) { |
| // If escape character. |
| if (ch0 == '\\') { |
| skip(1); |
| |
| final char next = ch0; |
| final int afterSlash = position; |
| |
| skip(1); |
| |
| // Special characters. |
| switch (next) { |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': { |
| if (strict) { |
| // "\0" itself is allowed in strict mode. Only other 'real' |
| // octal escape sequences are not allowed (eg. "\02", "\31"). |
| // See section 7.8.4 String literals production EscapeSequence |
| if (next != '0' || (ch0 >= '0' && ch0 <= '9')) { |
| error(Lexer.message("strict.no.octal"), STRING, position, limit); |
| } |
| } |
| reset(afterSlash); |
| // Octal sequence. |
| final int ch = octalSequence(); |
| |
| if (ch < 0) { |
| sb.append('\\'); |
| sb.append('x'); |
| } else { |
| sb.append((char)ch); |
| } |
| break; |
| } |
| case 'n': |
| sb.append('\n'); |
| break; |
| case 't': |
| sb.append('\t'); |
| break; |
| case 'b': |
| sb.append('\b'); |
| break; |
| case 'f': |
| sb.append('\f'); |
| break; |
| case 'r': |
| sb.append('\r'); |
| break; |
| case '\'': |
| sb.append('\''); |
| break; |
| case '\"': |
| sb.append('\"'); |
| break; |
| case '\\': |
| sb.append('\\'); |
| break; |
| case '\r': // CR | CRLF |
| if (ch0 == '\n') { |
| skip(1); |
| } |
| // fall through |
| case '\n': // LF |
| case '\u2028': // LS |
| case '\u2029': // PS |
| // continue on the next line, slash-return continues string |
| // literal |
| break; |
| case 'x': { |
| // Hex sequence. |
| final int ch = hexSequence(2, STRING); |
| |
| if (ch < 0) { |
| sb.append('\\'); |
| sb.append('x'); |
| } else { |
| sb.append((char)ch); |
| } |
| } |
| break; |
| case 'u': { |
| // Unicode sequence. |
| final int ch = hexSequence(4, STRING); |
| |
| if (ch < 0) { |
| sb.append('\\'); |
| sb.append('u'); |
| } else { |
| sb.append((char)ch); |
| } |
| } |
| break; |
| case 'v': |
| sb.append('\u000B'); |
| break; |
| // All other characters. |
| default: |
| sb.append(next); |
| break; |
| } |
| } else { |
| // Add regular character. |
| sb.append(ch0); |
| skip(1); |
| } |
| } |
| |
| // Restore position. |
| reset(savePosition); |
| |
| return sb.toString(); |
| } |
| |
| /** |
| * Scan over a string literal. |
| * @param add true if we are not just scanning but should actually modify the token stream |
| */ |
| protected void scanString(final boolean add) { |
| // Type of string. |
| TokenType type = STRING; |
| // Record starting quote. |
| final char quote = ch0; |
| // Skip over quote. |
| skip(1); |
| |
| // Record beginning of string content. |
| final State stringState = saveState(); |
| |
| // Scan until close quote or end of line. |
| while (!atEOF() && ch0 != quote && !isEOL(ch0)) { |
| // Skip over escaped character. |
| if (ch0 == '\\') { |
| type = ESCSTRING; |
| skip(1); |
| if (! isEscapeCharacter(ch0)) { |
| error(Lexer.message("invalid.escape.char"), STRING, position, limit); |
| } |
| if (isEOL(ch0)) { |
| // Multiline string literal |
| skipEOL(false); |
| continue; |
| } |
| } |
| // Skip literal character. |
| skip(1); |
| } |
| |
| // If close quote. |
| if (ch0 == quote) { |
| // Skip close quote. |
| skip(1); |
| } else { |
| error(Lexer.message("missing.close.quote"), STRING, position, limit); |
| } |
| |
| // If not just scanning. |
| if (add) { |
| // Record end of string. |
| stringState.setLimit(position - 1); |
| |
| if (scripting && !stringState.isEmpty()) { |
| switch (quote) { |
| case '`': |
| // Mark the beginning of an exec string. |
| add(EXECSTRING, stringState.position, stringState.limit); |
| // Frame edit string with left brace. |
| add(LBRACE, stringState.position, stringState.position); |
| // Process edit string. |
| editString(type, stringState); |
| // Frame edit string with right brace. |
| add(RBRACE, stringState.limit, stringState.limit); |
| break; |
| case '"': |
| // Only edit double quoted strings. |
| editString(type, stringState); |
| break; |
| case '\'': |
| // Add string token without editing. |
| add(type, stringState.position, stringState.limit); |
| break; |
| default: |
| break; |
| } |
| } else { |
| /// Add string token without editing. |
| add(type, stringState.position, stringState.limit); |
| } |
| } |
| } |
| |
| /** |
| * Is the given character a valid escape char after "\" ? |
| * |
| * @param ch character to be checked |
| * @return if the given character is valid after "\" |
| */ |
protected boolean isEscapeCharacter(final char ch) {
    // This lexer accepts any character after a backslash; the method is
    // protected and non-final, so a subclass can narrow the accepted set.
    return true;
}
| |
| /** |
| * Convert string to number. |
| * |
| * @param valueString String to convert. |
| * @param radix Numeric base. |
| * @return Converted number. |
| */ |
| private static Number valueOf(final String valueString, final int radix) throws NumberFormatException { |
| try { |
| final long value = Long.parseLong(valueString, radix); |
| if(value >= MIN_INT_L && value <= MAX_INT_L) { |
| return Integer.valueOf((int)value); |
| } |
| return Long.valueOf(value); |
| } catch (final NumberFormatException e) { |
| if (radix == 10) { |
| return Double.valueOf(valueString); |
| } |
| |
| double value = 0.0; |
| |
| for (int i = 0; i < valueString.length(); i++) { |
| final char ch = valueString.charAt(i); |
| // Preverified, should always be a valid digit. |
| final int digit = convertDigit(ch, radix); |
| value *= radix; |
| value += digit; |
| } |
| |
| return value; |
| } |
| } |
| |
| /** |
| * Scan a number. |
| */ |
| protected void scanNumber() { |
| // Record beginning of number. |
| final int start = position; |
| // Assume value is a decimal. |
| TokenType type = DECIMAL; |
| |
| // First digit of number. |
| int digit = convertDigit(ch0, 10); |
| |
| // If number begins with 0x. |
| if (digit == 0 && (ch1 == 'x' || ch1 == 'X') && convertDigit(ch2, 16) != -1) { |
| // Skip over 0xN. |
| skip(3); |
| // Skip over remaining digits. |
| while (convertDigit(ch0, 16) != -1) { |
| skip(1); |
| } |
| |
| type = HEXADECIMAL; |
| } else { |
| // Check for possible octal constant. |
| boolean octal = digit == 0; |
| // Skip first digit if not leading '.'. |
| if (digit != -1) { |
| skip(1); |
| } |
| |
| // Skip remaining digits. |
| while ((digit = convertDigit(ch0, 10)) != -1) { |
| // Check octal only digits. |
| octal = octal && digit < 8; |
| // Skip digit. |
| skip(1); |
| } |
| |
| if (octal && position - start > 1) { |
| type = OCTAL; |
| } else if (ch0 == '.' || ch0 == 'E' || ch0 == 'e') { |
| // Must be a double. |
| if (ch0 == '.') { |
| // Skip period. |
| skip(1); |
| // Skip mantissa. |
| while (convertDigit(ch0, 10) != -1) { |
| skip(1); |
| } |
| } |
| |
| // Detect exponent. |
| if (ch0 == 'E' || ch0 == 'e') { |
| // Skip E. |
| skip(1); |
| // Detect and skip exponent sign. |
| if (ch0 == '+' || ch0 == '-') { |
| skip(1); |
| } |
| // Skip exponent. |
| while (convertDigit(ch0, 10) != -1) { |
| skip(1); |
| } |
| } |
| |
| type = FLOATING; |
| } |
| } |
| |
| if (Character.isJavaIdentifierStart(ch0)) { |
| error(Lexer.message("missing.space.after.number"), type, position, 1); |
| } |
| |
| // Add number token. |
| add(type, start); |
| } |
| |
| /** |
| * Convert an XML token to a token object. |
| * |
| * @param start Position in source content. |
| * @param length Length of XML token. |
| * @return XML token object. |
| */ |
| XMLToken valueOfXML(final int start, final int length) { |
| return new XMLToken(source.getString(start, length)); |
| } |
| |
| /** |
| * Scan over an XML token. |
| * |
| * @return TRUE if is an XML literal. |
| */ |
| private boolean scanXMLLiteral() { |
| assert ch0 == '<' && Character.isJavaIdentifierStart(ch1); |
| if (XML_LITERALS) { |
| // Record beginning of xml expression. |
| final int start = position; |
| |
| int openCount = 0; |
| |
| do { |
| if (ch0 == '<') { |
| if (ch1 == '/' && Character.isJavaIdentifierStart(ch2)) { |
| skip(3); |
| openCount--; |
| } else if (Character.isJavaIdentifierStart(ch1)) { |
| skip(2); |
| openCount++; |
| } else if (ch1 == '?') { |
| skip(2); |
| } else if (ch1 == '!' && ch2 == '-' && ch3 == '-') { |
| skip(4); |
| } else { |
| reset(start); |
| return false; |
| } |
| |
| while (!atEOF() && ch0 != '>') { |
| if (ch0 == '/' && ch1 == '>') { |
| openCount--; |
| skip(1); |
| break; |
| } else if (ch0 == '\"' || ch0 == '\'') { |
| scanString(false); |
| } else { |
| skip(1); |
| } |
| } |
| |
| if (ch0 != '>') { |
| reset(start); |
| return false; |
| } |
| |
| skip(1); |
| } else if (atEOF()) { |
| reset(start); |
| return false; |
| } else { |
| skip(1); |
| } |
| } while (openCount > 0); |
| |
| add(XML, start); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /** |
| * Scan over identifier characters. |
| * |
| * @return Length of identifier or zero if none found. |
| */ |
| private int scanIdentifier() { |
| final int start = position; |
| |
| // Make sure first character is valid start character. |
| if (ch0 == '\\' && ch1 == 'u') { |
| skip(2); |
| final int ch = hexSequence(4, TokenType.IDENT); |
| |
| if (!Character.isJavaIdentifierStart(ch)) { |
| error(Lexer.message("illegal.identifier.character"), TokenType.IDENT, start, position); |
| } |
| } else if (!Character.isJavaIdentifierStart(ch0)) { |
| // Not an identifier. |
| return 0; |
| } |
| |
| // Make sure remaining characters are valid part characters. |
| while (!atEOF()) { |
| if (ch0 == '\\' && ch1 == 'u') { |
| skip(2); |
| final int ch = hexSequence(4, TokenType.IDENT); |
| |
| if (!Character.isJavaIdentifierPart(ch)) { |
| error(Lexer.message("illegal.identifier.character"), TokenType.IDENT, start, position); |
| } |
| } else if (Character.isJavaIdentifierPart(ch0)) { |
| skip(1); |
| } else { |
| break; |
| } |
| } |
| |
| // Length of identifier sequence. |
| return position - start; |
| } |
| |
| /** |
| * Compare two identifiers (in content) for equality. |
| * |
| * @param aStart Start of first identifier. |
| * @param aLength Length of first identifier. |
| * @param bStart Start of second identifier. |
| * @param bLength Length of second identifier. |
| * @return True if equal. |
| */ |
| private boolean identifierEqual(final int aStart, final int aLength, final int bStart, final int bLength) { |
| if (aLength == bLength) { |
| for (int i = 0; i < aLength; i++) { |
| if (content[aStart + i] != content[bStart + i]) { |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /** |
| * Detect if a line starts with a marker identifier. |
| * |
| * @param identStart Start of identifier. |
| * @param identLength Length of identifier. |
| * @return True if detected. |
| */ |
| private boolean hasHereMarker(final int identStart, final int identLength) { |
| // Skip any whitespace. |
| skipWhitespace(false); |
| |
| return identifierEqual(identStart, identLength, position, scanIdentifier()); |
| } |
| |
| /** |
| * Lexer to service edit strings. |
| */ |
| private static class EditStringLexer extends Lexer { |
| /** Type of string literals to emit. */ |
| final TokenType stringType; |
| |
| /* |
| * Constructor. |
| */ |
| |
| EditStringLexer(final Lexer lexer, final TokenType stringType, final State stringState) { |
| super(lexer, stringState); |
| |
| this.stringType = stringType; |
| } |
| |
| /** |
| * Lexify the contents of the string. |
| */ |
| @Override |
| public void lexify() { |
| // Record start of string position. |
| int stringStart = position; |
| // Indicate that the priming first string has not been emitted. |
| boolean primed = false; |
| |
| while (true) { |
| // Detect end of content. |
| if (atEOF()) { |
| break; |
| } |
| |
| // Honour escapes (should be well formed.) |
| if (ch0 == '\\' && stringType == ESCSTRING) { |
| skip(2); |
| |
| continue; |
| } |
| |
| // If start of expression. |
| if (ch0 == '$' && ch1 == '{') { |
| if (!primed || stringStart != position) { |
| if (primed) { |
| add(ADD, stringStart, stringStart + 1); |
| } |
| |
| add(stringType, stringStart, position); |
| primed = true; |
| } |
| |
| // Skip ${ |
| skip(2); |
| |
| // Save expression state. |
| final State expressionState = saveState(); |
| |
| // Start with one open brace. |
| int braceCount = 1; |
| |
| // Scan for the rest of the string. |
| while (!atEOF()) { |
| // If closing brace. |
| if (ch0 == '}') { |
| // Break only only if matching brace. |
| if (--braceCount == 0) { |
| break; |
| } |
| } else if (ch0 == '{') { |
| // Bump up the brace count. |
| braceCount++; |
| } |
| |
| // Skip to next character. |
| skip(1); |
| } |
| |
| // If braces don't match then report an error. |
| if (braceCount != 0) { |
| error(Lexer.message("edit.string.missing.brace"), LBRACE, expressionState.position - 1, 1); |
| } |
| |
| // Mark end of expression. |
| expressionState.setLimit(position); |
| // Skip closing brace. |
| skip(1); |
| |
| // Start next string. |
| stringStart = position; |
| |
| // Concatenate expression. |
| add(ADD, expressionState.position, expressionState.position + 1); |
| add(LPAREN, expressionState.position, expressionState.position + 1); |
| |
| // Scan expression. |
| final Lexer lexer = new Lexer(this, expressionState); |
| lexer.lexify(); |
| |
| // Close out expression parenthesis. |
| add(RPAREN, position - 1, position); |
| |
| continue; |
| } |
| |
| // Next character in string. |
| skip(1); |
| } |
| |
| // If there is any unemitted string portion. |
| if (stringStart != limit) { |
| // Concatenate remaining string. |
| if (primed) { |
| add(ADD, stringStart, 1); |
| } |
| |
| add(stringType, stringStart, limit); |
| } |
| } |
| |
| } |
| |
| /** |
| * Edit string for nested expressions. |
| * |
| * @param stringType Type of string literals to emit. |
| * @param stringState State of lexer at start of string. |
| */ |
| private void editString(final TokenType stringType, final State stringState) { |
| // Use special lexer to scan string. |
| final EditStringLexer lexer = new EditStringLexer(this, stringType, stringState); |
| lexer.lexify(); |
| |
| // Need to keep lexer informed. |
| last = stringType; |
| } |
| |
| /** |
| * Scan over a here string. |
| * |
| * @return TRUE if is a here string. |
| */ |
| private boolean scanHereString(final LineInfoReceiver lir) { |
| assert ch0 == '<' && ch1 == '<'; |
| if (scripting) { |
| // Record beginning of here string. |
| final State saved = saveState(); |
| |
| // << or <<< |
| final boolean excludeLastEOL = ch2 != '<'; |
| |
| if (excludeLastEOL) { |
| skip(2); |
| } else { |
| skip(3); |
| } |
| |
| // Scan identifier. |
| final int identStart = position; |
| final int identLength = scanIdentifier(); |
| |
| // Check for identifier. |
| if (identLength == 0) { |
| // Treat as shift. |
| restoreState(saved); |
| |
| return false; |
| } |
| |
| // Record rest of line. |
| final State restState = saveState(); |
| // keep line number updated |
| int lastLine = line; |
| |
| skipLine(false); |
| lastLine++; |
| int lastLinePosition = position; |
| restState.setLimit(position); |
| |
| // Record beginning of string. |
| final State stringState = saveState(); |
| int stringEnd = position; |
| |
| // Hunt down marker. |
| while (!atEOF()) { |
| // Skip any whitespace. |
| skipWhitespace(false); |
| |
| if (hasHereMarker(identStart, identLength)) { |
| break; |
| } |
| |
| skipLine(false); |
| lastLine++; |
| lastLinePosition = position; |
| stringEnd = position; |
| } |
| |
| // notify last line information |
| lir.lineInfo(lastLine, lastLinePosition); |
| |
| // Record end of string. |
| stringState.setLimit(stringEnd); |
| |
| // If marker is missing. |
| if (stringState.isEmpty() || atEOF()) { |
| error(Lexer.message("here.missing.end.marker", source.getString(identStart, identLength)), last, position, position); |
| restoreState(saved); |
| |
| return false; |
| } |
| |
| // Remove last end of line if specified. |
| if (excludeLastEOL) { |
| // Handles \n. |
| if (content[stringEnd - 1] == '\n') { |
| stringEnd--; |
| } |
| |
| // Handles \r and \r\n. |
| if (content[stringEnd - 1] == '\r') { |
| stringEnd--; |
| } |
| |
| // Update end of string. |
| stringState.setLimit(stringEnd); |
| } |
| |
| // Edit string if appropriate. |
| if (scripting && !stringState.isEmpty()) { |
| editString(STRING, stringState); |
| } else { |
| // Add here string. |
| add(STRING, stringState.position, stringState.limit); |
| } |
| |
| // Scan rest of original line. |
| final Lexer restLexer = new Lexer(this, restState); |
| |
| restLexer.lexify(); |
| |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /** |
| * Breaks source content down into lex units, adding tokens to the token |
| * stream. The routine scans until the stream buffer is full. Can be called |
| * repeatedly until EOF is detected. |
| */ |
| public void lexify() { |
| while (!stream.isFull() || nested) { |
| // Skip over whitespace. |
| skipWhitespace(true); |
| |
| // Detect end of file. |
| if (atEOF()) { |
| if (!nested) { |
| // Add an EOF token at the end. |
| add(EOF, position); |
| } |
| |
| break; |
| } |
| |
| // Check for comments. Note that we don't scan for regexp and other literals here as |
| // we may not have enough context to distinguish them from similar looking operators. |
| // Instead we break on ambiguous operators below and let the parser decide. |
| if (ch0 == '/' && skipComments()) { |
| continue; |
| } |
| |
| if (scripting && ch0 == '#' && skipComments()) { |
| continue; |
| } |
| |
| // TokenType for lookup of delimiter or operator. |
| TokenType type; |
| |
| if (ch0 == '.' && convertDigit(ch1, 10) != -1) { |
| // '.' followed by digit. |
| // Scan and add a number. |
| scanNumber(); |
| } else if ((type = TokenLookup.lookupOperator(ch0, ch1, ch2, ch3)) != null) { |
| // Get the number of characters in the token. |
| final int typeLength = type.getLength(); |
| // Skip that many characters. |
| skip(typeLength); |
| // Add operator token. |
| add(type, position - typeLength); |
| // Some operator tokens also mark the beginning of regexp, XML, or here string literals. |
| // We break to let the parser decide what it is. |
| if (canStartLiteral(type)) { |
| break; |
| } |
| } else if (Character.isJavaIdentifierStart(ch0) || ch0 == '\\' && ch1 == 'u') { |
| // Scan and add identifier or keyword. |
| scanIdentifierOrKeyword(); |
| } else if (isStringDelimiter(ch0)) { |
| // Scan and add a string. |
| scanString(true); |
| } else if (Character.isDigit(ch0)) { |
| // Scan and add a number. |
| scanNumber(); |
| } else { |
| // Don't recognize this character. |
| skip(1); |
| add(ERROR, position - 1); |
| } |
| } |
| } |
| |
| /** |
| * Return value of token given its token descriptor. |
| * |
| * @param token Token descriptor. |
| * @return JavaScript value. |
| */ |
| Object getValueOf(final long token, final boolean strict) { |
| final int start = Token.descPosition(token); |
| final int len = Token.descLength(token); |
| |
| switch (Token.descType(token)) { |
| case DECIMAL: |
| return Lexer.valueOf(source.getString(start, len), 10); // number |
| case OCTAL: |
| return Lexer.valueOf(source.getString(start, len), 8); // number |
| case HEXADECIMAL: |
| return Lexer.valueOf(source.getString(start + 2, len - 2), 16); // number |
| case FLOATING: |
| final String str = source.getString(start, len); |
| final double value = Double.valueOf(str); |
| if (str.indexOf('.') != -1) { |
| return value; //number |
| } |
| //anything without an explicit decimal point is still subject to a |
| //"representable as int or long" check. Then the programmer does not |
| //explicitly code something as a double. For example new Color(int, int, int) |
| //and new Color(float, float, float) will get ambiguous for cases like |
| //new Color(1.0, 1.5, 1.5) if we don't respect the decimal point. |
| //yet we don't want e.g. 1e6 to be a double unnecessarily |
| if (JSType.isRepresentableAsInt(value) && !JSType.isNegativeZero(value)) { |
| return (int)value; |
| } else if (JSType.isRepresentableAsLong(value) && !JSType.isNegativeZero(value)) { |
| return (long)value; |
| } |
| return value; |
| case STRING: |
| return source.getString(start, len); // String |
| case ESCSTRING: |
| return valueOfString(start, len, strict); // String |
| case IDENT: |
| return valueOfIdent(start, len); // String |
| case REGEX: |
| return valueOfPattern(start, len); // RegexToken::LexerToken |
| case XML: |
| return valueOfXML(start, len); // XMLToken::LexerToken |
| case DIRECTIVE_COMMENT: |
| return source.getString(start, len); |
| default: |
| break; |
| } |
| |
| return null; |
| } |
| |
| /** |
| * Get the correctly localized error message for a given message id format arguments |
| * @param msgId message id |
| * @param args format arguments |
| * @return message |
| */ |
| protected static String message(final String msgId, final String... args) { |
| return ECMAErrors.getMessage("lexer.error." + msgId, args); |
| } |
| |
| /** |
| * Generate a runtime exception |
| * |
| * @param message error message |
| * @param type token type |
| * @param start start position of lexed error |
| * @param length length of lexed error |
| * @throws ParserException unconditionally |
| */ |
| protected void error(final String message, final TokenType type, final int start, final int length) throws ParserException { |
| final long token = Token.toDesc(type, start, length); |
| final int pos = Token.descPosition(token); |
| final int lineNum = source.getLine(pos); |
| final int columnNum = source.getColumn(pos); |
| final String formatted = ErrorManager.format(message, source, lineNum, columnNum, token); |
| throw new ParserException(JSErrorType.SYNTAX_ERROR, formatted, source, lineNum, columnNum, token); |
| } |
| |
| /** |
| * Helper class for Lexer tokens, e.g XML or RegExp tokens. |
| * This is the abstract superclass |
| */ |
| public static abstract class LexerToken { |
| private final String expression; |
| |
| /** |
| * Constructor |
| * @param expression token expression |
| */ |
| protected LexerToken(final String expression) { |
| this.expression = expression; |
| } |
| |
| /** |
| * Get the expression |
| * @return expression |
| */ |
| public String getExpression() { |
| return expression; |
| } |
| } |
| |
| /** |
| * Temporary container for regular expressions. |
| */ |
| public static class RegexToken extends LexerToken { |
| /** Options. */ |
| private final String options; |
| |
| /** |
| * Constructor. |
| * |
| * @param expression regexp expression |
| * @param options regexp options |
| */ |
| public RegexToken(final String expression, final String options) { |
| super(expression); |
| this.options = options; |
| } |
| |
| /** |
| * Get regexp options |
| * @return options |
| */ |
| public String getOptions() { |
| return options; |
| } |
| |
| @Override |
| public String toString() { |
| return '/' + getExpression() + '/' + options; |
| } |
| } |
| |
| /** |
| * Temporary container for XML expression. |
| */ |
| public static class XMLToken extends LexerToken { |
| |
| /** |
| * Constructor. |
| * |
| * @param expression XML expression |
| */ |
| public XMLToken(final String expression) { |
| super(expression); |
| } |
| } |
| } |