blob: ed79dd025ed9b9149fae61415345bda1adbdcd07 [file] [log] [blame]
/*
* Copyright 2016 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.turbine.parse;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.turbine.parse.UnicodeEscapePreprocessor.ASCII_SUB;
import static java.lang.Math.min;
import com.google.common.collect.ImmutableList;
import com.google.turbine.diag.SourceFile;
import com.google.turbine.diag.TurbineError;
import com.google.turbine.diag.TurbineError.ErrorKind;
import org.jspecify.nullness.Nullable;
/** A {@link Lexer} that streams input from a {@link UnicodeEscapePreprocessor}. */
public class StreamLexer implements Lexer {
private final UnicodeEscapePreprocessor reader;
/** The current input character. */
private int ch;
/** The start position of the current token. */
private int position;
/** The start position of the current numeric literal or identifier token. */
private int readFrom;
/** The value of the current string or character literal token. */
private String value = null;
/** A saved javadoc comment. */
private String javadoc = null;
public StreamLexer(UnicodeEscapePreprocessor reader) {
this.reader = reader;
eat();
}
/** Records the value of a literal. */
private void saveValue(String value) {
this.value = value;
}
/** Records the start position of a literal. */
private void readFrom() {
value = null;
readFrom = reader.position();
}
/** Consumes an input character. */
private void eat() {
ch = reader.next();
}
@Override
public @Nullable String javadoc() {
String result = javadoc;
javadoc = null;
if (result == null) {
return null;
}
verify(result.endsWith("*"), result);
return result.substring(0, result.length() - "*".length());
}
@Override
public String stringValue() {
if (value != null) {
return value;
}
return reader.readString(readFrom, reader.position());
}
@Override
public int position() {
return position;
}
@Override
public SourceFile source() {
return reader.source();
}
@Override
public Token next() {
OUTER:
while (true) {
position = reader.position();
switch (ch) {
case '\r':
case '\n':
case ' ':
case '\t':
case '\f':
eat();
continue OUTER;
case '/':
{
eat();
switch (ch) {
case '/':
while (true) {
eat();
switch (ch) {
case '\n':
case '\r':
eat();
continue OUTER;
case ASCII_SUB:
if (reader.done()) {
return Token.EOF;
}
eat();
break;
default: // fall out
}
}
case '*':
eat();
boolean sawStar = false;
boolean isJavadoc = false;
if (ch == '*') {
eat();
// handle empty non-javadoc comments: `/**/`
if (ch == '/') {
eat();
continue OUTER;
}
isJavadoc = true;
readFrom();
}
while (true) {
switch (ch) {
case '*':
eat();
sawStar = true;
break;
case '/':
if (sawStar) {
if (isJavadoc) {
// Save the comment, excluding the leading `/**` and including
// the trailing `/*`. The comment is trimmed and normalized later.
javadoc = stringValue();
verify(javadoc.endsWith("*"), javadoc);
}
eat();
continue OUTER;
}
sawStar = false;
eat();
break;
case ASCII_SUB:
if (reader.done()) {
throw TurbineError.format(
reader.source(), position, ErrorKind.UNCLOSED_COMMENT);
}
eat();
sawStar = false;
break;
default:
eat();
sawStar = false;
break;
}
}
default:
if (ch == '=') {
eat();
return Token.DIVEQ;
}
return Token.DIV;
}
}
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_':
case '$':
return identifier();
case ASCII_SUB:
if (!reader.done()) {
throw error(ErrorKind.UNEXPECTED_EOF);
}
return Token.EOF;
case '-':
case '=':
case '>':
case '<':
case '!':
case '~':
case '+':
case '?':
case ':':
case '*':
case '&':
case '|':
case '^':
case '%':
return operator();
case '(':
eat();
return Token.LPAREN;
case ')':
eat();
return Token.RPAREN;
case '{':
eat();
return Token.LBRACE;
case '}':
eat();
return Token.RBRACE;
case '[':
eat();
return Token.LBRACK;
case ']':
eat();
return Token.RBRACK;
case ';':
eat();
return Token.SEMI;
case ',':
eat();
return Token.COMMA;
case '@':
eat();
return Token.AT; // what about frac, etc.?
case '0':
{
readFrom();
eat();
switch (ch) {
case 'x':
case 'X':
eat();
return hexLiteral();
case 'b':
case 'B':
eat();
return boolLiteral();
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '_':
return octalLiteral();
case '.':
eat();
return floatLiteral();
case 'f':
case 'F':
eat();
return Token.FLOAT_LITERAL;
case 'd':
case 'D':
eat();
return Token.DOUBLE_LITERAL;
case 'l':
case 'L':
eat();
return Token.LONG_LITERAL;
default:
return Token.INT_LITERAL;
}
}
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
readFrom();
return decimalLiteral();
case '.':
{
readFrom();
eat();
switch (ch) {
case '.':
{
eat();
if (ch == '.') {
eat();
return Token.ELLIPSIS;
} else {
throw inputError();
}
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return floatLiteral();
default:
return Token.DOT;
}
}
case '\'':
{
eat();
char value;
switch (ch) {
case '\\':
eat();
value = escape();
break;
case '\'':
throw error(ErrorKind.EMPTY_CHARACTER_LITERAL);
default:
value = (char) ch;
eat();
}
if (ch == '\'') {
saveValue(String.valueOf(value));
eat();
return Token.CHAR_LITERAL;
}
throw error(ErrorKind.UNTERMINATED_CHARACTER_LITERAL);
}
case '"':
{
eat();
if (ch == '"') {
eat();
if (ch != '"') {
saveValue("");
return Token.STRING_LITERAL;
}
eat();
return textBlock();
}
readFrom();
StringBuilder sb = new StringBuilder();
STRING:
while (true) {
switch (ch) {
case '\\':
eat();
sb.append(escape());
continue STRING;
case '"':
saveValue(sb.toString());
eat();
return Token.STRING_LITERAL;
case '\n':
throw error(ErrorKind.UNTERMINATED_STRING);
case ASCII_SUB:
if (reader.done()) {
return Token.EOF;
}
// falls through
default:
sb.appendCodePoint(ch);
eat();
continue STRING;
}
}
}
default:
if (Character.isJavaIdentifierStart(ch)) {
// TODO(cushon): the style guide disallows non-ascii identifiers
return identifier();
}
throw inputError();
}
}
}
private Token textBlock() {
OUTER:
while (true) {
switch (ch) {
case ' ':
case '\r':
case '\t':
eat();
break;
default:
break OUTER;
}
}
switch (ch) {
case '\r':
eat();
if (ch == '\n') {
eat();
}
break;
case '\n':
eat();
break;
default:
throw inputError();
}
readFrom();
StringBuilder sb = new StringBuilder();
while (true) {
switch (ch) {
case '"':
eat();
if (ch != '"') {
sb.append("\"");
continue;
}
eat();
if (ch != '"') {
sb.append("\"\"");
continue;
}
eat();
String value = sb.toString();
value = stripIndent(value);
value = translateEscapes(value);
saveValue(value);
return Token.STRING_LITERAL;
case ASCII_SUB:
if (reader.done()) {
return Token.EOF;
}
// falls through
default:
sb.appendCodePoint(ch);
eat();
continue;
}
}
}
static String stripIndent(String value) {
if (value.isEmpty()) {
return value;
}
ImmutableList<String> lines = value.lines().collect(toImmutableList());
// the amount of whitespace to strip from the beginning of every line
int strip = Integer.MAX_VALUE;
char last = value.charAt(value.length() - 1);
boolean trailingNewline = last == '\n' || last == '\r';
if (trailingNewline) {
// If the input contains a trailing newline, we have something like:
//
// |String s = """
// | foo
// |""";
//
// Because the final """ is unindented, nothing should be stripped.
strip = 0;
} else {
// find the longest common prefix of whitespace across all non-blank lines
for (int i = 0; i < lines.size(); i++) {
String line = lines.get(i);
int nonWhitespaceStart = nonWhitespaceStart(line);
if (nonWhitespaceStart == line.length()) {
continue;
}
strip = min(strip, nonWhitespaceStart);
}
}
StringBuilder result = new StringBuilder();
boolean first = true;
for (String line : lines) {
if (!first) {
result.append('\n');
}
int end = trailingWhitespaceStart(line);
if (strip <= end) {
result.append(line, strip, end);
}
first = false;
}
if (trailingNewline) {
result.append('\n');
}
return result.toString();
}
private static int nonWhitespaceStart(String value) {
int i = 0;
while (i < value.length() && Character.isWhitespace(value.charAt(i))) {
i++;
}
return i;
}
private static int trailingWhitespaceStart(String value) {
int i = value.length() - 1;
while (i >= 0 && Character.isWhitespace(value.charAt(i))) {
i--;
}
return i + 1;
}
private static String translateEscapes(String value) {
StreamLexer lexer =
new StreamLexer(new UnicodeEscapePreprocessor(new SourceFile(null, value + ASCII_SUB)));
return lexer.translateEscapes();
}
private String translateEscapes() {
readFrom();
StringBuilder sb = new StringBuilder();
OUTER:
while (true) {
switch (ch) {
case '\\':
eat();
sb.append(escape());
continue;
case ASCII_SUB:
break OUTER;
default:
sb.appendCodePoint(ch);
eat();
continue;
}
}
return sb.toString();
}
private char escape() {
boolean zeroToThree = false;
switch (ch) {
case 'b':
eat();
return '\b';
case 't':
eat();
return '\t';
case 'n':
eat();
return '\n';
case 'f':
eat();
return '\f';
case 'r':
eat();
return '\r';
case '"':
eat();
return '\"';
case '\'':
eat();
return '\'';
case '\\':
eat();
return '\\';
case '0':
case '1':
case '2':
case '3':
zeroToThree = true;
// falls through
case '4':
case '5':
case '6':
case '7':
{
char value = (char) (ch - '0');
eat();
switch (ch) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
{
value = (char) ((value << 3) | (ch - '0'));
eat();
if (zeroToThree) {
switch (ch) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
value = (char) ((value << 3) | (ch - '0'));
eat();
return value;
default:
return value;
}
}
}
// fall through
default:
return value;
}
}
default:
throw inputError();
}
}
private Token decimalLiteral() {
readDigits();
switch (ch) {
case 'e':
case 'E':
return floatLiteral();
case '.':
eat();
return floatLiteral();
case 'f':
case 'F':
eat();
return Token.FLOAT_LITERAL;
case 'd':
case 'D':
eat();
return Token.DOUBLE_LITERAL;
case 'l':
case 'L':
eat();
return Token.LONG_LITERAL;
default:
return Token.INT_LITERAL;
}
}
private Token hexFloatLiteral() {
readHexDigits();
switch (ch) {
case 'p':
case 'P':
eat();
signedInteger();
break;
default: // fall out
}
return floatTypeSuffix();
}
private Token floatLiteral() {
if ('0' <= ch && ch <= '9') {
readDigits();
}
switch (ch) {
case 'e':
case 'E':
eat();
signedInteger();
break;
default: // fall out
}
return floatTypeSuffix();
}
private Token floatTypeSuffix() {
switch (ch) {
case 'd':
case 'D':
eat();
return Token.DOUBLE_LITERAL;
case 'f':
case 'F':
eat();
return Token.FLOAT_LITERAL;
default:
return Token.DOUBLE_LITERAL;
}
}
private void signedInteger() {
switch (ch) {
case '-':
case '+':
eat();
break;
default:
break;
}
readDigits();
}
private void readHexDigits() {
switch (ch) {
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
eat();
break;
default:
throw inputError();
}
OUTER:
while (true) {
switch (ch) {
case '_':
{
do {
eat();
} while (ch == '_');
switch (ch) {
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
continue OUTER;
default:
throw inputError();
}
}
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
eat();
break;
default:
return;
}
}
}
private void readDigits() {
if ('0' <= ch && ch <= '9') {
eat();
} else {
throw inputError();
}
OUTER:
while (true) {
switch (ch) {
case '_':
do {
eat();
} while (ch == '_');
if ('0' <= ch && ch <= '9') {
continue OUTER;
} else {
throw inputError();
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
eat();
continue OUTER;
default:
return;
}
}
}
private Token boolLiteral() {
readBinaryDigits();
switch (ch) {
case 'l':
case 'L':
eat();
return Token.LONG_LITERAL;
default:
return Token.INT_LITERAL;
}
}
private void readBinaryDigits() {
switch (ch) {
case '0':
case '1':
eat();
break;
default:
throw inputError();
}
OUTER:
while (true) {
switch (ch) {
case '_':
do {
eat();
} while (ch == '_');
switch (ch) {
case '0':
case '1':
continue OUTER;
default:
throw inputError();
}
case '0':
case '1':
eat();
continue OUTER;
default:
return;
}
}
}
private Token octalLiteral() {
readOctalDigits();
switch (ch) {
case 'l':
case 'L':
eat();
return Token.LONG_LITERAL;
default:
return Token.INT_LITERAL;
}
}
private void readOctalDigits() {
switch (ch) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '_':
eat();
break;
default:
throw inputError();
}
OUTER:
while (true) {
switch (ch) {
case '_':
do {
eat();
} while (ch == '_');
switch (ch) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
continue OUTER;
default:
throw inputError();
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
eat();
continue OUTER;
default:
return;
}
}
}
private Token hexLiteral() {
readHexDigits();
switch (ch) {
case '.':
eat();
return hexFloatLiteral();
case 'l':
case 'L':
eat();
return Token.LONG_LITERAL;
case 'p':
case 'P':
eat();
signedInteger();
return floatTypeSuffix();
default:
return Token.INT_LITERAL;
}
}
private Token operator() {
switch (ch) {
case '=':
eat();
if (ch == '=') {
eat();
return Token.EQ;
} else {
return Token.ASSIGN;
}
case '>':
eat();
switch (ch) {
case '=':
eat();
return Token.GTE;
case '>':
eat();
switch (ch) {
case '>':
eat();
if (ch == '=') {
eat();
return Token.GTGTGTE;
} else {
return Token.GTGTGT;
}
case '=':
eat();
return Token.GTGTE;
default:
return Token.GTGT;
}
default:
return Token.GT;
}
case '<':
eat();
switch (ch) {
case '=':
eat();
return Token.LTE;
case '<':
eat();
if (ch == '=') {
eat();
return Token.LTLTE;
} else {
return Token.LTLT;
}
default:
return Token.LT;
}
case '!':
eat();
if (ch == '=') {
eat();
return Token.NOTEQ;
} else {
return Token.NOT;
}
case '~':
eat();
return Token.TILDE;
case '?':
eat();
return Token.COND;
case ':':
eat();
if (ch == ':') {
eat();
return Token.COLONCOLON;
} else {
return Token.COLON;
}
case '-':
eat();
switch (ch) {
case '>':
eat();
return Token.ARROW;
case '-':
eat();
return Token.DECR;
case '=':
eat();
return Token.MINUSEQ;
default:
return Token.MINUS;
}
case '&':
eat();
switch (ch) {
case '&':
eat();
return Token.ANDAND;
case '=':
eat();
return Token.ANDEQ;
default:
return Token.AND;
}
case '|':
eat();
switch (ch) {
case '=':
eat();
return Token.OREQ;
case '|':
eat();
return Token.OROR;
default:
return Token.OR;
}
case '+':
eat();
switch (ch) {
case '+':
eat();
return Token.INCR;
case '=':
eat();
return Token.PLUSEQ;
default:
return Token.PLUS;
}
case '*':
eat();
if (ch == '=') {
eat();
return Token.MULTEQ;
} else {
return Token.MULT;
}
case '/':
// handled with comments
throw inputError();
case '%':
eat();
if (ch == '=') {
eat();
return Token.MODEQ;
} else {
return Token.MOD;
}
case '^':
eat();
if (ch == '=') {
eat();
return Token.XOREQ;
} else {
return Token.XOR;
}
default:
throw inputError();
}
}
private Token identifier() {
readFrom();
eat();
// TODO(cushon): the style guide disallows non-ascii identifiers
while (Character.isJavaIdentifierPart(ch)) {
if (ch == ASCII_SUB && reader.done()) {
break;
}
eat();
}
return makeIdent(stringValue());
}
private static Token makeIdent(String s) {
switch (s) {
case "abstract":
return Token.ABSTRACT;
case "assert":
return Token.ASSERT;
case "boolean":
return Token.BOOLEAN;
case "break":
return Token.BREAK;
case "byte":
return Token.BYTE;
case "case":
return Token.CASE;
case "catch":
return Token.CATCH;
case "char":
return Token.CHAR;
case "class":
return Token.CLASS;
case "const":
return Token.CONST;
case "continue":
return Token.CONTINUE;
case "default":
return Token.DEFAULT;
case "do":
return Token.DO;
case "double":
return Token.DOUBLE;
case "else":
return Token.ELSE;
case "enum":
return Token.ENUM;
case "extends":
return Token.EXTENDS;
case "final":
return Token.FINAL;
case "finally":
return Token.FINALLY;
case "float":
return Token.FLOAT;
case "for":
return Token.FOR;
case "goto":
return Token.GOTO;
case "if":
return Token.IF;
case "implements":
return Token.IMPLEMENTS;
case "import":
return Token.IMPORT;
case "instanceof":
return Token.INSTANCEOF;
case "int":
return Token.INT;
case "interface":
return Token.INTERFACE;
case "long":
return Token.LONG;
case "native":
return Token.NATIVE;
case "new":
return Token.NEW;
case "package":
return Token.PACKAGE;
case "private":
return Token.PRIVATE;
case "protected":
return Token.PROTECTED;
case "public":
return Token.PUBLIC;
case "return":
return Token.RETURN;
case "short":
return Token.SHORT;
case "static":
return Token.STATIC;
case "strictfp":
return Token.STRICTFP;
case "super":
return Token.SUPER;
case "switch":
return Token.SWITCH;
case "synchronized":
return Token.SYNCHRONIZED;
case "this":
return Token.THIS;
case "throw":
return Token.THROW;
case "throws":
return Token.THROWS;
case "transient":
return Token.TRANSIENT;
case "try":
return Token.TRY;
case "void":
return Token.VOID;
case "volatile":
return Token.VOLATILE;
case "while":
return Token.WHILE;
case "true":
return Token.TRUE;
case "false":
return Token.FALSE;
case "null":
return Token.NULL;
default:
return Token.IDENT;
}
}
private TurbineError inputError() {
return error(
ErrorKind.UNEXPECTED_INPUT,
Character.isBmpCodePoint(ch) ? Character.toString((char) ch) : String.format("U+%X", ch));
}
private TurbineError error(ErrorKind kind, Object... args) {
return TurbineError.format(reader.source(), reader.position(), kind, args);
}
}