// blob: 4ed6435982c7f4ed82cd35f4835a723a647c8a1a [file] [log] [blame]
/*
* Copyright 2000-2013 JetBrains s.r.o.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.jetbrains.python.lexer;
import com.intellij.lexer.LexerBase;
import com.intellij.openapi.diagnostic.Logger;
import com.intellij.openapi.util.text.StringUtil;
import com.intellij.psi.StringEscapesTokenTypes;
import com.intellij.psi.tree.IElementType;
import com.jetbrains.python.PyTokenTypes;
import org.jetbrains.annotations.NotNull;
/**
* Specialized lexer for string literals. To be used as a layer in a LayeredLexer.
* Mostly handles escapes, differently in byte / unicode / raw strings.
* Snatched from com.intellij.lexer.StringLiteralLexer and may inherit from it in the future.
* Lexes the entire string, with u/b/r designator, quotes, and content, thus self-adjusts for the format.
* User: dcheryasov
* Date: May 13, 2009 7:35:59 PM
*/
public class PyStringLiteralLexer extends LexerBase {
  private static final Logger LOG = Logger.getInstance("#com.jetbrains.python.lexer.PyStringLiteralLexer");

  /** The initial state; may last during 'u' and 'r' prefixes. */
  private static final short BEFORE_FIRST_QUOTE = 0;
  /** The opening quote (one or three quote chars) has been passed; string content is being lexed. */
  private static final short AFTER_FIRST_QUOTE = 1;
  /** The closing quote has been passed, or the buffer is exhausted. */
  private static final short AFTER_LAST_QUOTE = 2;

  private CharSequence myBuffer;
  /** Start offset of the current token. */
  private int myStart;
  /** End offset (exclusive) of the current token. */
  private int myEnd;
  /** State reached after locating the current token. */
  private int myState;
  /** State as it was before the current token was located; this is what {@link #getState()} reports. */
  private int myLastState;
  /** End offset (exclusive) of the range being lexed. */
  private int myBufferEnd;
  /** The quote character of the literal, either {@code '} or {@code "}. */
  private char myQuoteChar;
  /** True if the literal carries an 'r' / 'R' prefix. */
  private boolean myIsRaw;
  /** True if the literal is delimited by three quote characters. */
  private boolean myIsTriple;
  private final IElementType myOriginalLiteralToken;
  /** True while every token seen so far has been a backslash-escaped space. */
  private boolean mySeenEscapedSpacesOnly;

  /**
   * @param originalLiteralToken the AST node we're layering over.
   */
  public PyStringLiteralLexer(final IElementType originalLiteralToken) {
    myOriginalLiteralToken = originalLiteralToken;
  }

  /**
   * Starts lexing a string literal located at {@code [startOffset, endOffset)} in {@code buffer}.
   * Detects the prefix (encoding and raw designators), the quote character and whether the
   * literal is triple-quoted, then locates the first token.
   *
   * @param buffer       text containing the literal
   * @param startOffset  offset of the first character of the literal (prefix included)
   * @param endOffset    offset just past the last character of the literal
   * @param initialState state to start from
   */
  public void start(@NotNull CharSequence buffer, int startOffset, int endOffset, int initialState) {
    myBuffer = buffer;
    myStart = startOffset;
    myState = initialState;
    myLastState = initialState;
    mySeenEscapedSpacesOnly = true;
    myBufferEnd = endOffset;
    // Reset explicitly: start() may be invoked again on the same instance for a different
    // literal, and a stale "raw" flag from the previous one would suppress escape handling.
    myIsRaw = false;

    // the following could be parsing steps if we wanted this info as tokens
    int i = myStart;
    // Two rounds of "encoding prefix, then raw prefix" cover both orders of a two-letter prefix.
    for (int round = 0; round < 2; round++) {
      i = skipEncodingPrefix(buffer, i);
      final int afterRaw = skipRawPrefix(buffer, i);
      if (afterRaw > i) {
        myIsRaw = true;
      }
      i = afterRaw;
    }

    // which quote char?
    final char c = buffer.charAt(i);
    assert (c == '"') || (c == '\'') : "String must be quoted by single or double quote. Found '" + c + "' in string " + buffer;
    myQuoteChar = c;
    // Bounded by endOffset rather than buffer.length(): characters past the lexed range
    // do not belong to this literal and must not influence triple-quote detection.
    myIsTriple = (endOffset > i + 2) && (buffer.charAt(i + 1) == c) && (buffer.charAt(i + 2) == c);

    // calculate myEnd at last
    myEnd = locateToken(myStart);
  }

  /**
   * Skips a single raw-string designator ('r' or 'R') at {@code startOffset}, if present.
   *
   * @param text        text to inspect
   * @param startOffset offset to look at
   * @return {@code startOffset + 1} if the character there is a raw designator, otherwise
   *         {@code startOffset}; also {@code startOffset} when it is at or past the end of {@code text}
   */
  public static int skipRawPrefix(CharSequence text, int startOffset) {
    if (startOffset >= text.length()) {
      return startOffset; // nothing left to inspect
    }
    final char c = Character.toUpperCase(text.charAt(startOffset));
    if (c == 'R') {
      startOffset++;
    }
    return startOffset;
  }

  /**
   * Skips a single encoding designator ('u', 'b' or 'c', in either case) at {@code startOffset}, if present.
   *
   * @param text        text to inspect
   * @param startOffset offset to look at
   * @return {@code startOffset + 1} if the character there is an encoding designator, otherwise
   *         {@code startOffset}; also {@code startOffset} when it is at or past the end of {@code text}
   */
  public static int skipEncodingPrefix(CharSequence text, int startOffset) {
    if (startOffset >= text.length()) {
      return startOffset; // nothing left to inspect
    }
    final char c = Character.toUpperCase(text.charAt(startOffset));
    if (c == 'U' || c == 'B' || c == 'C') {
      startOffset++;
    }
    return startOffset;
  }

  /**
   * @return the state the lexer was in before the current token was located.
   */
  public int getState() {
    return myLastState;
  }

  /**
   * Classifies the current token.
   *
   * @return {@code null} when there is no current token; the original literal token type for
   *         plain content and for backslash sequences that are not escapes in this kind of
   *         string; otherwise one of the {@link StringEscapesTokenTypes} constants.
   */
  public IElementType getTokenType() {
    if (myStart >= myEnd) {
      return null;
    }

    // skip non-escapes immediately
    if (myBuffer.charAt(myStart) != '\\' || (myIsRaw && (!isUnicodeMode() || !nextIsUnicodeEscape()))) {
      mySeenEscapedSpacesOnly = false;
      return myOriginalLiteralToken;
    }

    // from here on, only escapes
    if (myStart + 1 >= myEnd) {
      return StringEscapesTokenTypes.INVALID_CHARACTER_ESCAPE_TOKEN; // escape ends too early
    }
    final char nextChar = myBuffer.charAt(myStart + 1);
    mySeenEscapedSpacesOnly &= nextChar == ' ';

    // An escaped EOL is always a valid escape; an escaped space only when nothing but
    // escaped spaces came before it, or nothing but escaped spaces follows it.
    if (nextChar == '\n' || (nextChar == ' ' && (mySeenEscapedSpacesOnly || isTrailingSpace(myStart + 2)))) {
      return StringEscapesTokenTypes.VALID_STRING_ESCAPE_TOKEN; // escaped EOL
    }

    if (nextChar == 'u' || nextChar == 'U') {
      if (isUnicodeMode()) {
        final int width = nextChar == 'u' ? 4 : 8; // is it uNNNN or Unnnnnnnn
        for (int i = myStart + 2; i < myStart + width + 2; i++) {
          if (i >= myEnd || !StringUtil.isHexDigit(myBuffer.charAt(i))) {
            return StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN;
          }
        }
        return StringEscapesTokenTypes.VALID_STRING_ESCAPE_TOKEN;
      }
      return myOriginalLiteralToken; // b"\u1234" is just b"\\u1234", nothing gets escaped
    }

    if (nextChar == 'x') { // \xNN is allowed both in bytes and unicode.
      for (int i = myStart + 2; i < myStart + 4; i++) {
        if (i >= myEnd || !StringUtil.isHexDigit(myBuffer.charAt(i))) {
          return StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN;
        }
      }
      return StringEscapesTokenTypes.VALID_STRING_ESCAPE_TOKEN;
    }

    if (nextChar == 'N' && isUnicodeMode()) { // \N{...}: needs an opening and a closing brace inside the token
      int i = myStart + 2;
      if (i >= myEnd || myBuffer.charAt(i) != '{') {
        return StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN;
      }
      i++;
      while (i < myEnd && myBuffer.charAt(i) != '}') {
        i++;
      }
      if (i >= myEnd) {
        return StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN;
      }
      return StringEscapesTokenTypes.VALID_STRING_ESCAPE_TOKEN;
    }

    switch (nextChar) {
      case 'a':
      case 'b':
      case 'f':
      case 'n':
      case 'r':
      case 't':
      case 'v':
      case '\'':
      case '\"':
      case '\\':
      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
        return StringEscapesTokenTypes.VALID_STRING_ESCAPE_TOKEN;
    }

    // other unrecognized escapes are just part of string, not an error
    return myOriginalLiteralToken;
  }

  /**
   * @return true if the character right after the current token's first character is 'u' or 'U'
   *         and still lies inside the current token.
   */
  private boolean nextIsUnicodeEscape() {
    if (myStart + 1 < myEnd) {
      final char nextChar = myBuffer.charAt(myStart + 1);
      return nextChar == 'u' || nextChar == 'U';
    }
    return false;
  }

  /**
   * @return true if the token type being layered over is one of {@link PyTokenTypes#UNICODE_NODES}.
   */
  private boolean isUnicodeMode() {
    return PyTokenTypes.UNICODE_NODES.contains(myOriginalLiteralToken);
  }

  /**
   * Checks that all subsequent chars are escaped spaces.
   *
   * @param start offset to start checking from
   * @return true if everything from {@code start} up to the end of the lexed range consists of
   *         backslash-space pairs (vacuously true when {@code start} is at or past that end)
   */
  private boolean isTrailingSpace(final int start) {
    for (int i = start; i < myBufferEnd; i += 2) {
      final char c = myBuffer.charAt(i);
      if (c != '\\') {
        return false;
      }
      if (i == myBufferEnd - 1) {
        return false; // a lone backslash at the very end
      }
      if (myBuffer.charAt(i + 1) != ' ') {
        return false;
      }
    }
    return true;
  }

  /**
   * @return true if the current token range is non-empty, or is empty exactly at the end of the lexed range.
   */
  private boolean isTokenRangeConsistent() {
    return myStart < myEnd || (myStart == myEnd && myEnd == myBufferEnd);
  }

  /**
   * @return start offset of the current token.
   */
  public int getTokenStart() {
    assert isTokenRangeConsistent();
    return myStart;
  }

  /**
   * @return end offset (exclusive) of the current token; an inconsistent token range is reported to the log.
   */
  public int getTokenEnd() {
    if (!isTokenRangeConsistent()) {
      LOG.error("myStart=" + myStart + " myEnd="+ myEnd + " myBufferEnd=" + myBufferEnd + " text=" + myBuffer.subSequence(myStart, myBufferEnd));
    }
    return myEnd;
  }

  /**
   * @return true if {@code c} is one of the digits '0'..'7'.
   */
  private static boolean isOctalDigit(char c) {
    return c >= '0' && c <= '7';
  }

  /**
   * Finds the end of the token that begins at {@code start}, updating {@link #myState}
   * as the opening and closing quotes are passed.
   * A token is either a single backslash sequence or a run of characters up to the next
   * backslash (in non-raw strings), the closing quote, or the end of the lexed range.
   *
   * @param start offset where the token begins
   * @return offset just past the token; equal to {@code start} when lexing is exhausted
   */
  private int locateToken(int start) {
    if (start == myBufferEnd) {
      myState = AFTER_LAST_QUOTE;
    }
    if (myState == AFTER_LAST_QUOTE) {
      return start; // exhausted
    }

    int i = start;
    if (myBuffer.charAt(i) == '\\') {
      LOG.assertTrue(myState == AFTER_FIRST_QUOTE);
      i++;
      if (myIsRaw) {
        return i; // in a raw string the backslash stands alone
      }
      if (i == myBufferEnd) {
        myState = AFTER_LAST_QUOTE;
        return i;
      }

      final char escaped = myBuffer.charAt(i);

      // is octal? up to three digits, the third one only if the first is at most '3'
      if (isOctalDigit(escaped)) {
        i++;
        if (i < myBufferEnd && isOctalDigit(myBuffer.charAt(i))) {
          i++;
          if (i < myBufferEnd && escaped <= '3' && isOctalDigit(myBuffer.charAt(i))) {
            i++;
          }
        }
        return i;
      }

      // \xNN byte escape
      if (escaped == 'x') {
        i++;
        for (; i < start + 4; i++) {
          if (i == myBufferEnd || myBuffer.charAt(i) == '\n' || myBuffer.charAt(i) == myQuoteChar) {
            return i; // cut short; validity is judged in getTokenType()
          }
        }
        return i;
      }

      // unicode escape
      if (escaped == 'u' || escaped == 'U') {
        final int width = escaped == 'u' ? 4 : 8; // is it uNNNN or Unnnnnnnn
        i++;
        for (; i < start + width + 2; i++) {
          if (i == myBufferEnd || myBuffer.charAt(i) == '\n' || myBuffer.charAt(i) == myQuoteChar) {
            return i; // cut short; validity is judged in getTokenType()
          }
        }
        return i;
      }

      // \N{...}: runs through the closing brace, or to the end of the range if there is none
      if (escaped == 'N' && isUnicodeMode()) {
        i++;
        while (i < myBufferEnd && myBuffer.charAt(i) != '}') {
          i++;
        }
        if (i < myBufferEnd) {
          i++;
        }
        return i;
      }

      // any other backslash + single char pair
      return i + 1;
    }
    else { // not a \something
      final int quoteLimit = myIsTriple ? 3 : 1;
      int quoteCount = 0; // count consequent quotes
      while (i < myBufferEnd) { // scan to next \something
        final char c = myBuffer.charAt(i);
        if (c == '\\' && !myIsRaw) {
          return i;
        }
        if (myState == BEFORE_FIRST_QUOTE && c == myQuoteChar) {
          quoteCount += 1;
          if (quoteCount == quoteLimit) {
            myState = AFTER_FIRST_QUOTE;
            quoteCount = 0; // for last quote detection in the same pass
          }
        }
        else if (myState == AFTER_FIRST_QUOTE && c == myQuoteChar && (!myIsRaw || myBuffer.charAt(i - 1) != '\\')) { // done?
          quoteCount += 1;
          if (quoteCount == quoteLimit) {
            myState = AFTER_LAST_QUOTE;
            return i + 1; // skip the last remaining quote
          }
        }
        else { // not an escape, not a quote
          quoteCount = 0;
        }
        i++;
      }
    }
    return i;
  }

  /**
   * Moves to the next token: the current token's end becomes the new start and the new end is located.
   */
  public void advance() {
    myLastState = myState;
    myStart = myEnd;
    myEnd = locateToken(myStart);
    // An inconsistent range is only logged here; it is not asserted.
    if (!isTokenRangeConsistent()) {
      LOG.warn("Inconsistent: start " + myStart + ", end " + myEnd + ", buf end " + myBufferEnd);
    }
  }

  /**
   * @return the buffer passed to {@link #start(CharSequence, int, int, int)}.
   */
  @NotNull
  public CharSequence getBufferSequence() {
    return myBuffer;
  }

  /**
   * @return the end offset passed to {@link #start(CharSequence, int, int, int)}.
   */
  public int getBufferEnd() {
    return myBufferEnd;
  }
}