| /* |
| * Copyright 2000-2013 JetBrains s.r.o. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package com.jetbrains.python.lexer; |
| |
| import com.intellij.lexer.LexerBase; |
| import com.intellij.openapi.diagnostic.Logger; |
| import com.intellij.openapi.util.text.StringUtil; |
| import com.intellij.psi.StringEscapesTokenTypes; |
| import com.intellij.psi.tree.IElementType; |
| import com.jetbrains.python.PyTokenTypes; |
| import org.jetbrains.annotations.NotNull; |
| |
| /** |
| * Specialized lexer for string literals. To be used as a layer in a LayeredLexer. |
| * Mostly handles escapes, differently in byte / unicode / raw strings. |
| * Snatched from com.intellij.lexer.StringLiteralLexer and may inherit from it in the future. |
| * Lexes the entire string, with u/b/r designator, quotes, and content, thus self-adjusts for the format. |
| * User: dcheryasov |
| * Date: May 13, 2009 7:35:59 PM |
| */ |
| public class PyStringLiteralLexer extends LexerBase { |
| private static final Logger LOG = Logger.getInstance("#com.jetbrains.python.lexer.PyStringLiteralLexer"); |
| |
| private static final short BEFORE_FIRST_QUOTE = 0; // the initial state; may last during 'u' and 'r' prefixes. |
| private static final short AFTER_FIRST_QUOTE = 1; |
| private static final short AFTER_LAST_QUOTE = 2; |
| |
| private CharSequence myBuffer; |
| private int myStart; |
| private int myEnd; |
| private int myState; |
| private int myLastState; |
| private int myBufferEnd; |
| private char myQuoteChar; |
| |
| private boolean myIsRaw; |
| private boolean myIsTriple; |
| private final IElementType myOriginalLiteralToken; |
| private boolean mySeenEscapedSpacesOnly; |
| |
| |
| /** |
| * @param originalLiteralToken the AST node we're layering over. |
| */ |
| public PyStringLiteralLexer(final IElementType originalLiteralToken) { |
| myOriginalLiteralToken = originalLiteralToken; |
| } |
| |
| public void start(@NotNull CharSequence buffer, int startOffset, int endOffset, int initialState) { |
| myBuffer = buffer; |
| myStart = startOffset; |
| myState = initialState; |
| myLastState = initialState; |
| mySeenEscapedSpacesOnly = true; |
| myBufferEnd = endOffset; |
| |
| // the following could be parsing steps if we wanted this info as tokens |
| int i = myStart; |
| |
| i = skipEncodingPrefix(buffer, i); |
| int offset = skipRawPrefix(buffer, i); |
| if (offset > i) myIsRaw = true; |
| i = offset; |
| i = skipEncodingPrefix(buffer, i); |
| offset = skipRawPrefix(buffer, i); |
| if (offset > i) myIsRaw = true; |
| i = offset; |
| |
| // which quote char? |
| char c = buffer.charAt(i); |
| assert (c == '"') || (c == '\'') : "String must be quoted by single or double quote. Found '" + c + "' in string " + buffer; |
| myQuoteChar = c; |
| |
| myIsTriple = (buffer.length() > i + 2) && (buffer.charAt(i + 1) == c) && (buffer.charAt(i + 2) == c); |
| |
| // calculate myEnd at last |
| myEnd = locateToken(myStart); |
| } |
| |
| public static int skipRawPrefix(CharSequence text, int startOffset) { |
| char c = Character.toUpperCase(text.charAt(startOffset)); |
| if (c == 'R') { |
| startOffset++; |
| } |
| return startOffset; |
| } |
| |
| public static int skipEncodingPrefix(CharSequence text, int startOffset) { |
| char c = Character.toUpperCase(text.charAt(startOffset)); |
| if (c == 'U' || c == 'B' || c == 'C') { |
| startOffset++; |
| } |
| return startOffset; |
| } |
| |
| public int getState() { |
| return myLastState; |
| } |
| |
| public IElementType getTokenType() { |
| if (myStart >= myEnd) return null; |
| |
| // skip non-escapes immediately |
| if (myBuffer.charAt(myStart) != '\\' || (myIsRaw && (!isUnicodeMode() || !nextIsUnicodeEscape()))) { |
| mySeenEscapedSpacesOnly = false; |
| return myOriginalLiteralToken; |
| } |
| |
| // from here on, only escapes |
| if (myStart + 1 >= myEnd) return StringEscapesTokenTypes.INVALID_CHARACTER_ESCAPE_TOKEN; // escape ends too early |
| char nextChar = myBuffer.charAt(myStart + 1); |
| mySeenEscapedSpacesOnly &= nextChar == ' '; |
| if ((nextChar == '\n' || nextChar == ' ' && (mySeenEscapedSpacesOnly || isTrailingSpace(myStart+2)))) { |
| return StringEscapesTokenTypes.VALID_STRING_ESCAPE_TOKEN; // escaped EOL |
| } |
| if (nextChar == 'u' || nextChar == 'U') { |
| if (isUnicodeMode()) { |
| final int width = nextChar == 'u'? 4 : 8; // is it uNNNN or Unnnnnnnn |
| for(int i = myStart + 2; i < myStart + width + 2; i++) { |
| if (i >= myEnd || !StringUtil.isHexDigit(myBuffer.charAt(i))) return StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN; |
| } |
| return StringEscapesTokenTypes.VALID_STRING_ESCAPE_TOKEN; |
| } |
| else return myOriginalLiteralToken; // b"\u1234" is just b"\\u1234", nothing gets escaped |
| } |
| |
| if (nextChar == 'x') { // \xNN is allowed both in bytes and unicode. |
| for(int i = myStart + 2; i < myStart + 4; i++) { |
| if (i >= myEnd || !StringUtil.isHexDigit(myBuffer.charAt(i))) return StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN; |
| } |
| return StringEscapesTokenTypes.VALID_STRING_ESCAPE_TOKEN; |
| } |
| |
| if (nextChar == 'N' && isUnicodeMode()) { |
| int i = myStart+2; |
| if (i >= myEnd || myBuffer.charAt(i) != '{') return StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN; |
| i++; |
| while(i < myEnd && myBuffer.charAt(i) != '}') i++; |
| if (i >= myEnd) return StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN; |
| return StringEscapesTokenTypes.VALID_STRING_ESCAPE_TOKEN; |
| } |
| |
| switch (nextChar) { |
| case 'a': |
| case 'b': |
| case 'f': |
| case 'n': |
| case 'r': |
| case 't': |
| case 'v': |
| case '\'': |
| case '\"': |
| case '\\': |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': return StringEscapesTokenTypes.VALID_STRING_ESCAPE_TOKEN; |
| } |
| |
| // other unrecognized escapes are just part of string, not an error |
| return myOriginalLiteralToken; |
| } |
| |
| private boolean nextIsUnicodeEscape() { |
| if (myStart + 1 < myEnd) { |
| char nextChar = myBuffer.charAt(myStart + 1); |
| return nextChar == 'u' || nextChar == 'U'; |
| } |
| return false; |
| } |
| |
| private boolean isUnicodeMode() { |
| return PyTokenTypes.UNICODE_NODES.contains(myOriginalLiteralToken); |
| } |
| |
| // all subsequent chars are escaped spaces |
| private boolean isTrailingSpace(final int start) { |
| for (int i=start; i<myBufferEnd; i+=2) { |
| final char c = myBuffer.charAt(i); |
| if (c != '\\') return false; |
| if (i == myBufferEnd-1) return false; |
| if (myBuffer.charAt(i+1) != ' ') return false; |
| } |
| return true; |
| } |
| |
| public int getTokenStart() { |
| assert myStart < myEnd || (myStart == myEnd && myEnd == myBufferEnd); |
| return myStart; |
| } |
| |
| public int getTokenEnd() { |
| if (!(myStart < myEnd || (myStart == myEnd && myEnd == myBufferEnd))) { |
| LOG.error("myStart=" + myStart + " myEnd="+ myEnd + " myBufferEnd=" + myBufferEnd + " text=" + myBuffer.subSequence(myStart, myBufferEnd)); |
| } |
| return myEnd; |
| } |
| |
| private int locateToken(int start) { |
| if (start == myBufferEnd) { |
| myState = AFTER_LAST_QUOTE; |
| } |
| if (myState == AFTER_LAST_QUOTE) return start; // exhausted |
| int i = start; |
| if (myBuffer.charAt(i) == '\\') { |
| LOG.assertTrue(myState == AFTER_FIRST_QUOTE); |
| i++; |
| if (myIsRaw) return i; |
| if (i == myBufferEnd) { |
| myState = AFTER_LAST_QUOTE; |
| return i; |
| } |
| |
| // is octal? |
| if (myBuffer.charAt(i) >= '0' && myBuffer.charAt(i) <= '7') { |
| char first = myBuffer.charAt(i); |
| i++; |
| if (i < myBufferEnd && myBuffer.charAt(i) >= '0' && myBuffer.charAt(i) <= '7') { |
| i++; |
| if (i < myBufferEnd && first <= '3' && myBuffer.charAt(i) >= '0' && myBuffer.charAt(i) <= '7') { |
| i++; |
| } |
| } |
| return i; |
| } |
| |
| // \xNN byte escape |
| if (myBuffer.charAt(i) == 'x') { |
| i++; |
| for (; i < start + 4; i++) { |
| if (i == myBufferEnd || myBuffer.charAt(i) == '\n' || myBuffer.charAt(i) == myQuoteChar) { |
| return i; |
| } |
| } |
| return i; |
| } |
| |
| // unicode escape |
| if (myBuffer.charAt(i) == 'u' || myBuffer.charAt(i) == 'U') { |
| final int width = myBuffer.charAt(i) == 'u'? 4 : 8; // is it uNNNN or Unnnnnnnn |
| i++; |
| for (; i < start + width + 2; i++) { |
| if (i == myBufferEnd || myBuffer.charAt(i) == '\n' || myBuffer.charAt(i) == myQuoteChar) { |
| return i; |
| } |
| } |
| return i; |
| } |
| |
| if (myBuffer.charAt(i) == 'N' && isUnicodeMode()) { |
| i++; |
| while(i < myBufferEnd && myBuffer.charAt(i) != '}') { |
| i++; |
| } |
| if (i < myBufferEnd) { |
| i++; |
| } |
| return i; |
| } |
| |
| else { |
| return i + 1; |
| } |
| } |
| else { // not a \something |
| //LOG.assertTrue(myState == AFTER_FIRST_QUOTE || myBuffer.charAt(i) == myQuoteChar); |
| final int quote_limit = myIsTriple ? 3 : 1; |
| int qcnt = 0; // count consequent quotes |
| while (i < myBufferEnd) { // scan to next \something |
| if (myBuffer.charAt(i) == '\\' && !myIsRaw) { |
| return i; |
| } |
| if (myState == BEFORE_FIRST_QUOTE && myBuffer.charAt(i) == myQuoteChar) { |
| qcnt += 1; |
| if (qcnt == quote_limit) { |
| myState = AFTER_FIRST_QUOTE; |
| qcnt = 0; // for last quote detection in the same pass |
| } |
| } |
| else if (myState == AFTER_FIRST_QUOTE && myBuffer.charAt(i) == myQuoteChar && (!myIsRaw || myBuffer.charAt(i-1) != '\\')) { // done? |
| qcnt += 1; |
| if (qcnt == quote_limit) { |
| myState = AFTER_LAST_QUOTE; |
| return i + 1; // skip the last remaining quote |
| } |
| } |
| else { // not an escape, not a quote |
| qcnt = 0; |
| } |
| i++; |
| } |
| } |
| |
| return i; |
| } |
| |
| public void advance() { |
| myLastState = myState; |
| myStart = myEnd; |
| myEnd = locateToken(myStart); |
| if (! (myStart < myEnd || (myStart == myEnd && myEnd == myBufferEnd))) { |
| LOG.warn("Inconsistent: start " + myStart + ", end " + myEnd + ", buf end " + myBufferEnd); |
| } |
| //assert myStart < myEnd || (myStart == myEnd && myEnd == myBufferEnd) : "Inconsistent: start " + myStart + ", end " + myEnd + ", buf end " + myBufferEnd; |
| } |
| |
| @NotNull |
| public CharSequence getBufferSequence() { |
| return myBuffer; |
| } |
| |
| public int getBufferEnd() { |
| return myBufferEnd; |
| } |
| } |