/**************************************************************************** | |
** | |
** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). | |
** All rights reserved. | |
** Contact: Nokia Corporation (qt-info@nokia.com) | |
** | |
** This file is part of the QtXmlPatterns module of the Qt Toolkit. | |
** | |
** $QT_BEGIN_LICENSE:LGPL$ | |
** GNU Lesser General Public License Usage | |
** This file may be used under the terms of the GNU Lesser General Public | |
** License version 2.1 as published by the Free Software Foundation and | |
** appearing in the file LICENSE.LGPL included in the packaging of this | |
** file. Please review the following information to ensure the GNU Lesser | |
** General Public License version 2.1 requirements will be met: | |
** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. | |
** | |
** In addition, as a special exception, Nokia gives you certain additional | |
** rights. These rights are described in the Nokia Qt LGPL Exception | |
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. | |
** | |
** GNU General Public License Usage | |
** Alternatively, this file may be used under the terms of the GNU General | |
** Public License version 3.0 as published by the Free Software Foundation | |
** and appearing in the file LICENSE.GPL included in the packaging of this | |
** file. Please review the following information to ensure the GNU General | |
** Public License version 3.0 requirements will be met: | |
** http://www.gnu.org/copyleft/gpl.html. | |
** | |
** Other Usage | |
** Alternatively, this file may be used in accordance with the terms and | |
** conditions contained in a signed written agreement between you and Nokia. | |
** | |
** | |
** | |
** | |
** | |
** $QT_END_LICENSE$ | |
** | |
****************************************************************************/ | |
#include <QByteArray> | |
#include "qquerytransformparser_p.h" | |
#include "qxquerytokenizer_p.h" | |
#include "qtokenlookup.cpp" | |
QT_BEGIN_NAMESPACE | |
namespace QPatternist | |
{ | |
/* Skips whitespace and comments at the current position. On failure
 * (unterminated comment, or end of input) this RETURNS FROM THE CALLING
 * FUNCTION with a token carrying the failure code -- it is a macro
 * precisely because it must be able to return from its caller. */
#define handleWhitespace()                      \
{                                               \
    const TokenType t = consumeWhitespace();    \
    if(t != SUCCESS)                            \
        return Token(t);                        \
}
/* Constructs a tokenizer over @p query. @p location is used for error
 * reporting and must be either valid or empty. Scanning starts at the
 * first character, on line 1, in @p startingState. */
XQueryTokenizer::XQueryTokenizer(const QString &query,
                                 const QUrl &location,
                                 const State startingState) : Tokenizer(location)
                                                            , m_data(query)
                                                            , m_length(query.length())
                                                            , m_state(startingState)
                                                            , m_pos(0)
                                                            , m_line(1)
                                                            , m_columnOffset(0)
                                                            , m_scanOnly(false)
{
    Q_ASSERT(location.isValid() || location.isEmpty());
}
const QChar XQueryTokenizer::current() const
{
    /* The character at the current read position, or a null QChar when
     * the position is at or beyond the end of the input. */
    return m_pos < m_length ? m_data.at(m_pos) : QChar();
}
char XQueryTokenizer::peekCurrent() const | |
{ | |
return current().toAscii(); | |
} | |
int XQueryTokenizer::peekForColonColon() const
{
    /* Looks ahead, skipping only whitespace, for the axis separator "::".
     * Returns the offset from the current position to the first ':' of a
     * "::" pair, or -1 when the next non-whitespace input is not "::".
     *
     * Note, we don't modify m_pos in this function, so we need to do offset
     * calculations. */
    int pos = m_pos;
    while(pos < m_length)
    {
        switch(m_data.at(pos).toAscii())
        {
            /* Fallthrough these four. */
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                break;
            case ':':
            {
                /* A second ':' directly after this one means we found "::". */
                if(peekAhead((pos - m_pos) + 1) == ':')
                    return pos - m_pos;
                /* Fallthrough. */
            }
            default:
                return -1;
        }
        ++pos;
    }
    return -1;
}
Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
                                                      const State s,
                                                      const int advance)
{
    /* Convenience helper: switch to state @p s, move the read position
     * @p advance characters forward and produce a value-less token of
     * type @p code. */
    Q_ASSERT(advance >= 0);
    setState(s);
    m_pos += advance;
    return Token(code);
}
Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
                                                      const QString &value,
                                                      const State s)
{
    /* Convenience helper: switch to state @p s and produce a token of
     * type @p code carrying @p value. The read position is untouched. */
    setState(s);
    const Token result(code, value);
    return result;
}
Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
                                                  const int advance)
{
    /* Convenience helper: move the read position @p advance characters
     * forward and produce a value-less token of type @p code. */
    Q_ASSERT(advance >= 0);
    m_pos += advance;
    const Token result(code);
    return result;
}
QString XQueryTokenizer::normalizeEOL(const QString &input,
                                      const CharacterSkips &characterSkips)
{
    /* Performs XML end-of-line normalization: "\r\n" pairs and lone '\r'
     * characters both become a single '\n'. Positions listed in
     * @p characterSkips are copied through untouched -- the caller uses
     * this for characters that were produced by character references
     * (e.g. "&#13;"), which must not be normalized. */
    const int len = input.count();
    QString result;
    /* The likely hood is rather high it'll be the same content. */
    result.reserve(len);
    for(int i = 0; i < len; ++i)
    {
        const QChar &at = input.at(i);
        if(characterSkips.contains(i))
        {
            result.append(at);
            continue;
        }
        switch(input.at(i).unicode())
        {
            case '\r':
            {
                /* Skip the '\n' of a "\r\n" pair; the '\n' case below then
                 * emits a single '\n' for the whole pair. */
                if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
                    ++i;
                /* Else, fallthrough. */
            }
            case '\n':
            {
                result.append(QLatin1Char('\n'));
                continue;
            }
            default:
            {
                result.append(at);
            }
        }
    }
    return result;
}
Tokenizer::TokenType XQueryTokenizer::consumeComment()
{
    /* Consumes an XQuery comment body up to and including its closing ":)".
     * On entry the opening "(:" has already been consumed. Nested comments
     * are handled by recursing, and line/column bookkeeping is maintained.
     *
     * Below, we return ERROR instead of END_OF_FILE such that the parser
     * sees an invalid comment. */
    while(m_pos < m_length)
    {
        switch(peekCurrent())
        {
            case ':':
            {
                ++m_pos; /* Consume ':' */
                if(atEnd())
                    return ERROR;
                if(peekCurrent() == ')')
                {
                    ++m_pos; /* Consume ')' */
                    return SUCCESS; /* The comment closed nicely. */
                }
                continue; /* We don't want to increment m_pos twice. */
            }
            case '(':
            { /* It looks like the start of a nested comment. */
                ++m_pos;
                if(atEnd())
                    return END_OF_FILE;
                else if(peekCurrent() == ':')
                {
                    /* And it is a nested comment -- parse it. */
                    const TokenType retval = consumeComment();
                    if(retval == SUCCESS)
                        continue; /* Continue with our "own" comment. */
                    else
                        return retval; /* Return the error in the nested comment. */
                }
                break;
            }
            case '\n':
            /* Fallthrough. */
            case '\r':
            {
                /* We want to count \r\n as a single line break. Only merge
                 * when the current character really is '\r' -- previously
                 * two adjacent '\n' characters were also merged, making
                 * "\n\n" count as one line instead of two. */
                if(peekCurrent() == '\r' && peekAhead() == '\n')
                    ++m_pos;
                m_columnOffset = m_pos;
                ++m_line;
                break;
            }
        }
        ++m_pos;
    }
    return ERROR; /* Error: we reached the end while inside a comment. */
}
bool XQueryTokenizer::consumeRawWhitespace() | |
{ | |
while(m_pos < m_length) | |
{ | |
switch(peekCurrent()) | |
{ | |
case ' ': | |
case '\t': | |
break; | |
case '\n': | |
case '\r': | |
{ | |
if(peekAhead() == '\n') | |
++m_pos; | |
m_columnOffset = m_pos; | |
++m_line; | |
break; | |
} | |
default: | |
return false; | |
} | |
++m_pos; | |
} | |
return true; | |
} | |
Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
{
    /* Consumes whitespace and comments. Returns SUCCESS when stopped at a
     * non-whitespace character, END_OF_FILE at the end of the input, or
     * the failure code of a malformed comment. */
    while(m_pos < m_length)
    {
        switch(peekCurrent())
        {
            case ' ':
            case '\t':
                break;
            case '\n':
            /* Fallthrough. */
            case '\r':
            {
                /* We want to count \r\n as a single line break. Only merge
                 * when the current character is '\r'; otherwise "\n\n"
                 * would wrongly count as a single line. */
                if(peekCurrent() == '\r' && peekAhead() == '\n')
                    ++m_pos;
                m_columnOffset = m_pos;
                ++m_line;
                break;
            }
            case '(':
            {
                if(peekAhead() == ':')
                {
                    m_pos += 2; /* Consume "(:" */
                    const TokenType comment = consumeComment();
                    if(comment == SUCCESS)
                        continue;
                    else
                        return comment;
                }
                /* Not a comment start: fallthrough and report success. */
            }
            default:
                return SUCCESS;
        }
        ++m_pos;
    }
    return END_OF_FILE;
}
char XQueryTokenizer::peekAhead(const int length) const | |
{ | |
if(m_pos + length < m_length) | |
return m_data.at(m_pos + length).toAscii(); | |
else | |
return 0; | |
} | |
Tokenizer::Token XQueryTokenizer::error()
{
    /* Produces the generic error token handed to the parser. */
    return Token(ERROR);
}
bool XQueryTokenizer::isDigit(const char ch)
{
    /* True only for the ASCII decimal digits '0' through '9'. */
    switch(ch)
    {
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            return true;
        default:
            return false;
    }
}
/* Replace with function in QXmlUtils. Write test cases for this. */ | |
bool XQueryTokenizer::isNCNameStart(const QChar ch) | |
{ | |
if(ch == QLatin1Char('_')) | |
return true; | |
switch(ch.category()) | |
{ | |
case QChar::Letter_Lowercase: | |
case QChar::Letter_Uppercase: | |
case QChar::Letter_Other: | |
case QChar::Letter_Titlecase: | |
case QChar::Number_Letter: | |
return true; | |
default: | |
return false; | |
} | |
} | |
bool XQueryTokenizer::isNCNameBody(const QChar ch) | |
{ | |
switch(ch.unicode()) | |
{ | |
case '.': | |
case '_': | |
case '-': | |
return true; | |
} | |
switch(ch.category()) | |
{ | |
case QChar::Letter_Lowercase: | |
case QChar::Letter_Uppercase: | |
case QChar::Letter_Other: | |
case QChar::Letter_Titlecase: | |
case QChar::Number_Letter: | |
case QChar::Mark_SpacingCombining: | |
case QChar::Mark_Enclosing: | |
case QChar::Mark_NonSpacing: | |
case QChar::Letter_Modifier: | |
case QChar::Number_DecimalDigit: | |
return true; | |
default: | |
return false; | |
} | |
} | |
bool XQueryTokenizer::isPhraseKeyword(const TokenType code) | |
{ | |
switch(code) | |
{ | |
/* Fallthrough all these. */ | |
case CASTABLE: | |
case CAST: | |
case COPY_NAMESPACES: | |
case DECLARE: | |
case EMPTY: | |
case MODULE: | |
case IMPORT: | |
case INSTANCE: | |
case ORDER: | |
case ORDERING: | |
case XQUERY: | |
case STABLE: | |
case TREAT: | |
return true; | |
default: | |
return false; | |
} | |
} | |
bool XQueryTokenizer::isOperatorKeyword(const TokenType code) | |
{ | |
switch(code) | |
{ | |
/* Fallthrough all these. */ | |
case AS: | |
case ASCENDING: | |
case AT: | |
case CASE: | |
case CAST: | |
case CASTABLE: | |
case EQ: | |
case EXTERNAL: | |
case GE: | |
case G_EQ: | |
case G_GT: | |
case G_LT: | |
case G_NE: | |
case GT: | |
case IN: | |
case INHERIT: | |
case INSTANCE: | |
case IS: | |
case ITEM: | |
case LE: | |
case LT: | |
case NE: | |
case NO_INHERIT: | |
case NO_PRESERVE: | |
case OF: | |
case PRESERVE: | |
case RETURN: | |
case STABLE: | |
case TO: | |
case TREAT: | |
return true; | |
default: | |
return false; | |
}; | |
} | |
bool XQueryTokenizer::isTypeToken(const TokenType t) | |
{ | |
switch(t) | |
{ | |
/* Fallthrough all these. */ | |
case ATTRIBUTE: | |
case COMMENT: | |
case DOCUMENT: | |
case DOCUMENT_NODE: | |
case ELEMENT: | |
case ITEM: | |
case NODE: | |
case PROCESSING_INSTRUCTION: | |
case SCHEMA_ATTRIBUTE: | |
case SCHEMA_ELEMENT: | |
case TEXT: | |
return true; | |
default: | |
return false; | |
} | |
} | |
Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
{
    /* Tokenizes either a plain NCName or a prefixed QName
     * ("prefix:local"). The ":=" sequence is explicitly excluded so that
     * e.g. "a:=b" tokenizes as the NCName "a" followed by an assignment. */
    const int start = m_pos;
    const Token t1 = tokenizeNCName();
    if(t1.hasError())
        return t1;
    if(peekCurrent() != ':' || peekAhead() == '=')
        return t1;
    ++m_pos;
    const Token t2 = tokenizeNCName();
    if(t2.hasError())
        return t2;
    else
        return Token(QNAME, m_data.mid(start, m_pos - start));
}
Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
{
    /* Tokenizes an integer, decimal or double literal. Literals with an
     * exponent ("1e3") are XPath 2.0-only and produce XPATH2_NUMBER;
     * everything else produces NUMBER. The token carries the literal's
     * lexical form; numeric conversion happens elsewhere. */
    setState(Operator);
    const int startPos = m_pos;
    bool hasDot = false;
    bool isXPath20 = false;
    for(; m_pos < m_length; ++m_pos)
    {
        QChar ch(current());
        char cell = ch.cell();
        if(cell == 'e' || cell == 'E')
        {
            isXPath20 = true;
            ++m_pos;
            ch = current();
            /* A character outside Latin-1 cannot be part of the exponent. */
            if(ch.row() != 0)
                break;
            cell = ch.cell();
            /* An optional sign may directly follow the exponent marker. */
            if(cell == '+' || cell == '-')
                continue;
        }
        /* A name-start character directly after digits ("3x") is an error. */
        if(isNCNameStart(ch))
            return error();
        if(cell < '0' || cell > '9')
        {
            /* At most one decimal point is accepted; any other
             * non-digit ends the literal. */
            if(cell == '.' && !hasDot)
                hasDot = true;
            else
                break;
        }
    }
    return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
}
QString XQueryTokenizer::tokenizeCharacterReference()
{
    /* Parses a character or entity reference. On entry '&' is the current
     * character; on success the referenced text is returned and m_pos is
     * left on the terminating ';' (the caller's loop increment steps past
     * it). A null QString is returned on any syntax error. */
    Q_ASSERT(peekCurrent() == '&');
    const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
    if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
        return QString();
    QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
    m_pos = theEnd;
    /* First try the predefined entities: lt, gt, amp, quot, apos. */
    const QChar charRef(charForReference(content));
    if(!charRef.isNull())
        return charRef;
    else if(content.startsWith(QLatin1Char('#')))
    {
        int base;
        /* It is only '#' or '#x'. */
        if(content.length() < 2)
            return QString();
        /* We got a hex number if it starts with 'x', otherwise it's a decimal. */
        if(content.at(1) == QLatin1Char('x'))
        {
            base = 16;
            content = content.mid(2); /* Remove "#x". */
        }
        else
        {
            base = 10;
            content = content.mid(1); /* Remove "#". */
        }
        bool conversionOK = false;
        const int codepoint = content.toInt(&conversionOK, base);
        if(conversionOK)
        {
            const QChar ch(codepoint);
            if(ch.isNull())
            {
                /* We likely have something which require surrogate pairs. */
                QString result;
                result += QChar(QChar::highSurrogate(codepoint));
                result += QChar(QChar::lowSurrogate(codepoint));
                return result;
            }
            else
                return ch;
        }
        else
            return QString();
    }
    else
        return QString();
}
int XQueryTokenizer::scanUntil(const char *const content) | |
{ | |
const int end = m_data.indexOf(QString::fromLatin1(content), m_pos); | |
if(end == -1) | |
return -1; | |
else | |
{ | |
const int len = end - m_pos; | |
m_pos += len; | |
return len; | |
} | |
} | |
QChar XQueryTokenizer::charForReference(const QString &reference)
{
    /* Maps an XML predefined entity name to its character; a null QChar
     * is returned for unknown names. The table is built lazily on first
     * use. */
    if(m_charRefs.isEmpty())
    {
        m_charRefs.reserve(5);
        m_charRefs.insert(QLatin1String("amp"),  QLatin1Char('&'));
        m_charRefs.insert(QLatin1String("apos"), QLatin1Char('\''));
        m_charRefs.insert(QLatin1String("gt"),   QLatin1Char('>'));
        m_charRefs.insert(QLatin1String("lt"),   QLatin1Char('<'));
        m_charRefs.insert(QLatin1String("quot"), QLatin1Char('"'));
    }
    return m_charRefs.value(reference);
}
Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
{
    /* Tokenizes a string literal delimited by the quote character at the
     * current position (' or "). Handles character references and the
     * doubled-delimiter escape ("" / ''); the value gets end-of-line
     * normalization, except for characters produced by references. */
    const QChar delimiter(current());
    /* We cannot unfortunately just scan and then do mid(),
     * since we can encounter character references. */
    QString result;
    /* This is more likely than QString's default allocation. */
    result.reserve(8);
    CharacterSkips skipEOLNormalization;
    /* Advance over the initial quote character. */
    ++m_pos;
    for(; m_pos < m_length; ++m_pos)
    {
        const QChar c(current());
        if(c == QLatin1Char('&'))
        {
            const QString charRef(tokenizeCharacterReference());
            if(charRef.isNull())
                return error();
            else
            {
                /* Record the position so normalizeEOL() leaves any
                 * referenced '\r'/'\n' untouched. */
                skipEOLNormalization.insert(result.count());
                result.append(charRef);
            }
        }
        else if(c == delimiter)
        {
            /* Maybe the escaping mechanism is used. For instance, "s""s"
             * has the value `s"s'. */
            ++m_pos;
            if(current() == delimiter) /* Double quote. */
                result += delimiter;
            else
                return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
        }
        else
            result += c;
    }
    /* The literal was never closed. */
    return error();
}
Tokenizer::Token XQueryTokenizer::tokenizeNCName()
{
    /* Scans an NCName at the current position and returns it as an NCNAME
     * token; an error token is returned when the current character cannot
     * start an NCName (the position is then left unchanged). */
    const int begin = m_pos;
    if(m_pos >= m_length || !isNCNameStart(current()))
        return error();
    ++m_pos;
    while(m_pos < m_length && isNCNameBody(current()))
        ++m_pos;
    return Token(NCNAME, m_data.mid(begin, m_pos - begin));
}
bool XQueryTokenizer::aheadEquals(const char *const chs,
                                  const int len,
                                  const int offset) const
{
    /* Returns true when the @p len characters starting @p offset positions
     * after the current one equal @p chs, compared as Latin-1.
     * NOTE(review): the bounds check uses @p len but ignores @p offset, so
     * it is exact only when offset is 1 (presumably the default declared
     * in the header, which is not visible here) -- confirm against the
     * header before relying on other offsets. */
    Q_ASSERT(len > 0);
    Q_ASSERT(qstrlen(chs) == uint(len));
    if(m_pos + len >= m_length)
        return false;
    for(int i = offset; i < (len + offset); ++i)
    {
        if(m_data.at(m_pos + i).toAscii() != chs[i - offset])
            return false;
    }
    return true;
}
const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
{
    /* Looks @p keyword up in the generated keyword table; 0 is returned
     * when it is not a keyword. */
    const QByteArray latin1(keyword.toAscii());
    return TokenLookup::value(latin1.constData(), keyword.length());
}
XQueryTokenizer::State XQueryTokenizer::state() const
{
    /* Returns the tokenizer's current lexical state. */
    return m_state;
}
void XQueryTokenizer::setState(const State s)
{
    /* Unconditionally switches the tokenizer to lexical state @p s. */
    m_state = s;
}
void XQueryTokenizer::pushState(const State s)
{
    /* Saves @p s on the state stack, to be restored by popState(). */
    m_stateStack.push(s);
}
void XQueryTokenizer::pushState()
{
    /* Saves the current state on the state stack, to be restored by
     * popState(). */
    m_stateStack.push(m_state);
}
void XQueryTokenizer::popState() | |
{ | |
/* QStack::pop() asserts if it's empty, so we need to check | |
* it, since we might receive unbalanced curlies. */ | |
if(!m_stateStack.isEmpty()) | |
m_state = m_stateStack.pop(); | |
} | |
Tokenizer::Token XQueryTokenizer::nextToken() | |
{ | |
switch(state()) | |
{ | |
/* We want to skip or do special whitespace handling for these | |
* states. So fallthrough all of the following. */ | |
case AposAttributeContent: | |
case Axis: | |
case ElementContent: | |
case EndTag: | |
case Pragma: | |
case PragmaContent: | |
case ProcessingInstructionName: | |
case QuotAttributeContent: | |
case StartTag: | |
case XMLComment: | |
break; | |
default: | |
handleWhitespace(); | |
} | |
switch(state()) | |
{ | |
case XMLSpaceDecl: | |
/* Fallthrough. */ | |
case NamespaceKeyword: | |
{ | |
switch(peekCurrent()) | |
{ | |
case ',': | |
return tokenAndAdvance(COMMA); | |
case '"': | |
/* Fallthrough. */ | |
case '\'': | |
{ | |
setState(NamespaceDecl); | |
return tokenizeStringLiteral(); | |
} | |
} | |
const Token id(tokenizeNCName()); | |
if(id.type != NCNAME) | |
return id; | |
const TokenMap *const keyword = lookupKeyword(id.value); | |
if(keyword) | |
{ | |
switch(keyword->token) | |
{ | |
case INHERIT: | |
/* Fallthrough. */ | |
case NO_INHERIT: | |
{ | |
setState(Default); | |
break; | |
} | |
case NAMESPACE: | |
{ | |
setState(NamespaceDecl); | |
break; | |
} | |
case ORDERED: | |
/* Fallthrough. */ | |
case UNORDERED: | |
/* Fallthrough. */ | |
case STRIP: | |
{ | |
setState(Default); | |
break; | |
} | |
case PRESERVE: | |
{ | |
if(state() != NamespaceKeyword) | |
setState(Default); | |
} | |
default: | |
break; | |
} | |
return Token(keyword->token); | |
} | |
else | |
return id; | |
Q_ASSERT(false); | |
} | |
case NamespaceDecl: | |
{ | |
switch(peekCurrent()) | |
{ | |
case '=': | |
return tokenAndAdvance(G_EQ); | |
case ';': | |
return tokenAndChangeState(SEMI_COLON, Default); | |
case '\'': | |
/* Fallthrough. */ | |
case '\"': | |
return tokenizeStringLiteral(); | |
} | |
const Token nc(tokenizeNCName()); | |
handleWhitespace(); | |
const char pc = peekCurrent(); | |
const TokenMap* const t = lookupKeyword(nc.value); | |
if(pc == '\'' || (pc == '"' && t)) | |
return tokenAndChangeState(t->token, Default, 0); | |
else | |
return nc; | |
Q_ASSERT(false); | |
} | |
case Axis: | |
{ | |
if(peekCurrent() == ':') | |
{ | |
Q_ASSERT(peekAhead() == ':'); | |
m_pos += 2; | |
setState(AfterAxisSeparator); | |
return Token(COLONCOLON); | |
} | |
/* Fallthrough. */ | |
} | |
case AfterAxisSeparator: | |
/* Fallthrough. */ | |
case Default: | |
/* State Operator and state Default have a lot of tokens in common except | |
* for minor differences. So we treat them the same way, and sprinkles logic | |
* here and there to handle the small differences. */ | |
/* Fallthrough. */ | |
case Operator: | |
{ | |
switch(peekCurrent()) | |
{ | |
case '=': | |
return tokenAndChangeState(G_EQ, Default); | |
case '-': | |
return tokenAndChangeState(MINUS, Default); | |
case '+': | |
return tokenAndChangeState(PLUS, Default); | |
case '[': | |
return tokenAndChangeState(LBRACKET, Default); | |
case ']': | |
return tokenAndChangeState(RBRACKET, Operator); | |
case ',': | |
return tokenAndChangeState(COMMA, Default); | |
case ';': | |
return tokenAndChangeState(SEMI_COLON, Default); | |
case '$': | |
return tokenAndChangeState(DOLLAR, VarName); | |
case '|': | |
return tokenAndChangeState(BAR, Default); | |
case '?': | |
return tokenAndChangeState(QUESTION, Operator); | |
case ')': | |
return tokenAndChangeState(RPAREN, Operator); | |
case '@': | |
return tokenAndChangeState(AT_SIGN, Default); | |
/* Fallthrough all these. */ | |
case '1': | |
case '2': | |
case '3': | |
case '4': | |
case '5': | |
case '6': | |
case '7': | |
case '8': | |
case '9': | |
case '0': | |
return tokenizeNumberLiteral(); | |
case '.': | |
{ | |
const char next = peekAhead(); | |
if(next == '.') | |
return tokenAndChangeState(DOTDOT, Operator, 2); | |
/* .5 is allowed, as short form for 0.5: | |
* <tt>[142] DecimalLiteral ::= ("." Digits) | (Digits "." [0-9]*)</tt> | |
*/ | |
else if(isDigit(next)) | |
return tokenizeNumberLiteral(); | |
else | |
return tokenAndChangeState(DOT, Operator); | |
} | |
case '\'': | |
/* Fallthrough. */ | |
case '"': | |
{ | |
setState(Operator); | |
return tokenizeStringLiteral(); | |
} | |
case '(': | |
{ | |
if(peekAhead() == '#') | |
return tokenAndChangeState(PRAGMA_START, Pragma, 2); | |
else | |
return tokenAndChangeState(LPAREN, Default); | |
} | |
case '*': | |
{ | |
if(peekAhead() == ':') | |
{ | |
m_pos += 2; /* Consume *:. */ | |
const Token nc = tokenizeNCName(); | |
if(nc.hasError()) | |
return error(); | |
else | |
return tokenAndChangeState(ANY_PREFIX, nc.value, Operator); | |
} | |
else | |
return tokenAndChangeState(STAR, state() == Default ? Operator : Default); | |
} | |
case ':': | |
{ | |
switch(peekAhead()) | |
{ | |
case '=': | |
return tokenAndChangeState(ASSIGN, Default, 2); | |
case ':': | |
return tokenAndChangeState(COLONCOLON, Default, 2); | |
default: | |
return error(); | |
} | |
} | |
case '!': | |
{ | |
if(peekAhead() == '=') | |
return tokenAndChangeState(G_NE, Default, 2); | |
else | |
return error(); | |
} | |
case '<': | |
{ | |
switch(peekAhead()) | |
{ | |
case '=': | |
return tokenAndChangeState(G_LE, Default, 2); | |
case '<': | |
return tokenAndChangeState(PRECEDES, Default, 2); | |
case '?': | |
{ | |
pushState(Operator); | |
return tokenAndChangeState(PI_START, ProcessingInstructionName, 2); | |
} | |
case '!': | |
{ | |
if(aheadEquals("!--", 3)) | |
{ | |
m_pos += 3; /* Consume "!--". */ | |
pushState(Operator); | |
return tokenAndChangeState(COMMENT_START, XMLComment); | |
} | |
/* Fallthrough. It's a syntax error, and this is a good way to report it. */ | |
} | |
default: | |
{ | |
if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1))) | |
{ | |
/* We assume it's an element constructor. */ | |
pushState(Operator); | |
} | |
return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag); | |
} | |
} | |
} | |
case '>': | |
{ | |
switch(peekAhead()) | |
{ | |
case '=': | |
return tokenAndChangeState(G_GE, Default, 2); | |
case '>': | |
return tokenAndChangeState(FOLLOWS, Default, 2); | |
default: | |
return tokenAndChangeState(G_GT, Default); | |
} | |
} | |
case '/': | |
{ | |
if(peekAhead() == '/') | |
return tokenAndChangeState(SLASHSLASH, Default, 2); | |
else | |
return tokenAndChangeState(SLASH, Default); | |
} | |
case '{': | |
{ | |
pushState(Operator); | |
return tokenAndChangeState(CURLY_LBRACE, Default); | |
} | |
case '}': | |
{ | |
popState(); | |
return tokenAndAdvance(CURLY_RBRACE); | |
} | |
} | |
/* Ok. We're in state Default or Operator, and it wasn't a simple | |
* character. */ | |
const Token id(tokenizeNCName()); | |
if(id.type != NCNAME) | |
return id; | |
const TokenMap *const keyword = lookupKeyword(id.value); | |
if(state() == Operator) | |
{ | |
if(keyword) | |
{ | |
if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING) | |
setState(Operator); | |
else if(keyword->token == RETURN) | |
setState(Default); | |
else if(isPhraseKeyword(keyword->token)) | |
{ | |
const TokenType ws = consumeWhitespace(); | |
if(ws == ERROR) | |
return error(); | |
const Token id2(tokenizeNCName()); | |
const TokenMap *const keyword2 = lookupKeyword(id2.value); | |
if(keyword2) | |
{ | |
if(keyword->token == TREAT && keyword2->token == AS) | |
setState(ItemType); | |
else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY) | |
setState(Default); | |
m_tokenStack.push(Token(keyword2->token)); | |
} | |
else | |
m_tokenStack.push(id2); | |
return Token(keyword->token); | |
} | |
else | |
{ | |
/* Such that we tokenize the second token in "empty greatest". */ | |
if(keyword->token != EMPTY) | |
setState(Default); | |
} | |
if(keyword->token == AS || keyword->token == CASE) | |
setState(ItemType); | |
return Token(keyword->token); | |
} | |
else | |
return id; | |
} | |
Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator); | |
/* | |
* This is hard. Consider this: | |
* | |
* Valid: child ::nameTest | |
* Valid: child:: nameTest | |
* Syntax Error: child :localName | |
* Syntax Error: child: localName | |
* | |
* Consider "child ::name". Right now, we're here: | |
* ^ | |
* We don't know whether "child" is a prefix and hence the whitespace is invalid, | |
* or whether it's an axis and hence skippable. */ | |
{ | |
const int wsLength = peekForColonColon(); | |
/* We cannot call handleWhitespace() because it returns on | |
* END_OF_FILE, and we have parsed up keyword, and we need to | |
* deal with that. | |
* | |
* If we have a colon colon, which means the whitespace is | |
* allowed, we skip it. */ | |
if(wsLength != -1) | |
m_pos += wsLength; | |
} | |
/* Handle name tests. */ | |
if(peekCurrent() == ':') | |
{ | |
switch(peekAhead()) | |
{ | |
case '=': | |
return id; | |
case '*': | |
{ | |
m_pos += 2; | |
return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator); | |
} | |
case ':': | |
{ | |
/* We have an axis. */ | |
setState(Axis); | |
return keyword ? Token(keyword->token) : id; | |
} | |
default: | |
{ | |
/* It's a QName. */ | |
++m_pos; /* Consume the colon. */ | |
const Token id2(tokenizeNCName()); | |
if(id2.type != NCNAME) | |
{ | |
--m_pos; | |
return id; | |
} | |
setState(Operator); | |
const int qNameLen = id.value.length() + id2.value.length() + 1; | |
return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen)); | |
} | |
} | |
} | |
if(!keyword || isOperatorKeyword(keyword->token)) | |
{ | |
setState(Operator); | |
return id; | |
} | |
const TokenType ws = consumeWhitespace(); | |
if(ws == ERROR) // TODO this should test for success. Write test. | |
return Token(ERROR); | |
if(atEnd()) | |
{ | |
setState(Operator); | |
return id; | |
} | |
/* Let the if-body apply for constructors, and node type tests. */ | |
if(isTypeToken(keyword->token) || | |
keyword->token == TYPESWITCH || | |
keyword->token == ORDERED || | |
keyword->token == UNORDERED || | |
keyword->token == IF) | |
{ | |
switch(peekCurrent()) | |
{ | |
case '(': | |
{ | |
// TODO See if we can remove DOCUMENT from isTypeToken. | |
if(isTypeToken(keyword->token) && keyword->token != DOCUMENT) | |
{ | |
m_tokenStack.push(Token(LPAREN)); | |
++m_pos; /* Consume '('. */ | |
pushState(Operator); | |
if(keyword->token == PROCESSING_INSTRUCTION) | |
setState(KindTestForPI); | |
else | |
setState(KindTest); | |
return Token(keyword->token); | |
} | |
else if(keyword->token == TYPESWITCH || keyword->token == IF) | |
return Token(keyword->token); | |
else /* It's a function call. */ | |
return id; | |
} | |
case '{': | |
{ | |
m_tokenStack.push(Token(CURLY_LBRACE)); | |
++m_pos; /* Consume '{'. */ | |
pushState(Operator); | |
/* Stay in state Default. */ | |
return Token(keyword->token); | |
} | |
default: | |
{ | |
/* We have read in a token which is for instance | |
* "return", and now it can be an element | |
* test("element") a node kind test("element()"), or a | |
* computed element constructor("element name {..."). | |
* We need to do a two-token lookahead here, because | |
* "element return" can be an element test followed by | |
* the return keyword, but it can also be an element | |
* constructor("element return {"). */ | |
if(isNCNameStart(current())) | |
{ | |
const int currentPos = m_pos; | |
const Token token2 = tokenizeNCNameOrQName(); | |
if(token2.hasError()) | |
return token2; | |
handleWhitespace(); | |
if(peekCurrent() == '{') | |
{ | |
/* An element constructor. */ | |
m_tokenStack.push(token2); | |
return Token(keyword->token); | |
} | |
/* We jump back in the stream, we need to tokenize token2 according | |
* to the state. */ | |
m_pos = currentPos; | |
setState(Operator); | |
return Token(NCNAME, QLatin1String(keyword->name)); | |
} | |
} | |
} | |
} | |
if(peekCurrent() == '$') | |
{ | |
setState(VarName); | |
return Token(keyword->token); | |
} | |
/* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */ | |
if(peekCurrent() == '(') | |
return id; | |
else if(peekCurrent() == '{' && keyword->token == VALIDATE) | |
return Token(keyword->token); | |
if(!isNCNameStart(current())) | |
{ | |
setState(Operator); | |
return id; | |
} | |
const Token id2(tokenizeNCName()); | |
const TokenMap *const keyword2 = lookupKeyword(id2.value); | |
if(!keyword2) | |
{ | |
/* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */ | |
setState(Operator); | |
return id; | |
} | |
switch(keyword->token) | |
{ | |
case DECLARE: | |
{ | |
switch(keyword2->token) | |
{ | |
case VARIABLE: | |
/* Fallthrough. */ | |
case FUNCTION: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
setState(Default); | |
return Token(keyword->token); | |
} | |
case OPTION: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
setState(Default); | |
return Token(keyword->token); | |
} | |
case COPY_NAMESPACES: | |
/* Fallthrough. */ | |
case ORDERING: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
setState(NamespaceKeyword); | |
return Token(keyword->token); | |
} | |
case CONSTRUCTION: | |
{ | |
// TODO identical to CONSTRUCTION? | |
m_tokenStack.push(Token(keyword2->token)); | |
setState(Operator); | |
return Token(keyword->token); | |
} | |
case NAMESPACE: | |
/* Fallthrough. */ | |
case BASEURI: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
setState(NamespaceDecl); | |
return Token(keyword->token); | |
} | |
case BOUNDARY_SPACE: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
setState(XMLSpaceDecl); | |
return Token(keyword->token); | |
} | |
case DEFAULT: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
const TokenType ws2 = consumeWhitespace(); | |
if(ws2 != SUCCESS) | |
{ | |
m_tokenStack.prepend(Token(ws2)); | |
return Token(keyword->token); | |
} | |
const Token id3(tokenizeNCName()); | |
if(id3.type != NCNAME) | |
{ | |
m_tokenStack.prepend(id3); | |
return Token(keyword->token); | |
} | |
const TokenMap *const keyword3 = lookupKeyword(id3.value); | |
if(!keyword3) | |
{ | |
m_tokenStack.prepend(id3); | |
return Token(keyword->token); | |
} | |
else | |
{ | |
m_tokenStack.prepend(Token(keyword3->token)); | |
if(keyword3->token == ORDER) | |
setState(Operator); | |
else | |
setState(NamespaceDecl); | |
} | |
return Token(keyword->token); | |
} | |
default: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
setState(Default); | |
return id; | |
} | |
} | |
} | |
case XQUERY: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
if(keyword2->token == VERSION) | |
{ | |
setState(NamespaceDecl); | |
return Token(keyword->token); | |
} | |
else | |
{ | |
setState(Operator); | |
return id; | |
} | |
} | |
case IMPORT: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
switch(keyword2->token) | |
{ | |
case SCHEMA: | |
/* Fallthrough. */ | |
case MODULE: | |
{ | |
setState(NamespaceKeyword); | |
return Token(keyword->token); | |
} | |
default: | |
{ | |
setState(Operator); | |
return id; | |
} | |
} | |
} | |
case VALIDATE: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
switch(keyword2->token) | |
{ | |
case LAX: | |
case STRICT: | |
{ | |
pushState(Operator); | |
return Token(keyword->token); | |
} | |
default: | |
{ | |
setState(Operator); | |
return id; | |
} | |
} | |
} | |
default: | |
{ | |
m_tokenStack.push(Token(keyword2->token)); | |
setState(Operator); | |
return id; | |
} | |
} | |
Q_ASSERT(false); | |
} | |
case VarName: | |
{ | |
if(peekCurrent() == '$') | |
return tokenAndAdvance(DOLLAR); | |
setState(Operator); | |
return tokenizeNCNameOrQName(); | |
Q_ASSERT(false); | |
} | |
case ItemType: | |
{ | |
switch(peekCurrent()) | |
{ | |
case '(': | |
return tokenAndChangeState(LPAREN, KindTest); | |
case '$': | |
return tokenAndChangeState(DOLLAR, VarName); | |
} | |
const Token name(tokenizeNCNameOrQName()); | |
if(name.hasError()) | |
return error(); | |
else if(name.type == QNAME) | |
{ | |
setState(OccurrenceIndicator); | |
return name; | |
} | |
else | |
{ | |
const TokenMap *const keyword = lookupKeyword(name.value); | |
if(keyword) | |
{ | |
pushState(OccurrenceIndicator); | |
return Token(keyword->token); | |
} | |
else | |
{ | |
setState(Default); | |
return name; | |
} | |
} | |
Q_ASSERT(false); | |
} | |
case KindTest: | |
{ | |
switch(peekCurrent()) | |
{ | |
case ')': | |
{ | |
popState(); | |
return tokenAndAdvance(RPAREN); | |
} | |
case '(': | |
return tokenAndAdvance(LPAREN); | |
case ',': | |
return tokenAndAdvance(COMMA); | |
case '*': | |
return tokenAndAdvance(STAR); | |
case '?': | |
return tokenAndAdvance(QUESTION); | |
case '\'': | |
/* Fallthrough. */ | |
case '"': | |
return tokenizeStringLiteral(); | |
} | |
const Token nc(tokenizeNCNameOrQName()); | |
if(nc.hasError()) | |
return nc; | |
const TokenType ws = consumeWhitespace(); | |
if(ws == ERROR) | |
return error(); | |
if(peekCurrent() == '(') | |
{ | |
const TokenMap *const keyword = lookupKeyword(nc.value); | |
if(keyword) | |
{ | |
pushState(KindTest); | |
return Token(keyword->token); | |
} | |
else | |
return nc; | |
} | |
else | |
return nc; | |
Q_ASSERT(false); | |
} | |
case KindTestForPI: | |
{ | |
switch(peekCurrent()) | |
{ | |
case ')': | |
{ | |
popState(); | |
return tokenAndAdvance(RPAREN); | |
} | |
case '\'': | |
/* Fallthrough. */ | |
case '"': | |
return tokenizeStringLiteral(); | |
default: | |
return tokenizeNCName(); | |
} | |
Q_ASSERT(false); | |
} | |
case OccurrenceIndicator: | |
{ | |
switch(peekCurrent()) | |
{ | |
case '?': | |
return tokenAndChangeState(QUESTION, Operator); | |
case '*': | |
return tokenAndChangeState(STAR, Operator); | |
case '+': | |
return tokenAndChangeState(PLUS, Operator); | |
default: | |
{ | |
setState(Operator); | |
return nextToken(); | |
} | |
} | |
Q_ASSERT(false); | |
} | |
case XQueryVersion: | |
{ | |
switch(peekCurrent()) | |
{ | |
case '\'': | |
/* Fallthrough. */ | |
case '"': | |
return tokenizeStringLiteral(); | |
case ';': | |
return tokenAndChangeState(SEMI_COLON, Default); | |
} | |
const Token id(tokenizeNCName()); | |
if(id.type != NCNAME) | |
return id; | |
const TokenMap *const keyword = lookupKeyword(id.value); | |
if(keyword) | |
return tokenAndChangeState(keyword->token, Default); | |
else | |
return id; | |
Q_ASSERT(false); | |
} | |
case StartTag: | |
{ | |
if(peekAhead(-1) == '<') | |
{ | |
if(current().isSpace()) | |
return Token(ERROR); | |
} | |
else | |
{ | |
if(consumeRawWhitespace()) | |
return Token(END_OF_FILE); | |
} | |
switch(peekCurrent()) | |
{ | |
case '/': | |
{ | |
if(peekAhead() == '>') | |
{ | |
m_pos += 2; | |
if(m_scanOnly) | |
return Token(POSITION_SET); | |
else | |
{ | |
popState(); | |
return Token(QUICK_TAG_END); | |
} | |
} | |
else | |
return error(); | |
} | |
case '>': | |
{ | |
if(m_scanOnly) | |
return tokenAndChangeState(POSITION_SET, StartTag); | |
else | |
return tokenAndChangeState(G_GT, ElementContent); | |
} | |
case '=': | |
return tokenAndAdvance(G_EQ); | |
case '\'': | |
return tokenAndChangeState(APOS, AposAttributeContent); | |
case '"': | |
return tokenAndChangeState(QUOTE, QuotAttributeContent); | |
default: | |
return tokenizeNCNameOrQName(); | |
} | |
Q_ASSERT(false); | |
} | |
case AposAttributeContent: | |
/* Fallthrough. */ | |
case QuotAttributeContent: | |
{ | |
const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"')); | |
QString result; | |
result.reserve(20); | |
if(m_scanOnly) | |
{ | |
int stack = 0; | |
return attributeAsRaw(sep, stack, m_pos, true, result); | |
} | |
Q_ASSERT(!m_scanOnly); | |
while(true) | |
{ | |
if(atEnd()) | |
{ | |
/* In the case that the XSL-T tokenizer invokes us with | |
* default state QuotAttributeContent, we need to be able | |
* to return a single string, in case that is all we have | |
* accumulated. */ | |
if(result.isEmpty()) | |
return Token(END_OF_FILE); | |
else | |
return Token(STRING_LITERAL, result); | |
} | |
const QChar curr(current()); | |
if(curr == sep) | |
{ | |
if(m_pos + 1 == m_length) | |
return Token(END_OF_FILE); | |
if(m_data.at(m_pos + 1) == sep) | |
{ | |
/* The quoting mechanism was used. */ | |
m_pos += 2; | |
result.append(sep); | |
continue; | |
} | |
const QChar next(m_data.at(m_pos + 1)); | |
if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>')) | |
return Token(ERROR); // i18n Space must separate attributes | |
else if(result.isEmpty()) | |
{ | |
return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE, | |
StartTag, 1); | |
} | |
else | |
{ | |
/* Don't consume the sep, but leave it so we next time return a token for it. */ | |
return Token(STRING_LITERAL, result); | |
} | |
++m_pos; | |
continue; | |
} | |
else if(curr == QLatin1Char('{')) | |
{ | |
if(m_pos + 1 == m_length) | |
return Token(END_OF_FILE); | |
else if(peekAhead() == '{') | |
{ | |
++m_pos; | |
result.append(QLatin1Char('{')); | |
} | |
else | |
{ | |
if(result.isEmpty()) | |
{ | |
/* The Attribute Value Template appeared directly in the attribute. */ | |
pushState(); | |
return tokenAndChangeState(CURLY_LBRACE, Default); | |
} | |
else | |
{ | |
/* We don't advance, keep '{' as next token. */ | |
return Token(STRING_LITERAL, result); | |
} | |
} | |
} | |
else if(curr == QLatin1Char('}')) | |
{ | |
if(m_pos + 1 == m_length) | |
return Token(END_OF_FILE); | |
else if(peekAhead() == '}') | |
{ | |
++m_pos; | |
result.append(QLatin1Char('}')); | |
} | |
else | |
return Token(ERROR); | |
} | |
else if(curr == QLatin1Char('&')) | |
{ | |
const QString ret(tokenizeCharacterReference()); | |
if(ret.isNull()) | |
return Token(ERROR); | |
else | |
result.append(ret); | |
} | |
else if(curr == QLatin1Char('<')) | |
return Token(STRING_LITERAL, result); | |
else | |
{ | |
/* See Extensible Markup Language (XML) 1.0 (Fourth Edition), | |
* 3.3.3 Attribute-Value Normalization. | |
* | |
* However, it is complicated a bit by that AVN is defined on top of | |
* EOL normalization and we do those two in one go here. */ | |
switch(curr.unicode()) | |
{ | |
case 0xD: | |
{ | |
if(peekAhead() == '\n') | |
{ | |
result.append(QLatin1Char(' ')); | |
++m_pos; | |
break; | |
} | |
} | |
case 0xA: | |
/* Fallthrough. */ | |
case 0x9: | |
{ | |
result.append(QLatin1Char(' ')); | |
break; | |
} | |
default: | |
result.append(curr); | |
} | |
} | |
++m_pos; | |
} | |
Q_ASSERT(false); | |
} | |
case ElementContent: | |
{ | |
QString result; | |
result.reserve(20); | |
/* Whether the text node, result, may be whitespace only. Character references | |
* and CDATA sections disables that. */ | |
bool mayBeWS = true; | |
CharacterSkips skipEOLNormalization; | |
while(true) | |
{ | |
if(atEnd()) | |
return Token(END_OF_FILE); | |
switch(peekCurrent()) | |
{ | |
case '<': | |
{ | |
if(!result.isEmpty() && peekAhead(2) != '[') | |
{ | |
/* We encountered the end, and it was not a CDATA section. */ | |
/* We don't advance. Next time we'll handle the <... stuff. */ | |
return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization)); | |
} | |
++m_pos; | |
if(atEnd()) | |
return Token(END_OF_FILE); | |
const QChar ahead(current()); | |
if(ahead.isSpace()) | |
return error(); | |
else if(ahead == QLatin1Char('/')) | |
{ | |
if(m_pos + 1 == m_length) | |
return Token(END_OF_FILE); | |
else if(m_data.at(m_pos + 1).isSpace()) | |
return error(); | |
else | |
return tokenAndChangeState(BEGIN_END_TAG, EndTag); | |
} | |
else if(isNCNameStart(ahead)) | |
{ | |
pushState(); | |
return tokenAndChangeState(G_LT, StartTag, 0); | |
} | |
else if(aheadEquals("!--", 3, 0)) | |
{ | |
pushState(); | |
m_pos += 3; | |
return tokenAndChangeState(COMMENT_START, XMLComment, 0); | |
} | |
else if(aheadEquals("![CDATA[", 8, 0)) | |
{ | |
mayBeWS = false; | |
m_pos += 8; | |
const int start = m_pos; | |
const int len = scanUntil("]]>"); | |
if(len == -1) | |
return Token(END_OF_FILE); | |
m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */ | |
result.append(m_data.mid(start, len)); | |
break; | |
} | |
else if(ahead == QLatin1Char('?')) | |
{ | |
pushState(); | |
return tokenAndChangeState(PI_START, ProcessingInstructionName); | |
} | |
else | |
return Token(G_LT); | |
} | |
case '&': | |
{ | |
const QString ret(tokenizeCharacterReference()); | |
if(ret.isNull()) | |
return Token(ERROR); | |
else | |
{ | |
skipEOLNormalization.insert(result.count()); | |
result.append(ret); | |
mayBeWS = false; | |
break; | |
} | |
} | |
case '{': | |
{ | |
// TODO remove this check, also below. | |
if(m_pos + 1 == m_length) | |
return Token(END_OF_FILE); | |
else if(peekAhead() == '{') | |
{ | |
++m_pos; | |
result.append(QLatin1Char('{')); | |
} | |
else | |
{ | |
if(result.isEmpty()) | |
{ | |
pushState(); | |
return tokenAndChangeState(CURLY_LBRACE, Default); | |
} | |
else | |
{ | |
/* We don't advance here. */ | |
return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization)); | |
} | |
} | |
break; | |
} | |
case '}': | |
{ | |
if(m_pos + 1 == m_length) | |
return Token(END_OF_FILE); | |
else if(peekAhead() == '}') | |
{ | |
++m_pos; | |
result.append(QLatin1Char('}')); | |
} | |
else | |
{ | |
/* This is a parse error, and the grammar won't be able | |
* to reduce this CURLY_RBRACE. */ | |
return tokenAndChangeState(CURLY_RBRACE, Default); | |
} | |
break; | |
} | |
case '\n': | |
{ | |
/* We want to translate \r\n into \n. */ | |
if(peekAhead(-1) == '\r') | |
break; | |
/* else, fallthrough. */ | |
} | |
case '\r': | |
{ | |
result.append(QLatin1Char('\n')); | |
break; | |
} | |
default: | |
{ | |
result.append(current()); | |
break; | |
} | |
} | |
++m_pos; | |
} | |
Q_ASSERT(false); | |
} | |
case ProcessingInstructionName: | |
{ | |
const int start = m_pos; | |
while(true) | |
{ | |
++m_pos; | |
if(m_pos >= m_length) | |
return Token(END_OF_FILE); | |
const QChar next(current()); | |
if(next.isSpace() || next == QLatin1Char('?')) | |
{ | |
return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start), | |
ProcessingInstructionContent); | |
} | |
} | |
Q_ASSERT(false); | |
} | |
case ProcessingInstructionContent: | |
{ | |
/* Consume whitespace between the name and the content. */ | |
if(consumeRawWhitespace()) | |
return Token(END_OF_FILE); | |
const int start = m_pos; | |
const int len = scanUntil("?>"); | |
if(len == -1) | |
return Token(END_OF_FILE); | |
else | |
{ | |
m_pos += 2; /* Consume "?>" */ | |
popState(); | |
return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips())); | |
} | |
Q_ASSERT(false); | |
} | |
case EndTag: | |
{ | |
if(consumeRawWhitespace()) | |
return END_OF_FILE; | |
if(peekCurrent() == '>') | |
{ | |
popState(); | |
return tokenAndAdvance(G_GT); | |
} | |
else | |
return tokenizeNCNameOrQName(); | |
Q_ASSERT(false); | |
} | |
case XMLComment: | |
{ | |
const int start = m_pos; | |
const int len = scanUntil("--"); | |
if(len == -1) | |
return END_OF_FILE; | |
else | |
{ | |
m_pos += 2; /* Consume "--". */ | |
popState(); | |
if(peekCurrent() == '>') | |
{ | |
++m_pos; | |
return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips())); | |
} | |
else | |
return error(); | |
} | |
Q_ASSERT(false); | |
} | |
case Pragma: | |
{ | |
/* Consume whitespace. */ | |
if(consumeRawWhitespace()) | |
return Token(END_OF_FILE); | |
setState(PragmaContent); | |
return tokenizeNCNameOrQName(); | |
} | |
case PragmaContent: | |
{ | |
QString result; | |
result.reserve(20); | |
const bool hasWS = m_pos < m_length && current().isSpace(); | |
/* Consume all whitespace up to the pragma content(if any). */ | |
if(consumeRawWhitespace()) | |
return Token(END_OF_FILE); | |
if(peekCurrent() == '#' && peekAhead() == ')') | |
{ | |
/* We reached the end, and there's no pragma content. */ | |
return tokenAndChangeState(PRAGMA_END, Default, 2); | |
} | |
else if(!hasWS) | |
{ | |
/* A separating space is required if there's pragma content. */ | |
return error(); /* i18n */ | |
} | |
const int start = m_pos; | |
const int len = scanUntil("#)"); | |
if(len == -1) | |
return Token(END_OF_FILE); | |
return Token(STRING_LITERAL, m_data.mid(start, len)); | |
Q_ASSERT(false); | |
} | |
} | |
Q_ASSERT(false); | |
return error(); | |
} | |
/**
 * Scans the raw text of an attribute value, appending what is read to
 * @p result. Visible callers invoke this only when m_scanOnly is set
 * (see the Apos/QuotAttributeContent states above).
 *
 * @p sep is the attribute's separator, ' or '"'. A doubled @p sep is the
 * XML quoting mechanism and is collapsed to a single character. A single
 * '{' opens an embedded expression: the function recurses with
 * @p sepStack counting the brace-nesting depth until the matching '}' is
 * reached, at which point the nested call returns SUCCESS. Doubled
 * braces escape literal braces.
 *
 * @p startPos is where the attribute value began; if @p sep is found
 * right there the value is empty and a QUOTE/APOS token is returned,
 * switching back to the StartTag state. @p aInLiteral says whether
 * scanning starts inside a string literal.
 *
 * Returns END_OF_FILE when the input is exhausted and ERROR for an
 * invalid character reference.
 */
Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
                                                 int &sepStack,
                                                 const int startPos,
                                                 const bool aInLiteral,
                                                 QString &result)
{
    bool inLiteral = aInLiteral;
    const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');

    while(true)
    {
        if(atEnd())
            return END_OF_FILE;

        if(peekCurrent() == sep.unicode())
        {
            /* We're either entering or leaving a literal. */
            inLiteral = !inLiteral;

            if(peekAhead() == sep.unicode())
            {
                /* The quoting mechanism was used. */
                result.append(current());
                m_pos += 2;
                continue;
            }
            else
            {
                /* Don't consume the separator, such that we
                 * return a token for it next time. */
                if(m_pos == startPos)
                {
                    ++m_pos;
                    setState(StartTag);
                    return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
                }

                if(sepStack == 0)
                {
                    return Token(STRING_LITERAL, result);
                }
                else
                {
                    result.append(current());
                    ++m_pos;
                    continue;
                }
            }
        }
        else if(peekCurrent() == '&')
        {
            const QString ret(tokenizeCharacterReference());
            if(ret.isNull())
                return Token(ERROR);
            else
            {
                result.append(ret);
                ++m_pos;
                continue;
            }
        }
        else if(peekCurrent() == otherSep)
        {
            result.append(current());
            ++m_pos;

            /* A doubled "other" separator is consumed as a pair. */
            if(peekCurrent() == otherSep)
                ++m_pos;

            inLiteral = !inLiteral;
            continue;
        }
        else if(peekCurrent() == '{')
        {
            result.append(current());

            if(peekAhead() == '{')
            {
                /* Escaped brace: "{{" produces one '{'. */
                m_pos += 2;
                continue;
            }
            else
            {
                /* An embedded expression starts; recurse until its '}'. */
                ++m_pos;
                ++sepStack;
                const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
                if(t.type != SUCCESS)
                    return t;
            }
        }
        else if(peekCurrent() == '}')
        {
            if(inLiteral && peekAhead() == '}')
            {
                /* Escaped brace: "}}" produces one '}'. */
                result.append(current());
                m_pos += 2;
                continue;
            }
            else
            {
                ++m_pos;
                --sepStack;
                return Token(SUCCESS); /* The return value is arbitrary. */
            }
        }
        else
        {
            result.append(current());
            ++m_pos;
        }
    }
}
/**
 * Returns the next token, writing its start position into
 * @p sourceLocator (columns are reported 1-based; m_pos is 0-based).
 *
 * Tokens queued on m_tokenStack by earlier lookahead are drained first.
 * Popping certain keyword tokens also advances the tokenizer into the
 * state that must follow that keyword.
 */
Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator)
{
    sourceLocator->first_line = m_line;
    sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */

    if(!m_tokenStack.isEmpty())
    {
        const Token queued(m_tokenStack.pop());

        switch(queued.type)
        {
            case MODULE:
            case SCHEMA:
            case COPY_NAMESPACES:
            {
                setState(NamespaceKeyword);
                break;
            }
            case VERSION:
            {
                setState(XQueryVersion);
                break;
            }
            case AS:
            case OF:
            {
                setState(ItemType);
                break;
            }
            default:
            {
                if(isOperatorKeyword(queued.type))
                    setState(Default);
                break;
            }
        }

        return queued;
    }

    return nextToken();
}
int XQueryTokenizer::commenceScanOnly() | |
{ | |
m_scanOnly = true; | |
return m_pos; | |
} | |
void XQueryTokenizer::resumeTokenizationFrom(const int pos) | |
{ | |
m_scanOnly = false; | |
m_pos = pos; | |
} | |
void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
{
    /* Deliberately a no-op: this tokenizer keeps all of its state in its
     * own members and ignores the parser context (the parameter is left
     * unnamed). NOTE(review): presumably overrides a Tokenizer interface
     * hook -- confirm against the class declaration in the header. */
}
#undef handleWhitespace | |
} // namespace QPatternist | |
QT_END_NAMESPACE |