// blob: 2337bc37bf6a55d8fce0e16d3316d6f2c0a23886 [file] [log] [blame]
/****************************************************************************
**
** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
** All rights reserved.
** Contact: Nokia Corporation (qt-info@nokia.com)
**
** This file is part of the QtXmlPatterns module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** GNU Lesser General Public License Usage
** This file may be used under the terms of the GNU Lesser General Public
** License version 2.1 as published by the Free Software Foundation and
** appearing in the file LICENSE.LGPL included in the packaging of this
** file. Please review the following information to ensure the GNU Lesser
** General Public License version 2.1 requirements will be met:
** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights. These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU General
** Public License version 3.0 as published by the Free Software Foundation
** and appearing in the file LICENSE.GPL included in the packaging of this
** file. Please review the following information to ensure the GNU General
** Public License version 3.0 requirements will be met:
** http://www.gnu.org/copyleft/gpl.html.
**
** Other Usage
** Alternatively, this file may be used in accordance with the terms and
** conditions contained in a signed written agreement between you and Nokia.
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/
#include <QByteArray>
#include "qquerytransformparser_p.h"
#include "qxquerytokenizer_p.h"
#include "qtokenlookup.cpp"
QT_BEGIN_NAMESPACE
namespace QPatternist
{
/* Consumes whitespace (and, via consumeWhitespace(), comments) at the
 * current position. If consumption does not return SUCCESS -- e.g. on
 * end-of-file or a malformed comment -- the *enclosing* function
 * immediately returns a Token carrying that code. This is a macro
 * precisely because it must be able to return from its caller. */
#define handleWhitespace() \
{ \
const TokenType t = consumeWhitespace(); \
if(t != SUCCESS) \
return Token(t); \
}
/* Constructs a tokenizer over @p query, reporting errors against
 * @p location. @p startingState lets a caller start in a state other
 * than the XQuery default (the XSL-T tokenizer uses this -- see the
 * comment in the QuotAttributeContent handling of nextToken()).
 * Positions are 0-based, line numbering starts at 1. */
XQueryTokenizer::XQueryTokenizer(const QString &query,
const QUrl &location,
const State startingState) : Tokenizer(location)
, m_data(query)
, m_length(query.length())
, m_state(startingState)
, m_pos(0)
, m_line(1)
, m_columnOffset(0)
, m_scanOnly(false)
{
Q_ASSERT(location.isValid() || location.isEmpty());
}
/* Returns the character at the current position, or a default-constructed
 * (null) QChar when the position is at or beyond the end of the query. */
const QChar XQueryTokenizer::current() const
{
    return m_pos < m_length ? m_data.at(m_pos) : QChar();
}
/* Returns the current character narrowed to Latin-1. Characters outside
 * Latin-1 become 0, and so does end-of-input (current() returns a null
 * QChar there) -- the switch statements throughout this file rely on
 * 0 never matching any of their cases. */
char XQueryTokenizer::peekCurrent() const
{
return current().toAscii();
}
/* Looks ahead, without moving m_pos, for a "::" separated from the
 * current position only by whitespace. Returns the offset from m_pos to
 * the first ':' of the "::", or -1 if anything other than whitespace
 * precedes it (or if no "::" is found). Used to disambiguate
 * "child ::nameTest" (axis, whitespace allowed) from "child :localName"
 * (syntax error). */
int XQueryTokenizer::peekForColonColon() const
{
/* Note, we don't modify m_pos in this function, so we need to do offset
 * calculations. */
int pos = m_pos;
while(pos < m_length)
{
switch(m_data.at(pos).toAscii())
{
/* Fallthrough these four. */
case ' ':
case '\t':
case '\n':
case '\r':
break;
case ':':
{
if(peekAhead((pos - m_pos) + 1) == ':')
return pos - m_pos;
/* Fallthrough. */
}
/* A single ':' not followed by another ':' also lands here. */
default:
return -1;
}
++pos;
}
return -1;
}
/* Convenience helper: advances m_pos by @p advance characters, switches
 * the tokenizer to state @p s, and returns a valueless Token of type
 * @p code. */
Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
const State s,
const int advance)
{
Q_ASSERT(advance >= 0);
m_pos += advance;
setState(s);
return Token(code);
}
/* Overload carrying a token value. Switches to state @p s and returns a
 * Token of type @p code with value @p value. Note: unlike the other
 * overload, this one does not advance m_pos. */
Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
const QString &value,
const State s)
{
setState(s);
return Token(code, value);
}
/* Convenience helper: advances m_pos by @p advance characters (the state
 * is left untouched) and returns a valueless Token of type @p code. */
Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
const int advance)
{
Q_ASSERT(advance >= 0);
m_pos += advance;
return Token(code);
}
/* Performs XML end-of-line normalization on @p input: "\r\n" and lone
 * "\r" both become "\n". Positions listed in @p characterSkips are
 * copied through verbatim -- they were produced by character references
 * (e.g. "&#xD;") and must not be normalized. Returns the normalized
 * string. */
QString XQueryTokenizer::normalizeEOL(const QString &input,
const CharacterSkips &characterSkips)
{
const int len = input.count();
QString result;
/* The likely hood is rather high it'll be the same content. */
result.reserve(len);
for(int i = 0; i < len; ++i)
{
const QChar &at = input.at(i);
if(characterSkips.contains(i))
{
result.append(at);
continue;
}
switch(input.at(i).unicode())
{
case '\r':
{
/* Skip the '\n' of a "\r\n" pair, then fall through so the
 * pair (or a lone '\r') is emitted as a single '\n'. */
if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
++i;
/* Else, fallthrough. */
}
case '\n':
{
result.append(QLatin1Char('\n'));
continue;
}
default:
{
result.append(at);
}
}
}
return result;
}
/* Consumes an XQuery comment body. On entry m_pos is just past the
 * opening "(:"; on successful return it is just past the closing ":)".
 * Handles nested "(: ... :)" comments recursively and keeps the line
 * counter up to date. Returns SUCCESS when the comment is properly
 * terminated, otherwise ERROR or END_OF_FILE. */
Tokenizer::TokenType XQueryTokenizer::consumeComment()
{
/* Below, we return ERROR instead of END_OF_FILE such that the parser
 * sees an invalid comment. */
while(m_pos < m_length)
{
switch(peekCurrent())
{
case ':':
{
++m_pos; /* Consume ':' */
if(atEnd())
return ERROR;
if(peekCurrent() == ')')
{
++m_pos; /* Consume ')' */
return SUCCESS; /* The comment closed nicely. */
}
continue; /* We don't want to increment m_pos twice. */
}
case '(':
{ /* It looks like the start of a comment. */
++m_pos;
if(atEnd())
return END_OF_FILE;
else if(peekCurrent() == ':')
{
/* And it is a nested comment -- parse it. */
const TokenType retval = consumeComment();
if(retval == SUCCESS)
continue; /* Continue with our "own" comment. */
else
return retval; /* Return the error in the nested comment. */
}
break;
}
case '\n':
/* Fallthrough. */
case '\r':
{
/* We want to count \r\n as a single line break. */
if(peekAhead() == '\n')
++m_pos;
m_columnOffset = m_pos;
++m_line;
break;
}
}
++m_pos;
}
return ERROR; /* Error: we reached the end while inside a comment. */
}
/* Advances m_pos past spaces, tabs and line breaks, maintaining the
 * line counter ("\r\n" counts as one break). Unlike consumeWhitespace()
 * this does not treat comments specially. Returns true if end-of-input
 * was reached, false when a non-whitespace character stopped the scan. */
bool XQueryTokenizer::consumeRawWhitespace()
{
    while (m_pos < m_length)
    {
        const char c = peekCurrent();

        if (c == '\n' || c == '\r')
        {
            /* Treat a "\r\n" pair as a single line break. */
            if (peekAhead() == '\n')
                ++m_pos;
            m_columnOffset = m_pos;
            ++m_line;
        }
        else if (c != ' ' && c != '\t')
            return false;

        ++m_pos;
    }

    return true;
}
/* Advances m_pos past whitespace and "(: ... :)" comments, maintaining
 * the line counter. Returns SUCCESS when a non-whitespace,
 * non-comment character is reached, END_OF_FILE when the input is
 * exhausted, or the error code from a malformed comment. */
Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
{
while(m_pos < m_length)
{
switch(peekCurrent())
{
case ' ':
case '\t':
break;
case '\n':
case '\r':
{
/* We want to count \r\n as a single line break. */
if(peekAhead() == '\n')
++m_pos;
m_columnOffset = m_pos;
++m_line;
break;
}
case '(':
{
if(peekAhead() == ':')
{
m_pos += 2; /* Consume "(:" */
const TokenType comment = consumeComment();
if(comment == SUCCESS)
continue;
else
return comment;
}
/* A '(' not starting a comment falls through to the default
 * case and stops the scan. */
}
default:
return SUCCESS;
}
++m_pos;
}
return END_OF_FILE;
}
/* Returns the character @p length positions ahead of m_pos, narrowed to
 * Latin-1, or 0 when that position is outside the query. A negative
 * @p length peeks backwards (used by the StartTag handling). */
char XQueryTokenizer::peekAhead(const int length) const
{
    const int target = m_pos + length;
    return target < m_length ? m_data.at(target).toAscii() : 0;
}
/* Returns a Token of type ERROR; a small readability helper. */
Tokenizer::Token XQueryTokenizer::error()
{
return Token(ERROR);
}
/* Returns true if @p ch is an ASCII decimal digit ('0'-'9'). */
bool XQueryTokenizer::isDigit(const char ch)
{
    return !(ch < '0' || ch > '9');
}
/* Replace with function in QXmlUtils. Write test cases for this. */
bool XQueryTokenizer::isNCNameStart(const QChar ch)
{
if(ch == QLatin1Char('_'))
return true;
switch(ch.category())
{
case QChar::Letter_Lowercase:
case QChar::Letter_Uppercase:
case QChar::Letter_Other:
case QChar::Letter_Titlecase:
case QChar::Number_Letter:
return true;
default:
return false;
}
}
bool XQueryTokenizer::isNCNameBody(const QChar ch)
{
switch(ch.unicode())
{
case '.':
case '_':
case '-':
return true;
}
switch(ch.category())
{
case QChar::Letter_Lowercase:
case QChar::Letter_Uppercase:
case QChar::Letter_Other:
case QChar::Letter_Titlecase:
case QChar::Number_Letter:
case QChar::Mark_SpacingCombining:
case QChar::Mark_Enclosing:
case QChar::Mark_NonSpacing:
case QChar::Letter_Modifier:
case QChar::Number_DecimalDigit:
return true;
default:
return false;
}
}
/* Returns true for keywords that begin a two-keyword phrase (such as
 * "instance of", "castable as", "declare variable"), which makes
 * nextToken() look ahead for the second keyword. */
bool XQueryTokenizer::isPhraseKeyword(const TokenType code)
{
    return code == CASTABLE
           || code == CAST
           || code == COPY_NAMESPACES
           || code == DECLARE
           || code == EMPTY
           || code == MODULE
           || code == IMPORT
           || code == INSTANCE
           || code == ORDER
           || code == ORDERING
           || code == XQUERY
           || code == STABLE
           || code == TREAT;
}
/* Returns true for keywords that act as operators (comparison, range,
 * type operators and the like). nextToken() uses this to decide that an
 * NCName which is such a keyword leaves the tokenizer in state Operator.
 *
 * @param code the token type to classify
 * @return true if @p code is an operator keyword
 *
 * Fix: removed the stray ';' (an empty statement) that followed the
 * switch's closing brace. */
bool XQueryTokenizer::isOperatorKeyword(const TokenType code)
{
    switch(code)
    {
        /* Fallthrough all these. */
        case AS:
        case ASCENDING:
        case AT:
        case CASE:
        case CAST:
        case CASTABLE:
        case EQ:
        case EXTERNAL:
        case GE:
        case G_EQ:
        case G_GT:
        case G_LT:
        case G_NE:
        case GT:
        case IN:
        case INHERIT:
        case INSTANCE:
        case IS:
        case ITEM:
        case LE:
        case LT:
        case NE:
        case NO_INHERIT:
        case NO_PRESERVE:
        case OF:
        case PRESERVE:
        case RETURN:
        case STABLE:
        case TO:
        case TREAT:
            return true;
        default:
            return false;
    }
}
/* Returns true for keywords naming node/item types (as used in kind
 * tests and constructors, e.g. "element", "comment", "text"). */
bool XQueryTokenizer::isTypeToken(const TokenType t)
{
    return t == ATTRIBUTE
           || t == COMMENT
           || t == DOCUMENT
           || t == DOCUMENT_NODE
           || t == ELEMENT
           || t == ITEM
           || t == NODE
           || t == PROCESSING_INSTRUCTION
           || t == SCHEMA_ATTRIBUTE
           || t == SCHEMA_ELEMENT
           || t == TEXT;
}
/* Tokenizes either a single NCName or a prefixed QName
 * ("prefix:local"). Returns an NCNAME token when no colon follows, a
 * QNAME token spanning both parts otherwise, or the error token from
 * the failing NCName scan. */
Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
{
const int start = m_pos;
const Token t1 = tokenizeNCName();
if(t1.hasError())
return t1;
/* The peekAhead() == '=' check keeps ":=" from being mistaken for a
 * QName separator. */
if(peekCurrent() != ':' || peekAhead() == '=')
return t1;
++m_pos;
const Token t2 = tokenizeNCName();
if(t2.hasError())
return t2;
else
return Token(QNAME, m_data.mid(start, m_pos - start));
}
/* Tokenizes a numeric literal starting at m_pos and switches to state
 * Operator. Accepts digits with at most one '.', and an optional
 * exponent part ("e"/"E" with optional sign); an exponent makes the
 * token an XPATH2_NUMBER instead of a NUMBER. A letter immediately
 * adjacent to the digits (an NCName start) is an error. */
Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
{
setState(Operator);
const int startPos = m_pos;
bool hasDot = false;
bool isXPath20 = false;
for(; m_pos < m_length; ++m_pos)
{
QChar ch(current());
char cell = ch.cell();
if(cell == 'e' || cell == 'E')
{
isXPath20 = true;
++m_pos;
ch = current();
/* A non-zero row means the character is outside Latin-1, so it
 * cannot be a sign or digit; stop here. */
if(ch.row() != 0)
break;
cell = ch.cell();
if(cell == '+' || cell == '-')
continue;
}
if(isNCNameStart(ch))
return error();
if(cell < '0' || cell > '9')
{
if(cell == '.' && !hasDot)
hasDot = true;
else
break;
}
}
return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
}
/* Tokenizes a character reference starting at the '&' m_pos is on.
 * Handles the predefined entities (via charForReference()) as well as
 * decimal ("&#NNN;") and hexadecimal ("&#xNNN;") references, including
 * code points above the BMP, which are expanded to a surrogate pair.
 * On return m_pos is positioned on the terminating ';'. Returns a null
 * QString on any syntax error (no ';', unknown entity, bad number). */
QString XQueryTokenizer::tokenizeCharacterReference()
{
Q_ASSERT(peekCurrent() == '&');
const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
return QString();
QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
m_pos = theEnd;
const QChar charRef(charForReference(content));
if(!charRef.isNull())
return charRef;
else if(content.startsWith(QLatin1Char('#')))
{
int base;
/* It is only '#' or '#x'. */
if(content.length() < 2)
return QString();
/* We got a hex number if it starts with 'x', otherwise it's a decimal. */
if(content.at(1) == QLatin1Char('x'))
{
base = 16;
content = content.mid(2); /* Remove "#x". */
}
else
{
base = 10;
content = content.mid(1); /* Remove "#". */
}
bool conversionOK = false;
const int codepoint = content.toInt(&conversionOK, base);
if(conversionOK)
{
const QChar ch(codepoint);
if(ch.isNull())
{
/* We likely have something which require surrogate pairs. */
QString result;
result += QChar(QChar::highSurrogate(codepoint));
result += QChar(QChar::lowSurrogate(codepoint));
return result;
}
else
return ch;
}
else
return QString();
}
else
return QString();
}
/* Advances m_pos to the next occurrence of @p content (leaving it on
 * the occurrence's first character) and returns the number of
 * characters skipped, or -1 if @p content does not occur again. */
int XQueryTokenizer::scanUntil(const char *const content)
{
    const int hit = m_data.indexOf(QString::fromLatin1(content), m_pos);

    if (hit == -1)
        return -1;

    const int skipped = hit - m_pos;
    m_pos = hit;
    return skipped;
}
/* Maps a predefined entity name ("lt", "gt", "amp", "quot", "apos") to
 * its character. Returns a null QChar for any other name. The lookup
 * table is built lazily on first use. */
QChar XQueryTokenizer::charForReference(const QString &reference)
{
if(m_charRefs.isEmpty())
{
/* Initialize. */
m_charRefs.reserve(5);
m_charRefs.insert(QLatin1String("lt"), QLatin1Char('<'));
m_charRefs.insert(QLatin1String("gt"), QLatin1Char('>'));
m_charRefs.insert(QLatin1String("amp"), QLatin1Char('&'));
m_charRefs.insert(QLatin1String("quot"), QLatin1Char('"'));
m_charRefs.insert(QLatin1String("apos"), QLatin1Char('\''));
}
return m_charRefs.value(reference);
}
/* Tokenizes a string literal delimited by the quote character m_pos is
 * currently on (either ' or "). Expands character references, honours
 * the doubled-delimiter escape ("" or ''), and applies end-of-line
 * normalization -- except to text produced by character references,
 * whose positions are recorded in skipEOLNormalization. Returns a
 * STRING_LITERAL token, or ERROR if the literal is unterminated or a
 * reference is malformed. */
Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
{
const QChar delimiter(current());
/* We cannot unfortunately just scan and then do mid(),
 * since we can encounter character references. */
QString result;
/* This is more likely than QString's default allocation. */
result.reserve(8);
CharacterSkips skipEOLNormalization;
/* Advance over the initial quote character. */
++m_pos;
for(; m_pos < m_length; ++m_pos)
{
const QChar c(current());
if(c == QLatin1Char('&'))
{
const QString charRef(tokenizeCharacterReference());
if(charRef.isNull())
return error();
else
{
skipEOLNormalization.insert(result.count());
result.append(charRef);
}
}
else if(c == delimiter)
{
/* Maybe the escaping mechanism is used. For instance, "s""s"
 * has the value `s"s'. */
++m_pos;
if(current() == delimiter) /* Double quote. */
result += delimiter;
else
return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
}
else
result += c;
}
return error();
}
/* Tokenizes an NCName starting at m_pos. Returns an NCNAME token whose
 * value is the scanned name, or ERROR when the current character cannot
 * start an NCName (or the input is exhausted). */
Tokenizer::Token XQueryTokenizer::tokenizeNCName()
{
    const int begin = m_pos;

    if (m_pos >= m_length || !isNCNameStart(current()))
        return error();

    /* Consume the start character, then any number of body characters. */
    do
        ++m_pos;
    while (m_pos < m_length && isNCNameBody(current()));

    return Token(NCNAME, m_data.mid(begin, m_pos - begin));
}
/* Returns true if the @p len characters at m_pos + @p offset equal
 * @p chs (compared as Latin-1). @p len must equal qstrlen(chs).
 *
 * NOTE(review): the bounds check ignores @p offset; the last character
 * read is m_data.at(m_pos + offset + len - 1), so offsets greater than
 * 1 could read past the end. The call sites visible here pass offset 0
 * or the default -- confirm the default is <= 1 in the header. */
bool XQueryTokenizer::aheadEquals(const char *const chs,
const int len,
const int offset) const
{
Q_ASSERT(len > 0);
Q_ASSERT(qstrlen(chs) == uint(len));
if(m_pos + len >= m_length)
return false;
for(int i = offset; i < (len + offset); ++i)
{
if(m_data.at(m_pos + i).toAscii() != chs[i - offset])
return false;
}
return true;
}
/* Looks @p keyword up in the generated keyword table (see
 * qtokenlookup.cpp, included above). Returns the matching TokenMap
 * entry, or 0 when @p keyword is not a keyword. */
const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
{
return TokenLookup::value(keyword.toAscii().constData(), keyword.length());
}
/* Returns the tokenizer's current state. */
XQueryTokenizer::State XQueryTokenizer::state() const
{
return m_state;
}
/* Sets the tokenizer's current state to @p s. */
void XQueryTokenizer::setState(const State s)
{
m_state = s;
}
/* Pushes @p s onto the state stack, to be restored by popState(). The
 * current state is left unchanged. */
void XQueryTokenizer::pushState(const State s)
{
m_stateStack.push(s);
}
/* Pushes the current state onto the state stack, to be restored by
 * popState(). */
void XQueryTokenizer::pushState()
{
m_stateStack.push(m_state);
}
/* Restores the most recently pushed state, if any. An empty stack is
 * tolerated silently (see the comment below). */
void XQueryTokenizer::popState()
{
/* QStack::pop() asserts if it's empty, so we need to check
 * it, since we might receive unbalanced curlies. */
if(!m_stateStack.isEmpty())
m_state = m_stateStack.pop();
}
Tokenizer::Token XQueryTokenizer::nextToken()
{
switch(state())
{
/* We want to skip or do special whitespace handling for these
* states. So fallthrough all of the following. */
case AposAttributeContent:
case Axis:
case ElementContent:
case EndTag:
case Pragma:
case PragmaContent:
case ProcessingInstructionName:
case QuotAttributeContent:
case StartTag:
case XMLComment:
break;
default:
handleWhitespace();
}
switch(state())
{
case XMLSpaceDecl:
/* Fallthrough. */
case NamespaceKeyword:
{
switch(peekCurrent())
{
case ',':
return tokenAndAdvance(COMMA);
case '"':
/* Fallthrough. */
case '\'':
{
setState(NamespaceDecl);
return tokenizeStringLiteral();
}
}
const Token id(tokenizeNCName());
if(id.type != NCNAME)
return id;
const TokenMap *const keyword = lookupKeyword(id.value);
if(keyword)
{
switch(keyword->token)
{
case INHERIT:
/* Fallthrough. */
case NO_INHERIT:
{
setState(Default);
break;
}
case NAMESPACE:
{
setState(NamespaceDecl);
break;
}
case ORDERED:
/* Fallthrough. */
case UNORDERED:
/* Fallthrough. */
case STRIP:
{
setState(Default);
break;
}
case PRESERVE:
{
if(state() != NamespaceKeyword)
setState(Default);
}
default:
break;
}
return Token(keyword->token);
}
else
return id;
Q_ASSERT(false);
}
case NamespaceDecl:
{
switch(peekCurrent())
{
case '=':
return tokenAndAdvance(G_EQ);
case ';':
return tokenAndChangeState(SEMI_COLON, Default);
case '\'':
/* Fallthrough. */
case '\"':
return tokenizeStringLiteral();
}
const Token nc(tokenizeNCName());
handleWhitespace();
const char pc = peekCurrent();
const TokenMap* const t = lookupKeyword(nc.value);
if(pc == '\'' || (pc == '"' && t))
return tokenAndChangeState(t->token, Default, 0);
else
return nc;
Q_ASSERT(false);
}
case Axis:
{
if(peekCurrent() == ':')
{
Q_ASSERT(peekAhead() == ':');
m_pos += 2;
setState(AfterAxisSeparator);
return Token(COLONCOLON);
}
/* Fallthrough. */
}
case AfterAxisSeparator:
/* Fallthrough. */
case Default:
/* State Operator and state Default have a lot of tokens in common except
* for minor differences. So we treat them the same way, and sprinkles logic
* here and there to handle the small differences. */
/* Fallthrough. */
case Operator:
{
switch(peekCurrent())
{
case '=':
return tokenAndChangeState(G_EQ, Default);
case '-':
return tokenAndChangeState(MINUS, Default);
case '+':
return tokenAndChangeState(PLUS, Default);
case '[':
return tokenAndChangeState(LBRACKET, Default);
case ']':
return tokenAndChangeState(RBRACKET, Operator);
case ',':
return tokenAndChangeState(COMMA, Default);
case ';':
return tokenAndChangeState(SEMI_COLON, Default);
case '$':
return tokenAndChangeState(DOLLAR, VarName);
case '|':
return tokenAndChangeState(BAR, Default);
case '?':
return tokenAndChangeState(QUESTION, Operator);
case ')':
return tokenAndChangeState(RPAREN, Operator);
case '@':
return tokenAndChangeState(AT_SIGN, Default);
/* Fallthrough all these. */
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '0':
return tokenizeNumberLiteral();
case '.':
{
const char next = peekAhead();
if(next == '.')
return tokenAndChangeState(DOTDOT, Operator, 2);
/* .5 is allowed, as short form for 0.5:
* <tt>[142] DecimalLiteral ::= ("." Digits) | (Digits "." [0-9]*)</tt>
*/
else if(isDigit(next))
return tokenizeNumberLiteral();
else
return tokenAndChangeState(DOT, Operator);
}
case '\'':
/* Fallthrough. */
case '"':
{
setState(Operator);
return tokenizeStringLiteral();
}
case '(':
{
if(peekAhead() == '#')
return tokenAndChangeState(PRAGMA_START, Pragma, 2);
else
return tokenAndChangeState(LPAREN, Default);
}
case '*':
{
if(peekAhead() == ':')
{
m_pos += 2; /* Consume *:. */
const Token nc = tokenizeNCName();
if(nc.hasError())
return error();
else
return tokenAndChangeState(ANY_PREFIX, nc.value, Operator);
}
else
return tokenAndChangeState(STAR, state() == Default ? Operator : Default);
}
case ':':
{
switch(peekAhead())
{
case '=':
return tokenAndChangeState(ASSIGN, Default, 2);
case ':':
return tokenAndChangeState(COLONCOLON, Default, 2);
default:
return error();
}
}
case '!':
{
if(peekAhead() == '=')
return tokenAndChangeState(G_NE, Default, 2);
else
return error();
}
case '<':
{
switch(peekAhead())
{
case '=':
return tokenAndChangeState(G_LE, Default, 2);
case '<':
return tokenAndChangeState(PRECEDES, Default, 2);
case '?':
{
pushState(Operator);
return tokenAndChangeState(PI_START, ProcessingInstructionName, 2);
}
case '!':
{
if(aheadEquals("!--", 3))
{
m_pos += 3; /* Consume "!--". */
pushState(Operator);
return tokenAndChangeState(COMMENT_START, XMLComment);
}
/* Fallthrough. It's a syntax error, and this is a good way to report it. */
}
default:
{
if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
{
/* We assume it's an element constructor. */
pushState(Operator);
}
return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag);
}
}
}
case '>':
{
switch(peekAhead())
{
case '=':
return tokenAndChangeState(G_GE, Default, 2);
case '>':
return tokenAndChangeState(FOLLOWS, Default, 2);
default:
return tokenAndChangeState(G_GT, Default);
}
}
case '/':
{
if(peekAhead() == '/')
return tokenAndChangeState(SLASHSLASH, Default, 2);
else
return tokenAndChangeState(SLASH, Default);
}
case '{':
{
pushState(Operator);
return tokenAndChangeState(CURLY_LBRACE, Default);
}
case '}':
{
popState();
return tokenAndAdvance(CURLY_RBRACE);
}
}
/* Ok. We're in state Default or Operator, and it wasn't a simple
* character. */
const Token id(tokenizeNCName());
if(id.type != NCNAME)
return id;
const TokenMap *const keyword = lookupKeyword(id.value);
if(state() == Operator)
{
if(keyword)
{
if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING)
setState(Operator);
else if(keyword->token == RETURN)
setState(Default);
else if(isPhraseKeyword(keyword->token))
{
const TokenType ws = consumeWhitespace();
if(ws == ERROR)
return error();
const Token id2(tokenizeNCName());
const TokenMap *const keyword2 = lookupKeyword(id2.value);
if(keyword2)
{
if(keyword->token == TREAT && keyword2->token == AS)
setState(ItemType);
else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY)
setState(Default);
m_tokenStack.push(Token(keyword2->token));
}
else
m_tokenStack.push(id2);
return Token(keyword->token);
}
else
{
/* Such that we tokenize the second token in "empty greatest". */
if(keyword->token != EMPTY)
setState(Default);
}
if(keyword->token == AS || keyword->token == CASE)
setState(ItemType);
return Token(keyword->token);
}
else
return id;
}
Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator);
/*
* This is hard. Consider this:
*
* Valid: child ::nameTest
* Valid: child:: nameTest
* Syntax Error: child :localName
* Syntax Error: child: localName
*
* Consider "child ::name". Right now, we're here:
* ^
* We don't know whether "child" is a prefix and hence the whitespace is invalid,
* or whether it's an axis and hence skippable. */
{
const int wsLength = peekForColonColon();
/* We cannot call handleWhitespace() because it returns on
* END_OF_FILE, and we have parsed up keyword, and we need to
* deal with that.
*
* If we have a colon colon, which means the whitespace is
* allowed, we skip it. */
if(wsLength != -1)
m_pos += wsLength;
}
/* Handle name tests. */
if(peekCurrent() == ':')
{
switch(peekAhead())
{
case '=':
return id;
case '*':
{
m_pos += 2;
return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
}
case ':':
{
/* We have an axis. */
setState(Axis);
return keyword ? Token(keyword->token) : id;
}
default:
{
/* It's a QName. */
++m_pos; /* Consume the colon. */
const Token id2(tokenizeNCName());
if(id2.type != NCNAME)
{
--m_pos;
return id;
}
setState(Operator);
const int qNameLen = id.value.length() + id2.value.length() + 1;
return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
}
}
}
if(!keyword || isOperatorKeyword(keyword->token))
{
setState(Operator);
return id;
}
const TokenType ws = consumeWhitespace();
if(ws == ERROR) // TODO this should test for success. Write test.
return Token(ERROR);
if(atEnd())
{
setState(Operator);
return id;
}
/* Let the if-body apply for constructors, and node type tests. */
if(isTypeToken(keyword->token) ||
keyword->token == TYPESWITCH ||
keyword->token == ORDERED ||
keyword->token == UNORDERED ||
keyword->token == IF)
{
switch(peekCurrent())
{
case '(':
{
// TODO See if we can remove DOCUMENT from isTypeToken.
if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
{
m_tokenStack.push(Token(LPAREN));
++m_pos; /* Consume '('. */
pushState(Operator);
if(keyword->token == PROCESSING_INSTRUCTION)
setState(KindTestForPI);
else
setState(KindTest);
return Token(keyword->token);
}
else if(keyword->token == TYPESWITCH || keyword->token == IF)
return Token(keyword->token);
else /* It's a function call. */
return id;
}
case '{':
{
m_tokenStack.push(Token(CURLY_LBRACE));
++m_pos; /* Consume '{'. */
pushState(Operator);
/* Stay in state Default. */
return Token(keyword->token);
}
default:
{
/* We have read in a token which is for instance
* "return", and now it can be an element
* test("element") a node kind test("element()"), or a
* computed element constructor("element name {...").
* We need to do a two-token lookahead here, because
* "element return" can be an element test followed by
* the return keyword, but it can also be an element
* constructor("element return {"). */
if(isNCNameStart(current()))
{
const int currentPos = m_pos;
const Token token2 = tokenizeNCNameOrQName();
if(token2.hasError())
return token2;
handleWhitespace();
if(peekCurrent() == '{')
{
/* An element constructor. */
m_tokenStack.push(token2);
return Token(keyword->token);
}
/* We jump back in the stream, we need to tokenize token2 according
* to the state. */
m_pos = currentPos;
setState(Operator);
return Token(NCNAME, QLatin1String(keyword->name));
}
}
}
}
if(peekCurrent() == '$')
{
setState(VarName);
return Token(keyword->token);
}
/* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
if(peekCurrent() == '(')
return id;
else if(peekCurrent() == '{' && keyword->token == VALIDATE)
return Token(keyword->token);
if(!isNCNameStart(current()))
{
setState(Operator);
return id;
}
const Token id2(tokenizeNCName());
const TokenMap *const keyword2 = lookupKeyword(id2.value);
if(!keyword2)
{
/* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */
setState(Operator);
return id;
}
switch(keyword->token)
{
case DECLARE:
{
switch(keyword2->token)
{
case VARIABLE:
/* Fallthrough. */
case FUNCTION:
{
m_tokenStack.push(Token(keyword2->token));
setState(Default);
return Token(keyword->token);
}
case OPTION:
{
m_tokenStack.push(Token(keyword2->token));
setState(Default);
return Token(keyword->token);
}
case COPY_NAMESPACES:
/* Fallthrough. */
case ORDERING:
{
m_tokenStack.push(Token(keyword2->token));
setState(NamespaceKeyword);
return Token(keyword->token);
}
case CONSTRUCTION:
{
// TODO identical to CONSTRUCTION?
m_tokenStack.push(Token(keyword2->token));
setState(Operator);
return Token(keyword->token);
}
case NAMESPACE:
/* Fallthrough. */
case BASEURI:
{
m_tokenStack.push(Token(keyword2->token));
setState(NamespaceDecl);
return Token(keyword->token);
}
case BOUNDARY_SPACE:
{
m_tokenStack.push(Token(keyword2->token));
setState(XMLSpaceDecl);
return Token(keyword->token);
}
case DEFAULT:
{
m_tokenStack.push(Token(keyword2->token));
const TokenType ws2 = consumeWhitespace();
if(ws2 != SUCCESS)
{
m_tokenStack.prepend(Token(ws2));
return Token(keyword->token);
}
const Token id3(tokenizeNCName());
if(id3.type != NCNAME)
{
m_tokenStack.prepend(id3);
return Token(keyword->token);
}
const TokenMap *const keyword3 = lookupKeyword(id3.value);
if(!keyword3)
{
m_tokenStack.prepend(id3);
return Token(keyword->token);
}
else
{
m_tokenStack.prepend(Token(keyword3->token));
if(keyword3->token == ORDER)
setState(Operator);
else
setState(NamespaceDecl);
}
return Token(keyword->token);
}
default:
{
m_tokenStack.push(Token(keyword2->token));
setState(Default);
return id;
}
}
}
case XQUERY:
{
m_tokenStack.push(Token(keyword2->token));
if(keyword2->token == VERSION)
{
setState(NamespaceDecl);
return Token(keyword->token);
}
else
{
setState(Operator);
return id;
}
}
case IMPORT:
{
m_tokenStack.push(Token(keyword2->token));
switch(keyword2->token)
{
case SCHEMA:
/* Fallthrough. */
case MODULE:
{
setState(NamespaceKeyword);
return Token(keyword->token);
}
default:
{
setState(Operator);
return id;
}
}
}
case VALIDATE:
{
m_tokenStack.push(Token(keyword2->token));
switch(keyword2->token)
{
case LAX:
case STRICT:
{
pushState(Operator);
return Token(keyword->token);
}
default:
{
setState(Operator);
return id;
}
}
}
default:
{
m_tokenStack.push(Token(keyword2->token));
setState(Operator);
return id;
}
}
Q_ASSERT(false);
}
case VarName:
{
if(peekCurrent() == '$')
return tokenAndAdvance(DOLLAR);
setState(Operator);
return tokenizeNCNameOrQName();
Q_ASSERT(false);
}
case ItemType:
{
switch(peekCurrent())
{
case '(':
return tokenAndChangeState(LPAREN, KindTest);
case '$':
return tokenAndChangeState(DOLLAR, VarName);
}
const Token name(tokenizeNCNameOrQName());
if(name.hasError())
return error();
else if(name.type == QNAME)
{
setState(OccurrenceIndicator);
return name;
}
else
{
const TokenMap *const keyword = lookupKeyword(name.value);
if(keyword)
{
pushState(OccurrenceIndicator);
return Token(keyword->token);
}
else
{
setState(Default);
return name;
}
}
Q_ASSERT(false);
}
case KindTest:
{
switch(peekCurrent())
{
case ')':
{
popState();
return tokenAndAdvance(RPAREN);
}
case '(':
return tokenAndAdvance(LPAREN);
case ',':
return tokenAndAdvance(COMMA);
case '*':
return tokenAndAdvance(STAR);
case '?':
return tokenAndAdvance(QUESTION);
case '\'':
/* Fallthrough. */
case '"':
return tokenizeStringLiteral();
}
const Token nc(tokenizeNCNameOrQName());
if(nc.hasError())
return nc;
const TokenType ws = consumeWhitespace();
if(ws == ERROR)
return error();
if(peekCurrent() == '(')
{
const TokenMap *const keyword = lookupKeyword(nc.value);
if(keyword)
{
pushState(KindTest);
return Token(keyword->token);
}
else
return nc;
}
else
return nc;
Q_ASSERT(false);
}
case KindTestForPI:
{
switch(peekCurrent())
{
case ')':
{
popState();
return tokenAndAdvance(RPAREN);
}
case '\'':
/* Fallthrough. */
case '"':
return tokenizeStringLiteral();
default:
return tokenizeNCName();
}
Q_ASSERT(false);
}
case OccurrenceIndicator:
{
switch(peekCurrent())
{
case '?':
return tokenAndChangeState(QUESTION, Operator);
case '*':
return tokenAndChangeState(STAR, Operator);
case '+':
return tokenAndChangeState(PLUS, Operator);
default:
{
setState(Operator);
return nextToken();
}
}
Q_ASSERT(false);
}
case XQueryVersion:
{
switch(peekCurrent())
{
case '\'':
/* Fallthrough. */
case '"':
return tokenizeStringLiteral();
case ';':
return tokenAndChangeState(SEMI_COLON, Default);
}
const Token id(tokenizeNCName());
if(id.type != NCNAME)
return id;
const TokenMap *const keyword = lookupKeyword(id.value);
if(keyword)
return tokenAndChangeState(keyword->token, Default);
else
return id;
Q_ASSERT(false);
}
case StartTag:
{
if(peekAhead(-1) == '<')
{
if(current().isSpace())
return Token(ERROR);
}
else
{
if(consumeRawWhitespace())
return Token(END_OF_FILE);
}
switch(peekCurrent())
{
case '/':
{
if(peekAhead() == '>')
{
m_pos += 2;
if(m_scanOnly)
return Token(POSITION_SET);
else
{
popState();
return Token(QUICK_TAG_END);
}
}
else
return error();
}
case '>':
{
if(m_scanOnly)
return tokenAndChangeState(POSITION_SET, StartTag);
else
return tokenAndChangeState(G_GT, ElementContent);
}
case '=':
return tokenAndAdvance(G_EQ);
case '\'':
return tokenAndChangeState(APOS, AposAttributeContent);
case '"':
return tokenAndChangeState(QUOTE, QuotAttributeContent);
default:
return tokenizeNCNameOrQName();
}
Q_ASSERT(false);
}
case AposAttributeContent:
/* Fallthrough. */
case QuotAttributeContent:
{
const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
QString result;
result.reserve(20);
if(m_scanOnly)
{
int stack = 0;
return attributeAsRaw(sep, stack, m_pos, true, result);
}
Q_ASSERT(!m_scanOnly);
while(true)
{
if(atEnd())
{
/* In the case that the XSL-T tokenizer invokes us with
* default state QuotAttributeContent, we need to be able
* to return a single string, in case that is all we have
* accumulated. */
if(result.isEmpty())
return Token(END_OF_FILE);
else
return Token(STRING_LITERAL, result);
}
const QChar curr(current());
if(curr == sep)
{
if(m_pos + 1 == m_length)
return Token(END_OF_FILE);
if(m_data.at(m_pos + 1) == sep)
{
/* The quoting mechanism was used. */
m_pos += 2;
result.append(sep);
continue;
}
const QChar next(m_data.at(m_pos + 1));
if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
return Token(ERROR); // i18n Space must separate attributes
else if(result.isEmpty())
{
return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE,
StartTag, 1);
}
else
{
/* Don't consume the sep, but leave it so we next time return a token for it. */
return Token(STRING_LITERAL, result);
}
++m_pos;
continue;
}
else if(curr == QLatin1Char('{'))
{
if(m_pos + 1 == m_length)
return Token(END_OF_FILE);
else if(peekAhead() == '{')
{
++m_pos;
result.append(QLatin1Char('{'));
}
else
{
if(result.isEmpty())
{
/* The Attribute Value Template appeared directly in the attribute. */
pushState();
return tokenAndChangeState(CURLY_LBRACE, Default);
}
else
{
/* We don't advance, keep '{' as next token. */
return Token(STRING_LITERAL, result);
}
}
}
else if(curr == QLatin1Char('}'))
{
if(m_pos + 1 == m_length)
return Token(END_OF_FILE);
else if(peekAhead() == '}')
{
++m_pos;
result.append(QLatin1Char('}'));
}
else
return Token(ERROR);
}
else if(curr == QLatin1Char('&'))
{
const QString ret(tokenizeCharacterReference());
if(ret.isNull())
return Token(ERROR);
else
result.append(ret);
}
else if(curr == QLatin1Char('<'))
return Token(STRING_LITERAL, result);
else
{
/* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
* 3.3.3 Attribute-Value Normalization.
*
* However, it is complicated a bit by that AVN is defined on top of
* EOL normalization and we do those two in one go here. */
switch(curr.unicode())
{
case 0xD:
{
if(peekAhead() == '\n')
{
result.append(QLatin1Char(' '));
++m_pos;
break;
}
}
case 0xA:
/* Fallthrough. */
case 0x9:
{
result.append(QLatin1Char(' '));
break;
}
default:
result.append(curr);
}
}
++m_pos;
}
Q_ASSERT(false);
}
case ElementContent:
{
QString result;
result.reserve(20);
/* Whether the text node, result, may be whitespace only. Character references
* and CDATA sections disables that. */
bool mayBeWS = true;
CharacterSkips skipEOLNormalization;
while(true)
{
if(atEnd())
return Token(END_OF_FILE);
switch(peekCurrent())
{
case '<':
{
if(!result.isEmpty() && peekAhead(2) != '[')
{
/* We encountered the end, and it was not a CDATA section. */
/* We don't advance. Next time we'll handle the <... stuff. */
return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
}
++m_pos;
if(atEnd())
return Token(END_OF_FILE);
const QChar ahead(current());
if(ahead.isSpace())
return error();
else if(ahead == QLatin1Char('/'))
{
if(m_pos + 1 == m_length)
return Token(END_OF_FILE);
else if(m_data.at(m_pos + 1).isSpace())
return error();
else
return tokenAndChangeState(BEGIN_END_TAG, EndTag);
}
else if(isNCNameStart(ahead))
{
pushState();
return tokenAndChangeState(G_LT, StartTag, 0);
}
else if(aheadEquals("!--", 3, 0))
{
pushState();
m_pos += 3;
return tokenAndChangeState(COMMENT_START, XMLComment, 0);
}
else if(aheadEquals("![CDATA[", 8, 0))
{
mayBeWS = false;
m_pos += 8;
const int start = m_pos;
const int len = scanUntil("]]>");
if(len == -1)
return Token(END_OF_FILE);
m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
result.append(m_data.mid(start, len));
break;
}
else if(ahead == QLatin1Char('?'))
{
pushState();
return tokenAndChangeState(PI_START, ProcessingInstructionName);
}
else
return Token(G_LT);
}
case '&':
{
const QString ret(tokenizeCharacterReference());
if(ret.isNull())
return Token(ERROR);
else
{
skipEOLNormalization.insert(result.count());
result.append(ret);
mayBeWS = false;
break;
}
}
case '{':
{
// TODO remove this check, also below.
if(m_pos + 1 == m_length)
return Token(END_OF_FILE);
else if(peekAhead() == '{')
{
++m_pos;
result.append(QLatin1Char('{'));
}
else
{
if(result.isEmpty())
{
pushState();
return tokenAndChangeState(CURLY_LBRACE, Default);
}
else
{
/* We don't advance here. */
return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
}
}
break;
}
case '}':
{
if(m_pos + 1 == m_length)
return Token(END_OF_FILE);
else if(peekAhead() == '}')
{
++m_pos;
result.append(QLatin1Char('}'));
}
else
{
/* This is a parse error, and the grammar won't be able
* to reduce this CURLY_RBRACE. */
return tokenAndChangeState(CURLY_RBRACE, Default);
}
break;
}
case '\n':
{
/* We want to translate \r\n into \n. */
if(peekAhead(-1) == '\r')
break;
/* else, fallthrough. */
}
case '\r':
{
result.append(QLatin1Char('\n'));
break;
}
default:
{
result.append(current());
break;
}
}
++m_pos;
}
Q_ASSERT(false);
}
case ProcessingInstructionName:
{
const int start = m_pos;
while(true)
{
++m_pos;
if(m_pos >= m_length)
return Token(END_OF_FILE);
const QChar next(current());
if(next.isSpace() || next == QLatin1Char('?'))
{
return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
ProcessingInstructionContent);
}
}
Q_ASSERT(false);
}
case ProcessingInstructionContent:
{
/* Consume whitespace between the name and the content. */
if(consumeRawWhitespace())
return Token(END_OF_FILE);
const int start = m_pos;
const int len = scanUntil("?>");
if(len == -1)
return Token(END_OF_FILE);
else
{
m_pos += 2; /* Consume "?>" */
popState();
return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
}
Q_ASSERT(false);
}
case EndTag:
{
if(consumeRawWhitespace())
return END_OF_FILE;
if(peekCurrent() == '>')
{
popState();
return tokenAndAdvance(G_GT);
}
else
return tokenizeNCNameOrQName();
Q_ASSERT(false);
}
case XMLComment:
{
const int start = m_pos;
const int len = scanUntil("--");
if(len == -1)
return END_OF_FILE;
else
{
m_pos += 2; /* Consume "--". */
popState();
if(peekCurrent() == '>')
{
++m_pos;
return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
}
else
return error();
}
Q_ASSERT(false);
}
case Pragma:
{
/* Consume whitespace. */
if(consumeRawWhitespace())
return Token(END_OF_FILE);
setState(PragmaContent);
return tokenizeNCNameOrQName();
}
case PragmaContent:
{
QString result;
result.reserve(20);
const bool hasWS = m_pos < m_length && current().isSpace();
/* Consume all whitespace up to the pragma content(if any). */
if(consumeRawWhitespace())
return Token(END_OF_FILE);
if(peekCurrent() == '#' && peekAhead() == ')')
{
/* We reached the end, and there's no pragma content. */
return tokenAndChangeState(PRAGMA_END, Default, 2);
}
else if(!hasWS)
{
/* A separating space is required if there's pragma content. */
return error(); /* i18n */
}
const int start = m_pos;
const int len = scanUntil("#)");
if(len == -1)
return Token(END_OF_FILE);
return Token(STRING_LITERAL, m_data.mid(start, len));
Q_ASSERT(false);
}
}
Q_ASSERT(false);
return error();
}
/* Scans an attribute value in "raw" form, appending the characters to
 * @p result instead of producing ordinary tokens for them. Enclosed
 * expressions ("{...}") inside the value are handled by recursing into
 * this function.
 *
 * @param sep        The delimiter that opened the attribute value,
 *                   either '"' or '\''.
 * @param sepStack   Nesting depth of enclosed expressions. It is a
 *                   reference shared across the recursive calls so the
 *                   levels stay balanced.
 * @param startPos   Position in m_data where the attribute value began;
 *                   used to detect an empty value.
 * @param aInLiteral Whether scanning starts inside a string literal
 *                   within an enclosed expression.
 * @param result     Accumulates the raw attribute content.
 *
 * Returns SUCCESS when a recursive call has consumed a balanced
 * enclosed expression, QUOTE/APOS for an empty value, STRING_LITERAL
 * carrying the collected content at the top level, or
 * END_OF_FILE/ERROR on premature end or invalid input. */
Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
                                                 int &sepStack,
                                                 const int startPos,
                                                 const bool aInLiteral,
                                                 QString &result)
{
    bool inLiteral = aInLiteral;
    /* The quote character that is NOT the attribute delimiter; it can
     * delimit literals inside enclosed expressions. */
    const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');

    while(true)
    {
        if(atEnd())
            return END_OF_FILE;

        if(peekCurrent() == sep.unicode())
        {
            /* Seeing the delimiter enters or leaves a literal. */
            if(inLiteral)
                inLiteral = false;
            else
                inLiteral = true;

            if(peekAhead() == sep.unicode())
            {
                /* The quoting mechanism was used: a doubled delimiter
                 * stands for one literal occurrence. */
                result.append(current());
                m_pos += 2;
                continue;
            }
            else
            {
                /* Don't consume the separator, such that we
                 * return a token for it next time. */
                if(m_pos == startPos)
                {
                    /* The value was empty ("" or ''). */
                    ++m_pos;
                    setState(StartTag);
                    return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
                }

                if(sepStack == 0)
                {
                    /* Top level: the delimiter terminates the value. */
                    return Token(STRING_LITERAL, result);
                }
                else
                {
                    /* Inside an enclosed expression the delimiter is
                     * ordinary content; keep scanning. */
                    result.append(current());
                    ++m_pos;
                    continue;
                }
            }
        }
        else if(peekCurrent() == '&')
        {
            /* Expand the character reference and append its
             * replacement text. */
            const QString ret(tokenizeCharacterReference());
            if(ret.isNull())
                return Token(ERROR);
            else
            {
                result.append(ret);
                ++m_pos;
                continue;
            }
        }
        else if(peekCurrent() == otherSep)
        {
            /* The non-delimiting quote: copied through verbatim. A
             * doubled occurrence is consumed as one unit, and the
             * literal state toggles. */
            result.append(current());
            ++m_pos;

            if(peekCurrent() == otherSep)
                ++m_pos;

            if(inLiteral)
                inLiteral = false;
            else
                inLiteral = true;

            continue;
        }
        else if(peekCurrent() == '{')
        {
            result.append(current());

            if(peekAhead() == '{')
            {
                /* "{{" escapes a literal '{'. */
                m_pos += 2;
                continue;
            }
            else
            {
                /* Start of an enclosed expression: recurse to consume
                 * its raw content up to the matching '}'. */
                ++m_pos;
                ++sepStack;
                const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
                if(t.type != SUCCESS)
                    return t;
            }
        }
        else if(peekCurrent() == '}')
        {
            if(inLiteral && peekAhead() == '}')
            {
                /* "}}" escapes a literal '}'. */
                result.append(current());
                m_pos += 2;
                continue;
            }
            else
            {
                /* End of an enclosed expression: unwind one recursion
                 * level. */
                ++m_pos;
                --sepStack;
                return Token(SUCCESS); /* The return value is arbitrary. */
            }
        }
        else
        {
            /* Ordinary content character. */
            result.append(current());
            ++m_pos;
        }
    }
}
/* Produces the next token and records its starting position in
 * @p sourceLocator. Tokens previously pushed onto m_tokenStack are
 * served before any new tokenization happens; handing out certain
 * keyword tokens switches the tokenizer into a dedicated state for
 * what must follow them. */
Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator)
{
    sourceLocator->first_line = m_line;
    /* m_pos is 0-based while columns are 1-based, hence the +1. */
    sourceLocator->first_column = m_pos - m_columnOffset + 1;

    if(m_tokenStack.isEmpty())
        return nextToken();

    const Token queued(m_tokenStack.pop());
    const TokenType t = queued.type;

    /* Keyword-driven state transitions. */
    if(t == MODULE || t == SCHEMA || t == COPY_NAMESPACES)
        setState(NamespaceKeyword);
    else if(t == VERSION)
        setState(XQueryVersion);
    else if(t == AS || t == OF)
        setState(ItemType);
    else if(isOperatorKeyword(t))
        setState(Default);

    return queued;
}
/* Puts the tokenizer into scan-only mode and returns the current
 * position, so the caller can later rewind with
 * resumeTokenizationFrom(). */
int XQueryTokenizer::commenceScanOnly()
{
    const int startPos = m_pos;
    m_scanOnly = true;
    return startPos;
}
/* Leaves scan-only mode and restarts tokenization at @p pos, such as
 * a position previously returned by commenceScanOnly(). */
void XQueryTokenizer::resumeTokenizationFrom(const int pos)
{
    m_pos = pos;
    m_scanOnly = false;
}
/* Intentional no-op: this tokenizer makes no use of the parser
 * context, so the parameter is left unnamed. Presumably implemented
 * only to satisfy the Tokenizer interface — confirm against the base
 * class declaration. */
void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
{
}
#undef handleWhitespace
} // namespace QPatternist
QT_END_NAMESPACE