blob: 9308c3e6cadb42e80edced1226f97f214dc3f1d4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Id: RegxParser.cpp 568078 2007-08-21 11:43:25Z amassari $
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/regx/RegxParser.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/ParseException.hpp>
#include <xercesc/util/regx/RegularExpression.hpp>
#include <xercesc/util/regx/RegxUtil.hpp>
#include <xercesc/util/regx/RegxDefs.hpp>
#include <xercesc/util/regx/TokenInc.hpp>
#include <xercesc/framework/XMLErrorCodes.hpp>
XERCES_CPP_NAMESPACE_BEGIN
// ---------------------------------------------------------------------------
// Static member data initialization
// ---------------------------------------------------------------------------
// Parse-context values. The constructor initializes fParseContext to
// S_NORMAL, and processNext() switches to a reduced token set when
// fParseContext is S_INBRACKETS.
// Context outside of any bracket expression.
const unsigned short RegxParser::S_NORMAL = 0;
// Context while tokenizing the inside of a '[...]' character class.
const unsigned short RegxParser::S_INBRACKETS = 1;
// Not referenced anywhere in this file.
// NOTE(review): presumably reserved for an extended bracket syntax - confirm.
const unsigned short RegxParser::S_INXBRACKETS = 2;
// ---------------------------------------------------------------------------
// RegxParser::ReferencePosition: Constructors and Destructor
// ---------------------------------------------------------------------------
// Stores a back-reference number together with the pattern offset that
// was current when the reference was recorded.
RegxParser::ReferencePosition::ReferencePosition(const int refNo,
                                                 const int position)
    : fReferenceNo(refNo),
      fPosition(position)
{
    // Nothing else to set up.
}
// ---------------------------------------------------------------------------
// RegxParser: Constructors and Destructors
// ---------------------------------------------------------------------------
// Builds an idle parser: no pattern string, no token factory, no recorded
// references. Group numbering starts at 1 and the parse context starts as
// S_NORMAL. All allocations made later go through 'manager'.
RegxParser::RegxParser(MemoryManager* const manager)
    : fMemoryManager(manager)
    , fHasBackReferences(false)
    , fOptions(0)
    , fOffset(0)
    , fNoGroups(1)
    , fParseContext(S_NORMAL)
    , fStringLen(0)
    , fState(0)
    , fCharData(0)
    , fString(0)
    , fReferences(0)
    , fTokenFactory(0)
{
    // All state is established by the initializer list.
}
// Releases the private copy of the pattern string (obtained from the
// memory manager) and the vector of recorded reference positions.
RegxParser::~RegxParser()
{
    // The pattern copy came from fMemoryManager, so it goes back there.
    fMemoryManager->deallocate(fString);

    delete fReferences;
}
// ---------------------------------------------------------------------------
// RegxParser: Parsing methods
// ---------------------------------------------------------------------------
// Parses a regular expression into a token tree.
//
// regxStr  the pattern text; the parser keeps its own copy in fString
//          (with extended comments stripped when EXTENDED_COMMENT is set).
// options  option bits stored in fOptions and consulted through isSet().
//
// Returns the root token, or 0 when no token factory has been set.
// Throws ParseException (Parser_Parse1) when the pattern is not consumed
// completely, and (Parser_Parse2) when a recorded reference names a group
// number that does not exist in the pattern.
Token* RegxParser::parse(const XMLCh* const regxStr, const int options) {

    // if TokenFactory is not set do nothing.
    // REVISIT - should we throw an exception
    if (fTokenFactory == 0) {
        return 0;
    }

    fOptions = options;
    fOffset = 0;
    fNoGroups = 1;
    fHasBackReferences = false;
    setParseContext(S_NORMAL);

    // A previous parse() that threw never reached the cleanup at the end
    // of this function; drop its reference positions so they are not
    // validated against this pattern's group count.
    if (fReferences != 0) {
        fReferences->removeAllElements();
    }

    // Build the new working copy first and release the old one only once
    // the new one exists. This allocates exactly once, and fString never
    // points at freed memory if the allocation throws.
    XMLCh* const newString = isSet(RegularExpression::EXTENDED_COMMENT)
        ? RegxUtil::stripExtendedComment(regxStr, fMemoryManager)
        : XMLString::replicate(regxStr, fMemoryManager);

    if (fString)
        fMemoryManager->deallocate(fString);//delete [] fString;
    fString = newString;
    fStringLen = XMLString::stringLen(fString);

    // Prime the tokenizer, then parse the whole expression.
    processNext();
    Token* retTok = parseRegx();

    // Anything left unread means the expression was malformed at fOffset.
    if (fOffset != fStringLen) {
        XMLCh value1[65];
        XMLString::binToText(fOffset, value1, 64, 10, fMemoryManager);
        ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_Parse1, value1, fString, fMemoryManager);
    }

    // Every recorded reference must name a group that was actually opened.
    if (fReferences != 0) {

        unsigned int refSize = fReferences->size();
        for (unsigned int i = 0; i < refSize; i++) {

            if (fNoGroups <= fReferences->elementAt(i)->fReferenceNo) {
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Parse2, fMemoryManager);
            }
        }

        fReferences->removeAllElements();
    }

    return retTok;
}
// Advances the tokenizer by one token.
//
// Reads from fString starting at fOffset and records the result in fState
// (the token kind) and fCharData (the character the token carries, or -1
// at end of input). fOffset is left just past the characters consumed.
// Tokenization depends on fParseContext: inside a bracket expression
// (S_INBRACKETS) only '\', '-' and '[' receive special treatment.
//
// Throws ParseException for a trailing backslash and for malformed "(?"
// constructs. A high surrogate followed by a character that is not a low
// surrogate throws the bare XMLErrs::Expected2ndSurrogateChar code value,
// not a ParseException.
void RegxParser::processNext() {
// End of input: report EOF with a sentinel character value.
if (fOffset >= fStringLen) {
fCharData = -1;
fState = REGX_T_EOF;
return;
}
unsigned short nextState;
XMLCh ch = fString[fOffset++];
fCharData = ch;
// --- Inside '[...]': reduced token set ---
if (fParseContext == S_INBRACKETS) {
switch (ch) {
case chBackSlash:
// An escape carries the character after the backslash in fCharData.
nextState = REGX_T_BACKSOLIDUS;
if (fOffset >= fStringLen) {
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager);
}
fCharData = fString[fOffset++];
break;
case chDash:
// In XML Schema mode "-[" is one token (character class subtraction);
// the '[' is consumed here. Otherwise '-' is an ordinary character.
if (isSet(RegularExpression::XMLSCHEMA_MODE)
&& fOffset < fStringLen && fString[fOffset] == chOpenSquare) {
fOffset++;
nextState = REGX_T_XMLSCHEMA_CC_SUBTRACTION;
}
else {
nextState = REGX_T_CHAR;
}
break;
case chOpenSquare:
// Outside XML Schema mode "[:" starts a POSIX class name; the ':' is
// consumed here. Any other '[' is handled as an ordinary character.
if (!isSet(RegularExpression::XMLSCHEMA_MODE)
&& fOffset < fStringLen && fString[fOffset] == chColon) {
fOffset++;
nextState = REGX_T_POSIX_CHARCLASS_START;
break;
} // Intentional fall through to the default case
default:
// Combine a surrogate pair into one code point. A high surrogate that is
// the very last character is passed through unchanged.
if (RegxUtil::isHighSurrogate(ch) && fOffset < fStringLen) {
XMLCh lowCh = fString[fOffset];
if (RegxUtil::isLowSurrogate(lowCh)) {
fCharData = RegxUtil::composeFromSurrogate(ch, lowCh);
fOffset++;
}
else {
throw XMLErrs::Expected2ndSurrogateChar;
}
}
nextState = REGX_T_CHAR;
}
fState = nextState;
return;
}
// --- Normal context: full set of metacharacters ---
switch (ch) {
case chPipe:
nextState = REGX_T_OR;
break;
case chAsterisk:
nextState = REGX_T_STAR;
break;
case chPlus:
nextState = REGX_T_PLUS;
break;
case chQuestion:
nextState = REGX_T_QUESTION;
break;
case chCloseParen:
nextState = REGX_T_RPAREN;
break;
case chPeriod:
nextState = REGX_T_DOT;
break;
case chOpenSquare:
nextState = REGX_T_LBRACKET;
break;
case chCaret:
nextState = REGX_T_CARET;
break;
case chDollarSign:
nextState = REGX_T_DOLLAR;
break;
case chOpenParen:
{
// A plain '(' unless it is immediately followed by '?'.
nextState = REGX_T_LPAREN;
if (fOffset >= fStringLen)
break;
if (fString[fOffset] != chQuestion)
break;
// Skip the '?'; at least one more character must follow it.
if (++fOffset >= fStringLen)
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next2, fMemoryManager);
ch = fString[fOffset++];
// The character after "(?" selects the kind of group.
switch (ch) {
case chColon:
nextState = REGX_T_LPAREN2;
break;
case chEqual:
nextState = REGX_T_LOOKAHEAD;
break;
case chBang:
nextState = REGX_T_NEGATIVELOOKAHEAD;
break;
case chOpenSquare:
nextState = REGX_T_SET_OPERATIONS;
break;
case chCloseAngle:
nextState = REGX_T_INDEPENDENT;
break;
case chOpenAngle:
// "(?<" must continue with '=' or '!'.
if (fOffset >= fStringLen)
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next2, fMemoryManager);
ch = fString[fOffset++];
if (ch == chEqual) {
nextState = REGX_T_LOOKBEHIND;
}
else if (ch == chBang) {
nextState = REGX_T_NEGATIVELOOKBEHIND;
}
else {
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next3, fMemoryManager);
}
break;
case chPound:
// "(?#": skip everything up to and including the closing ')'.
while (fOffset < fStringLen) {
ch = fString[fOffset++];
if (ch == chCloseParen)
break;
}
// Input ended before a ')' was seen.
if (ch != chCloseParen)
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next4, fMemoryManager);
nextState = REGX_T_COMMENT;
break;
default:
if (ch == chDash || chLatin_a <= ch && ch <= chLatin_z
|| chLatin_A <= ch && ch <= chLatin_Z) { // Options
// Step back so that processModifiers() starts reading at the first
// option character.
fOffset--;
nextState = REGX_T_MODIFIERS;
break;
}
else if (ch == chOpenParen) {
// "(?(" starts a conditional group.
nextState = REGX_T_CONDITION;
break;
}
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next2, fMemoryManager);
}
}
break;
case chBackSlash:
// An escape carries the character after the backslash in fCharData.
nextState = REGX_T_BACKSOLIDUS;
if (fOffset >= fStringLen) {
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager);
}
fCharData = fString[fOffset++];
break;
default:
nextState = REGX_T_CHAR;
// Combine a surrogate pair into one code point (same handling as in the
// bracket context above).
if (RegxUtil::isHighSurrogate(ch) && fOffset < fStringLen) {
XMLCh lowCh = fString[fOffset];
if (RegxUtil::isLowSurrogate(lowCh)) {
fCharData = RegxUtil::composeFromSurrogate(ch, lowCh);
fOffset++;
}
else {
throw XMLErrs::Expected2ndSurrogateChar;
}
}
}
fState = nextState;
}
// Parses an alternation: term ('|' term)*.
//
// A single term is returned as is. As soon as a '|' is seen, the terms
// are collected as children of a union token, which is returned instead.
// matchingRParen is forwarded unchanged to parseTerm().
Token* RegxParser::parseRegx(const bool matchingRParen) {

    Token* current = parseTerm(matchingRParen);
    bool haveUnion = false;

    while (fState == REGX_T_OR) {

        processNext();  // step over the '|'

        if (!haveUnion) {
            // First alternative separator: wrap what we have in a union.
            Token* const alternation = fTokenFactory->createUnion();
            alternation->addChild(current, fTokenFactory);
            current = alternation;
            haveUnion = true;
        }

        Token* const branch = parseTerm(matchingRParen);
        current->addChild(branch, fTokenFactory);
    }

    return current;
}
// Parses a sequence of factors.
//
// The sequence ends at '|', at end of input, or - only when
// matchingRParen is true - at ')'. An empty sequence yields a T_EMPTY
// token, one factor is returned directly, and several factors become
// children of a concatenation token (createUnion(true)).
Token* RegxParser::parseTerm(const bool matchingRParen) {

    // Nothing to parse before the terminator: the term is empty.
    if (fState == REGX_T_OR
        || fState == REGX_T_EOF
        || (matchingRParen && fState == REGX_T_RPAREN)) {
        return fTokenFactory->createToken(Token::T_EMPTY);
    }

    Token* result = parseFactor();
    Token* sequence = 0;

    for (;;) {

        const unsigned short state = fState;
        if (state == REGX_T_OR || state == REGX_T_EOF)
            break;
        if (state == REGX_T_RPAREN && matchingRParen)
            break;

        if (sequence == 0) {
            // Second factor encountered: start a concatenation.
            sequence = fTokenFactory->createUnion(true);
            sequence->addChild(result, fTokenFactory);
            result = sequence;
        }

        sequence->addChild(parseFactor(), fTokenFactory);
    }

    return result;
}
// Handles '^': steps past it and yields the factory's line-begin token.
Token* RegxParser::processCaret() {

    processNext();

    Token* const anchor = fTokenFactory->getLineBegin();
    return anchor;
}
// Handles '$': steps past it and yields the factory's line-end token.
Token* RegxParser::processDollar() {

    processNext();

    Token* const anchor = fTokenFactory->getLineEnd();
    return anchor;
}
// Parses the body of a look-around group whose opening token has just
// been read, and wraps it in a look token of kind tokType.
// Throws ParseException (Parser_Factor1) when the group is not closed by
// ')'.
Token* RegxParser::processLook(const unsigned short tokType) {

    processNext();

    Token* const body = parseRegx();
    Token* const lookTok = fTokenFactory->createLook(tokType, body);

    if (fState != REGX_T_RPAREN) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager);
    }

    processNext();  // step over ')'
    return lookTok;
}
// Handles "\A": steps past it and yields the string-begin token.
Token* RegxParser::processBacksolidus_A() {

    processNext();

    Token* const anchor = fTokenFactory->getStringBegin();
    return anchor;
}
// Handles "\Z": steps past it and yields the token returned by the
// factory's getStringEnd2().
Token* RegxParser::processBacksolidus_Z() {

    processNext();

    Token* const anchor = fTokenFactory->getStringEnd2();
    return anchor;
}
// Handles "\z": steps past it and yields the string-end token.
Token* RegxParser::processBacksolidus_z() {

    processNext();

    Token* const anchor = fTokenFactory->getStringEnd();
    return anchor;
}
// Steps past the current token and yields the word-edge token.
Token* RegxParser::processBacksolidus_b() {

    processNext();

    Token* const anchor = fTokenFactory->getWordEdge();
    return anchor;
}
// Steps past the current token and yields the not-word-edge token.
Token* RegxParser::processBacksolidus_B() {

    processNext();

    Token* const anchor = fTokenFactory->getNotWordEdge();
    return anchor;
}
// Handles "\<": steps past it and yields the word-begin token.
Token* RegxParser::processBacksolidus_lt() {

    processNext();

    Token* const anchor = fTokenFactory->getWordBegin();
    return anchor;
}
// Handles "\>": steps past it and yields the word-end token.
Token* RegxParser::processBacksolidus_gt() {

    processNext();

    Token* const anchor = fTokenFactory->getWordEnd();
    return anchor;
}
// Handles the '*' quantifier applied to tok. A directly following '?'
// is consumed as well and selects the closure created with the extra
// 'true' argument.
Token* RegxParser::processStar(Token* const tok) {

    processNext();  // step over '*'

    const bool questionFollows = (fState == REGX_T_QUESTION);
    if (questionFollows) {
        processNext();  // step over '?'
    }

    return questionFollows ? fTokenFactory->createClosure(tok, true)
                           : fTokenFactory->createClosure(tok);
}
// Handles the '+' quantifier applied to tok by building the
// concatenation "tok followed by a closure of tok". A directly following
// '?' is consumed as well and selects the closure created with the extra
// 'true' argument.
Token* RegxParser::processPlus(Token* const tok) {

    processNext();  // step over '+'

    Token* repetition = 0;
    if (fState == REGX_T_QUESTION) {
        processNext();  // step over '?'
        repetition = fTokenFactory->createClosure(tok, true);
    }
    else {
        repetition = fTokenFactory->createClosure(tok);
    }

    return fTokenFactory->createConcat(tok, repetition);
}
// Handles the '?' quantifier applied to tok by building a union of tok
// and an empty token. For a plain '?' tok is the first child; when a
// second '?' follows, it is consumed and the empty token comes first.
Token* RegxParser::processQuestion(Token* const tok) {

    processNext();  // step over '?'

    Token* const optional = fTokenFactory->createUnion();

    if (fState == REGX_T_QUESTION) {
        processNext();  // step over the second '?'

        Token* const nothing = fTokenFactory->createToken(Token::T_EMPTY);
        optional->addChild(nothing, fTokenFactory);
        optional->addChild(tok, fTokenFactory);
        return optional;
    }

    optional->addChild(tok, fTokenFactory);
    optional->addChild(fTokenFactory->createToken(Token::T_EMPTY), fTokenFactory);
    return optional;
}
// Parses a numbered group "( ... )".
// The group number is taken from fNoGroups before the body is parsed, so
// nested groups receive higher numbers than the group enclosing them.
// Throws ParseException (Parser_Factor1) when ')' is missing.
Token* RegxParser::processParen() {

    processNext();  // step over '('

    const int groupNo = fNoGroups++;
    Token* const inner = parseRegx(true);
    Token* const group = fTokenFactory->createParenthesis(inner, groupNo);

    if (fState != REGX_T_RPAREN) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager);
    }

    processNext();  // step over ')'
    return group;
}
// Parses a "(?: ... )" group. The parenthesis token is created with
// group number 0 and fNoGroups is left untouched.
// Throws ParseException (Parser_Factor1) when ')' is missing.
Token* RegxParser::processParen2() {

    processNext();  // step over "(?:"

    Token* const inner = parseRegx();
    Token* const group = fTokenFactory->createParenthesis(inner, 0);

    if (fState != REGX_T_RPAREN) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager);
    }

    processNext();  // step over ')'
    return group;
}
// Parses a conditional group "(?(condition)yes|no)"; on entry fOffset is
// just past "(?(".
//
// The condition is either a single digit 1-9 naming a group (recorded in
// fReferences so parse() can validate it) or a factor that must be a
// look-around or an anchor. The body may hold at most two alternatives:
// the "yes" pattern and an optional "no" pattern.
//
// Throws ParseException: Parser_Factor4 when the input is too short,
// Parser_Factor1 when a ')' is missing, Parser_Factor5 for an unsupported
// condition and Parser_Factor6 when the body has more than two
// alternatives.
Token* RegxParser::processCondition() {

    if (fOffset + 1 >= fStringLen) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor4, fMemoryManager);
    }

    int refNo = -1;
    Token* conditionTok = 0;

    const XMLInt32 lead = fString[fOffset];
    const bool isGroupNumber = (lead >= chDigit_1 && lead <= chDigit_9);

    if (!isGroupNumber) {

        // For "(?(?..." step back one character before tokenizing, as the
        // original code does, so the tokenizer starts at the '('.
        if (lead == chQuestion) {
            fOffset--;
        }

        processNext();
        conditionTok = parseFactor();

        switch (conditionTok->getTokenType()) {
        case Token::T_LOOKAHEAD:
        case Token::T_NEGATIVELOOKAHEAD:
        case Token::T_LOOKBEHIND:
        case Token::T_NEGATIVELOOKBEHIND:
            break;
        case Token::T_ANCHOR:
            if (fState != REGX_T_RPAREN) {
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager);
            }
            break;
        default:
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor5, fMemoryManager);
        }
    }
    else {

        // "(?(N)": remember the reference, then require the closing ')'.
        refNo = lead - chDigit_0;
        fHasBackReferences = true;

        if (fReferences == 0) {
            this->fReferences = new (fMemoryManager) RefVectorOf<ReferencePosition>(8, true, fMemoryManager);
        }
        fReferences->addElement(new (fMemoryManager) ReferencePosition(refNo, fOffset));

        fOffset++;
        if (fString[fOffset] != chCloseParen) {
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager);
        }
        fOffset++;
    }

    processNext();

    Token* yesPattern = parseRegx();
    Token* noPattern = 0;

    // A union at the top of the body means "yes|no".
    if (yesPattern->getTokenType() == Token::T_UNION) {

        if (yesPattern->size() != 2) {
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor6, fMemoryManager);
        }

        noPattern = yesPattern->getChild(1);
        yesPattern = yesPattern->getChild(0);
    }

    if (fState != REGX_T_RPAREN) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager);
    }

    processNext();  // step over ')'
    return fTokenFactory->createCondition(refNo, conditionTok,
                                          yesPattern, noPattern);
}
// Parses an inline option group; on entry fOffset is at the first option
// character after "(?".
//
//   modifiers ::= [imsw]* ('-' [imsw]*)? ':'
//
// Option letters before '-' are OR-ed into the "add" set, letters after
// it into the "mask" set. The group is terminated by ':' (the body runs
// to the matching ')') or directly by ')' (the body is whatever
// parseRegx() reads next).
//
// Throws ParseException: Parser_Factor2 when the input ends inside the
// option list, Parser_Factor3 on an unexpected terminator and
// Parser_Factor1 when the ')' of the ':' form is missing.
Token* RegxParser::processModifiers() {

    int addBits = 0;
    int maskBits = 0;
    XMLInt32 cur = -1;

    // Options to switch on.
    for (; fOffset < fStringLen; fOffset++) {
        const int bit = RegularExpression::getOptionValue(fString[fOffset]);
        cur = fString[fOffset];
        if (bit == 0)
            break;
        addBits |= bit;
    }

    if (fOffset >= fStringLen) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor2, fMemoryManager);
    }

    // Options to switch off, introduced by '-'.
    if (cur == chDash) {

        fOffset++;
        for (; fOffset < fStringLen; fOffset++) {
            const int bit = RegularExpression::getOptionValue(fString[fOffset]);
            cur = fString[fOffset];
            if (bit == 0)
                break;
            maskBits |= bit;
        }

        if (fOffset >= fStringLen) {
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor2, fMemoryManager);
        }
    }

    Token* group = 0;

    if (cur == chColon) {

        fOffset++;
        processNext();
        group = fTokenFactory->createModifierGroup(parseRegx(), addBits, maskBits);

        if (fState != REGX_T_RPAREN) {
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager);
        }
        processNext();  // step over ')'
    }
    else if (cur == chCloseParen) {

        fOffset++;
        processNext();
        group = fTokenFactory->createModifierGroup(parseRegx(), addBits, maskBits);
    }
    else {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor3, fMemoryManager);
    }

    return group;
}
// Parses the body of a "(?> ... )" group and wraps it in a look token of
// kind Token::T_INDEPENDENT.
// Throws ParseException (Parser_Factor1) when ')' is missing.
Token* RegxParser::processIndependent() {

    processNext();  // step over "(?>"

    Token* const body = parseRegx();
    Token* const group = fTokenFactory->createLook(Token::T_INDEPENDENT, body);

    if (fState != REGX_T_RPAREN) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager);
    }

    processNext();  // step over ')'
    return group;
}
// Handles "\cX": reads the character after "\c", which must lie in
// 0x0040-0x005F, and yields a char token for that character minus 0x40.
// Throws ParseException (Parser_Atom1) when no character follows or it is
// outside that range.
Token* RegxParser::processBacksolidus_c() {

    if (fOffset >= fStringLen) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom1, fMemoryManager);
    }

    const XMLCh selector = fString[fOffset++];

    // Masking off the low five bits must leave exactly 0x0040.
    if ((selector & 0xFFE0) != 0x0040) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom1, fMemoryManager);
    }

    processNext();
    return fTokenFactory->createChar(selector - 0x40);
}
// "\C" is not handled by this parser: no input is consumed and no token
// is produced.
// REVISIT - Do we throw an exception - we do not want to throw too
// many exceptions
Token* RegxParser::processBacksolidus_C() {

    Token* const noToken = 0;
    return noToken;
}
// Handles "\i" in this parser: steps past it and yields a char token for
// the letter 'i'.
Token* RegxParser::processBacksolidus_i() {

    processNext();

    Token* const letter = fTokenFactory->createChar(chLatin_i);
    return letter;
}
// "\I" is not handled by this parser (same situation as "\C"): no input
// is consumed and no token is produced.
Token* RegxParser::processBacksolidus_I() {

    Token* const noToken = 0;
    return noToken;
}
// Handles "\g": steps past it and yields the factory's grapheme pattern.
Token* RegxParser::processBacksolidus_g() {

    processNext();

    Token* const pattern = fTokenFactory->getGraphemePattern();
    return pattern;
}
// Handles "\X": steps past it and yields the factory's combining
// character sequence pattern.
Token* RegxParser::processBacksolidus_X() {

    processNext();

    Token* const pattern = fTokenFactory->getCombiningCharacterSequence();
    return pattern;
}
// Handles a "\N" back reference, N being the digit held in fCharData.
// Creates the back-reference token and records the reference (with
// offset fOffset - 2) in fReferences so parse() can later verify that
// group N exists.
Token* RegxParser::processBackReference() {

    const int groupNo = fCharData - chDigit_0;
    Token* const backRef = fTokenFactory->createBackReference(groupNo);

    fHasBackReferences = true;

    // The reference list is created lazily on first use.
    if (fReferences == 0) {
        fReferences = new (fMemoryManager) RefVectorOf<ReferencePosition>(8, true, fMemoryManager);
    }
    fReferences->addElement(new (fMemoryManager) ReferencePosition(groupNo, fOffset - 2));

    processNext();
    return backRef;
}
// Parses one factor: either a zero-width construct (anchor, look-around,
// "(?#...)" comment) or an atom optionally followed by a quantifier
// ('*', '+', '?' or "{min}", "{min,}", "{min,max}").
//
// Returns the token for the factor.
// Throws ParseException for an unterminated look-around, for malformed
// quantifiers (Parser_Quantifier1..5) and for whatever parseAtom()
// rejects.
Token* RegxParser::parseFactor() {

    // --- Zero-width constructs: these never take a quantifier ---
    switch (fState) {

    case REGX_T_CARET:
        return processCaret();
    case REGX_T_DOLLAR:
        return processDollar();
    case REGX_T_LOOKAHEAD:
        return processLook(Token::T_LOOKAHEAD);
    case REGX_T_NEGATIVELOOKAHEAD:
        return processLook(Token::T_NEGATIVELOOKAHEAD);
    case REGX_T_LOOKBEHIND:
        return processLook(Token::T_LOOKBEHIND);
    case REGX_T_NEGATIVELOOKBEHIND:
        return processLook(Token::T_NEGATIVELOOKBEHIND);
    case REGX_T_COMMENT:
        processNext();
        return fTokenFactory->createToken(Token::T_EMPTY);
    case REGX_T_BACKSOLIDUS:
        switch(fCharData) {
        case chLatin_A:
            return processBacksolidus_A();
        case chLatin_Z:
            return processBacksolidus_Z();
        case chLatin_z:
            return processBacksolidus_z();
        case chLatin_b:
            // "\b" is the word edge. This used to call
            // processBacksolidus_B(), which made "\b" behave like "\B".
            return processBacksolidus_b();
        case chLatin_B:
            return processBacksolidus_B();
        case chOpenAngle:
            return processBacksolidus_lt();
        case chCloseAngle:
            return processBacksolidus_gt();
        }
        // Any other escape is an atom and is handled below.
        break;
    }

    // --- An atom, optionally followed by a quantifier ---
    Token* tok = parseAtom();

    switch(fState) {

    case REGX_T_STAR:
        return processStar(tok);
    case REGX_T_PLUS:
        return processPlus(tok);
    case REGX_T_QUESTION:
        return processQuestion(tok);
    case REGX_T_CHAR:
        // "{...}" quantifier. The digits are read directly from fString
        // because the tokenizer has no token for numbers.
        if (fCharData == chOpenCurly && fOffset < fStringLen) {

            int min = 0;
            int max = -1;
            XMLInt32 ch = fString[fOffset++];

            // Lower bound: at least one digit is required.
            if (ch >= chDigit_0 && ch <= chDigit_9) {

                min = ch - chDigit_0;
                while (fOffset < fStringLen
                       && (ch = fString[fOffset++]) >= chDigit_0
                       && ch <= chDigit_9) {

                    min = min*10 + ch - chDigit_0;
                }

                // NOTE(review): this test only fires if an overlong
                // number wrapped to a negative value - confirm whether a
                // stricter bound is wanted.
                if (min < 0)
                    ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier5, fString, fMemoryManager);
            }
            else {
                ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier1, fString, fMemoryManager);
            }

            // "{n}" means exactly n.
            max = min;

            if (ch == chComma) {

                if (fOffset >= fStringLen) {
                    ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier3, fString, fMemoryManager);
                }
                else if ((ch = fString[fOffset++]) >= chDigit_0 && ch <= chDigit_9) {

                    // "{n,m}": explicit upper bound.
                    max = ch - chDigit_0;
                    while (fOffset < fStringLen
                           && (ch = fString[fOffset++]) >= chDigit_0
                           && ch <= chDigit_9) {

                        max = max*10 + ch - chDigit_0;
                    }

                    if (max < 0)
                        ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier5, fString, fMemoryManager);
                    else if (min > max)
                        ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier4, fString, fMemoryManager);
                }
                else {
                    // "{n,}": no upper bound.
                    max = -1;
                }
            }

            if (ch != chCloseCurly) {
                ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier2, fString, fMemoryManager);
            }

            // A '?' directly after '}' is consumed and selects the
            // closure created with the extra 'true' argument.
            if (checkQuestion(fOffset)) {
                tok = fTokenFactory->createClosure(tok, true);
                fOffset++;
            }
            else {
                tok = fTokenFactory->createClosure(tok);
            }

            tok->setMin(min);
            tok->setMax(max);
            processNext();
        }
        break;
    }

    return tok;
}
// Parses one atom: a group, '.', a character class, an escape sequence
// or a literal character.
//
// Returns the atom's token. The handlers for "\C" and "\I" return 0, so
// 0 is a possible result.
// Throws ParseException: Parser_Atom4 for a token that cannot start an
// atom (including a bare '{', '}' or ']'), Parser_Atom5 when "\p"/"\P"
// names an unknown range, plus whatever the delegated handlers throw.
Token* RegxParser::parseAtom() {

    // Constructs handled entirely by a dedicated routine.
    switch (fState) {
    case REGX_T_LPAREN:
        return processParen();
    case REGX_T_LPAREN2:
        return processParen2();
    case REGX_T_CONDITION:
        return processCondition();
    case REGX_T_MODIFIERS:
        return processModifiers();
    case REGX_T_INDEPENDENT:
        return processIndependent();
    case REGX_T_LBRACKET:
        return parseCharacterClass(true);
    case REGX_T_SET_OPERATIONS:
        return parseSetOperations();
    default:
        break;
    }

    Token* atom = 0;

    if (fState == REGX_T_DOT) {

        processNext();
        atom = fTokenFactory->getDot();
    }
    else if (fState == REGX_T_BACKSOLIDUS) {

        switch (fCharData) {

        // Shorthand classes \d \D \w \W \s \S.
        case chLatin_d:
        case chLatin_D:
        case chLatin_w:
        case chLatin_W:
        case chLatin_s:
        case chLatin_S:
            atom = getTokenForShorthand(fCharData);
            processNext();
            return atom;

        case chLatin_c:
            return processBacksolidus_c();
        case chLatin_C:
            return processBacksolidus_C();
        case chLatin_i:
            return processBacksolidus_i();
        case chLatin_I:
            return processBacksolidus_I();
        case chLatin_g:
            return processBacksolidus_g();
        case chLatin_X:
            return processBacksolidus_X();

        // Any digit after the backslash is treated as a back reference.
        case chDigit_0:
        case chDigit_1:
        case chDigit_2:
        case chDigit_3:
        case chDigit_4:
        case chDigit_5:
        case chDigit_6:
        case chDigit_7:
        case chDigit_8:
        case chDigit_9:
            return processBackReference();

        // Named range \p{...} / \P{...}.
        case chLatin_p:
        case chLatin_P:
            atom = processBacksolidus_pP(fCharData);
            if (atom == 0) {
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, fMemoryManager);
            }
            break;

        // Every other escape decodes to a single code point.
        default:
            {
                const XMLInt32 codePoint = decodeEscaped();
                if (codePoint < 0x10000) {
                    atom = fTokenFactory->createChar(codePoint);
                }
                else {
                    // Above the 16-bit range: emit the surrogate pair as
                    // a string token.
                    XMLCh* surrogateStr = RegxUtil::decomposeToSurrogates(codePoint, fMemoryManager);
                    ArrayJanitor<XMLCh> janSurrogate(surrogateStr, fMemoryManager);
                    atom = fTokenFactory->createString(surrogateStr);
                }
            }
            break;
        } // end switch

        processNext();
    }
    else if (fState == REGX_T_CHAR) {

        const bool reserved = (fCharData == chOpenCurly
                               || fCharData == chCloseCurly
                               || fCharData == chCloseSquare);
        if (reserved) {
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager);
        }

        atom = fTokenFactory->createChar(fCharData);
        processNext();
    }
    else {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager);
    }

    return atom;
}
// Handles "\p{name}" / "\P{name}".
//
// ch  the escape letter; anything other than lowercase 'p' requests the
//     complemented range.
//
// Extracts the text between '{' and '}', leaves fOffset just past the
// '}', and returns whatever the token factory's getRange() yields for
// that name.
// Throws ParseException: Parser_Atom2 when '{' does not follow,
// Parser_Atom3 when no '}' is found.
RangeToken* RegxParser::processBacksolidus_pP(const XMLInt32 ch) {

    processNext();

    const bool atOpenCurly = (fState == REGX_T_CHAR && fCharData == chOpenCurly);
    if (!atOpenCurly) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom2, fMemoryManager);
    }

    int nameStart = fOffset;
    int nameEnd = XMLString::indexOf(fString, chCloseCurly, nameStart, fMemoryManager);
    if (nameEnd < 0) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom3, fMemoryManager);
    }

    fOffset = nameEnd + 1;

    // Copy the name into a temporary buffer released by the janitor.
    XMLCh* rangeName = (XMLCh*) fMemoryManager->allocate
    (
        (nameEnd - nameStart + 1) * sizeof(XMLCh)
    );//new XMLCh[(nameEnd - nameStart) + 1];
    ArrayJanitor<XMLCh> janRangeName(rangeName, fMemoryManager);
    XMLString::subString(rangeName, fString, nameStart, nameEnd, fMemoryManager);

    const bool complement = !(ch == chLatin_p);
    return fTokenFactory->getRange(rangeName, complement);
}
// Handles \i, \I, \c and \C inside a character class. Both parameters
// are ignored here; the escape is simply decoded like any other.
XMLInt32 RegxParser::processCInCharacterClass(RangeToken* const,
                                              const XMLInt32) {

    const XMLInt32 decoded = decodeEscaped();
    return decoded;
}
// Parses a bracket expression; on entry the current token is the opening
// '[' (or "(?[" when called from parseSetOperations()).
//
// useNRange  how a leading '^' is represented: when true the result is
//            created with createRange(true); when false the parsed
//            ranges are subtracted from a range covering
//            0..Token::UTF16_MAX.
//
// Returns the sorted and compacted range token. The tokenizer is put
// back into S_NORMAL context and advanced past the closing ']'.
// Throws ParseException: Parser_CC2 when the input ends inside the
// class, Parser_CC1 / Parser_CC3 for a malformed or unknown "[:name:]",
// Parser_Atom5 for an unknown "\p{name}".
RangeToken* RegxParser::parseCharacterClass(const bool useNRange) {
// Switch the tokenizer to bracket rules before reading the first token.
setParseContext(S_INBRACKETS);
processNext();
RangeToken* base = 0;
RangeToken* tok = 0;
bool nRange = false;
// A leading '^' negates the class.
if (fState == REGX_T_CHAR && fCharData == chCaret) {
nRange = true;
processNext();
if (useNRange) {
tok = fTokenFactory->createRange(true);
}
else {
// Negation by subtraction: start from the full range and remove the
// parsed ranges once the class is complete (see below).
base = fTokenFactory->createRange();
base->addRange(0, Token::UTF16_MAX);
tok = fTokenFactory->createRange();
}
}
else {
tok = fTokenFactory->createRange();
}
// firstLoop lets a ']' in first position count as a literal character.
bool firstLoop = true;
while (fState != REGX_T_EOF) {
if (fState == REGX_T_CHAR && fCharData == chCloseSquare && !firstLoop)
break;
// 'end' is set when the current item was merged as a complete range and
// therefore cannot be the start of an "a-b" range.
bool end = false;
XMLInt32 ch = fCharData;
firstLoop = false;
if (fState == REGX_T_BACKSOLIDUS) {
switch(ch) {
case chLatin_d:
case chLatin_D:
case chLatin_w:
case chLatin_W:
case chLatin_s:
case chLatin_S:
// Shorthand class: merge its ranges.
tok->mergeRanges(getTokenForShorthand(ch));
end = true;
break;
case chLatin_i:
case chLatin_I:
case chLatin_c:
case chLatin_C:
ch = processCInCharacterClass(tok, ch);
if (ch < 0){
end = true;
}
break;
case chLatin_p:
case chLatin_P:
{
// Named range \p{...} / \P{...}.
RangeToken* tok2 = processBacksolidus_pP(ch);
if (tok2 == 0) {
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, fMemoryManager);
}
tok->mergeRanges(tok2);
end = true;
}
break;
default:
// Any other escape decodes to a single code point.
ch = decodeEscaped();
}
} // end if REGX_T_BACKSOLIDUS
else if (fState == REGX_T_POSIX_CHARCLASS_START) {
// "[:name:]" or "[:^name:]"; fOffset is just past "[:".
int nameEnd = XMLString::indexOf(fString, chColon, fOffset, fMemoryManager);
if (nameEnd < 0) {
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC1, fMemoryManager);
}
bool positive = true;
if (fString[fOffset] == chCaret) {
fOffset++;
positive = false;
}
// Copy the class name into a temporary buffer released by the janitor.
XMLCh* name = (XMLCh*) fMemoryManager->allocate
(
(nameEnd - fOffset + 1) * sizeof(XMLCh)
);//new XMLCh[(nameEnd - fOffset) + 1];
ArrayJanitor<XMLCh> janName(name, fMemoryManager);
XMLString::subString(name, fString, fOffset, nameEnd, fMemoryManager);
RangeToken* rangeTok = fTokenFactory->getRange(name, !positive);
if (rangeTok == 0) {
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC3, fMemoryManager);
}
tok->mergeRanges(rangeTok);
end = true;
// The name must be closed by ":]".
if (nameEnd+1 >= fStringLen || fString[nameEnd+1] != chCloseSquare) {
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC1, fMemoryManager);
}
fOffset = nameEnd + 2;
}
processNext();
// A single character: either a range "ch-x" or the lone character ch.
if (!end) {
if (fState != REGX_T_CHAR || fCharData != chDash) {
tok->addRange(ch, ch);
}
else {
processNext();
if (fState == REGX_T_EOF)
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, fMemoryManager);
if (fState == REGX_T_CHAR && fCharData == chCloseSquare) {
// "ch-]": the '-' is a literal, not a range operator.
tok->addRange(ch, ch);
tok->addRange(chDash, chDash);
}
else {
XMLInt32 rangeEnd = fCharData;
if (fState == REGX_T_BACKSOLIDUS) {
rangeEnd = decodeEscaped();
}
processNext();
tok->addRange(ch, rangeEnd);
}
}
}
// With SPECIAL_COMMA set, a ',' between items is skipped.
if (isSet(RegularExpression::SPECIAL_COMMA)
&& fState == REGX_T_CHAR && fCharData == chComma) {
processNext();
}
} // end while fState
// The loop ended on end of input rather than on ']'.
if (fState == REGX_T_EOF) {
ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, fMemoryManager);
}
// Negation by subtraction (useNRange == false with a leading '^').
if (!useNRange && nRange) {
base->subtractRanges(tok);
tok = base;
}
tok->sortRanges();
tok->compactRanges();
// If the case-insensitive option is enabled, we need to
// have the new RangeToken instance build its internal
// case-insensitive RangeToken.
if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE))
{
tok->getCaseInsensitiveToken(fTokenFactory);
}
// Back to normal tokenization, then step over the closing ']'.
setParseContext(S_NORMAL);
processNext();
return tok;
}
// Parses a set-operation group "(?[...] op [...] op [...] ...)".
//
// The first character class is the running result. Each further class is
// combined with it according to the operator that precedes it:
//   '+'  union         (mergeRanges)
//   '-'  subtraction   (subtractRanges)
//   '&'  intersection  (intersectRanges)
//
// Returns the combined range token; the closing ')' is consumed.
// Throws ParseException: Parser_Ope2 when something other than an
// operator follows a class, Parser_Ope1 when an operator is not followed
// by '['.
RangeToken* RegxParser::parseSetOperations() {

    RangeToken* tok = parseCharacterClass(false);

    while (fState != REGX_T_RPAREN) {

        // Remember the operator now. parseCharacterClass() below advances
        // the tokenizer, so fState / fCharData no longer describe the
        // operator afterwards; testing them at that point (as this code
        // used to do) picked the wrong operation or hit a bare "throw 0".
        const unsigned short opState = fState;
        const XMLInt32 opChar = fCharData;

        const bool isUnion = (opState == REGX_T_PLUS);
        const bool isCharOperator = (opState == REGX_T_CHAR
                                     && (opChar == chDash || opChar == chAmpersand));

        if (!isUnion && !isCharOperator) {
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Ope2, fMemoryManager);
        }

        processNext();  // step over the operator
        if (fState != REGX_T_LBRACKET)
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Ope1, fMemoryManager);

        RangeToken* tok2 = parseCharacterClass(false);

        if (isUnion) {
            tok->mergeRanges(tok2);
        }
        else if (opChar == chDash) {
            tok->subtractRanges(tok2);
        }
        else {
            // Only '&' is left, see isCharOperator above.
            tok->intersectRanges(tok2);
        }
    }

    processNext();  // step over ')'
    return tok;
}
// Maps a shorthand escape letter (d, D, w, W, s, S) to its range token.
//
// The lowercase letter requests the named range, the uppercase letter
// the same range with the extra 'true' argument passed to getRange().
// The Unicode range names are used when USE_UNICODE_CATEGORY is set, the
// ASCII ones otherwise. Any other letter yields 0.
Token* RegxParser::getTokenForShorthand(const XMLInt32 ch) {

    const bool useUnicode = isSet(RegularExpression::USE_UNICODE_CATEGORY);

    const XMLCh* rangeName = 0;
    bool upperCase = false;

    switch (ch) {
    case chLatin_D:
        upperCase = true;
        // fall through: same range name as 'd'
    case chLatin_d:
        rangeName = useUnicode ? fgUniDecimalDigit : fgASCIIDigit;
        break;
    case chLatin_W:
        upperCase = true;
        // fall through: same range name as 'w'
    case chLatin_w:
        rangeName = useUnicode ? fgUniIsWord : fgASCIIWord;
        break;
    case chLatin_S:
        upperCase = true;
        // fall through: same range name as 's'
    case chLatin_s:
        rangeName = useUnicode ? fgUniIsSpace : fgASCIISpace;
        break;
    // default:
    //    ThrowXMLwithMemMgr(RuntimeException, "Invalid shorthand {0}", chAsString)
    }

    if (rangeName == 0) {
        return 0;
    }

    if (upperCase) {
        return fTokenFactory->getRange(rangeName, true);
    }
    return fTokenFactory->getRange(rangeName);
}
// Decodes the escape sequence held by the current token into a code
// point.
//
// Supported forms:
//   \e \f \n \r \t   control characters
//   \xHH             exactly two hex digits
//   \x{H...}         hex digits up to Token::UTF16_MAX
//   \uHHHH           exactly four hex digits
//   \vHHHHHH         exactly six hex digits, at most Token::UTF16_MAX
// Any other escaped character decodes to itself.
//
// For the hex forms the tokenizer is left on the last character of the
// escape; the caller advances past it.
//
// Throws ParseException: Parser_Next1 when the current token is not an
// escape, Parser_Descape1 for a malformed hex sequence, Parser_Descape3
// when "\x{" is not closed by '}', Parser_Descape4 when a "\x{...}" value
// exceeds Token::UTF16_MAX, Parser_Descape5 for \A, \Z and \z.
XMLInt32 RegxParser::decodeEscaped() {

    if (fState != REGX_T_BACKSOLIDUS)
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager);

    XMLInt32 ch = fCharData;

    switch (ch) {
    case chLatin_e:
        ch = 0x1B; // Escape
        break;
    case chLatin_f:
        ch = chFF;
        break;
    case chLatin_n:
        ch = chLF;
        break;
    case chLatin_r:
        ch = chCR;
        break;
    case chLatin_t:
        ch = chHTab;
        break;
    case chLatin_x:
        {
            processNext();
            if (fState != REGX_T_CHAR) {
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);
            }
            if (fCharData == chOpenCurly) {

                // \x{H...}: any number of hex digits up to the '}'.
                int v1 = 0;
                XMLInt32 uv = 0;

                do {
                    processNext();
                    if (fState != REGX_T_CHAR)
                        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);

                    if ((v1 = hexChar(fCharData)) < 0)
                        break;

                    uv = uv*16 + v1;

                    // Reject as soon as the value leaves the valid range.
                    // Checking only after the loop allowed a long digit
                    // string to overflow uv (undefined behaviour) and
                    // wrap back into a seemingly valid code point.
                    if (uv > Token::UTF16_MAX)
                        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape4, fMemoryManager);
                } while (true);

                if (fCharData != chCloseCurly)
                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape3, fMemoryManager);

                ch = uv;
            }
            else {

                // \xHH: exactly two hex digits; the first one is the
                // current token.
                int v1 = 0;
                if (fState != REGX_T_CHAR || (v1 = hexChar(fCharData)) < 0)
                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);

                int uv = v1;

                processNext();
                if (fState != REGX_T_CHAR || (v1 = hexChar(fCharData)) < 0)
                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);

                ch = uv*16 + v1;
            }
        }
        break;
    case chLatin_u:
        {
            // \uHHHH: exactly four hex digits.
            int v1 = 0;
            int uv = 0;

            for (int i=0; i< 4; i++) {

                processNext();
                if (fState != REGX_T_CHAR || (v1 = hexChar(fCharData)) < 0)
                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);

                uv = (i == 0) ? v1 : uv*16 + v1;
            }

            ch = uv;
        }
        break;
    case chLatin_v:
        {
            // \vHHHHHH: exactly six hex digits.
            int v1 = 0;
            int uv = 0;

            for (int i=0; i< 6; i++) {

                processNext();
                if (fState != REGX_T_CHAR || (v1 = hexChar(fCharData)) < 0)
                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);

                uv = (i == 0) ? v1 : uv*16 + v1;
            }

            if (uv > Token::UTF16_MAX)
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);

            ch = uv;
        }
        break;
    case chLatin_A:
    case chLatin_Z:
    case chLatin_z:
        // These are anchors, not characters.
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape5, fMemoryManager);
    } // end switch

    return ch;
}
// ---------------------------------------------------------------------------
// RegxParser: Helper Methods
// ---------------------------------------------------------------------------
// Tells whether the pattern character at offset 'off' is '?'. An offset
// at or beyond the end of the pattern yields false.
bool RegxParser::checkQuestion(const int off) {

    if (!(off < fStringLen)) {
        return false;
    }

    return fString[off] == chQuestion;
}
XERCES_CPP_NAMESPACE_END
/**
* End file RegxParser.cpp
*/