| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* |
| * $Id: RegularExpression.hpp 568078 2007-08-21 11:43:25Z amassari $ |
| */ |
| |
| #if !defined(REGULAREXPRESSION_HPP) |
| #define REGULAREXPRESSION_HPP |
| |
| // --------------------------------------------------------------------------- |
| // Includes |
| // --------------------------------------------------------------------------- |
| #include <xercesc/util/RefArrayVectorOf.hpp> |
| #include <xercesc/util/XMLString.hpp> |
| #include <xercesc/util/Janitor.hpp> |
| #include <xercesc/util/regx/Op.hpp> |
| #include <xercesc/util/regx/TokenFactory.hpp> |
| #include <xercesc/util/regx/BMPattern.hpp> |
| #include <xercesc/util/regx/ModifierToken.hpp> |
| #include <xercesc/util/regx/ConditionToken.hpp> |
| #include <xercesc/util/regx/OpFactory.hpp> |
| #include <xercesc/util/regx/RegxUtil.hpp> |
| |
| XERCES_CPP_NAMESPACE_BEGIN |
| |
| // --------------------------------------------------------------------------- |
| // Forward Declaration |
| // --------------------------------------------------------------------------- |
| class RangeToken; |
| class Match; |
| |
| class XMLUTIL_EXPORT RegularExpression : public XMemory |
| { |
| public: |
| // ----------------------------------------------------------------------- |
| // Public Constructors and Destructor |
| // ----------------------------------------------------------------------- |
| RegularExpression |
| ( |
| const char* const pattern |
| , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager |
| ); |
| RegularExpression |
| ( |
| const char* const pattern |
| , const char* const options |
| , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager |
| ); |
| RegularExpression |
| ( |
| const XMLCh* const pattern |
| , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager |
| ); |
| RegularExpression |
| ( |
| const XMLCh* const pattern |
| , const XMLCh* const options |
| , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager |
| ); |
| ~RegularExpression(); |
| |
| // ----------------------------------------------------------------------- |
| // Public Constants |
| // ----------------------------------------------------------------------- |
| static const unsigned int MARK_PARENS; |
| static const unsigned int IGNORE_CASE; |
| static const unsigned int SINGLE_LINE; |
| static const unsigned int MULTIPLE_LINE; |
| static const unsigned int EXTENDED_COMMENT; |
| static const unsigned int USE_UNICODE_CATEGORY; |
| static const unsigned int UNICODE_WORD_BOUNDARY; |
| static const unsigned int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION; |
| static const unsigned int PROHIBIT_FIXED_STRING_OPTIMIZATION; |
| static const unsigned int XMLSCHEMA_MODE; |
| static const unsigned int SPECIAL_COMMA; |
| static const unsigned short WT_IGNORE; |
| static const unsigned short WT_LETTER; |
| static const unsigned short WT_OTHER; |
| |
| // ----------------------------------------------------------------------- |
| // Public Helper methods |
| // ----------------------------------------------------------------------- |
| static int getOptionValue(const XMLCh ch); |
| |
| // ----------------------------------------------------------------------- |
| // Matching methods |
| // ----------------------------------------------------------------------- |
| bool matches(const char* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
| bool matches(const char* const matchString, const int start, |
| const int end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
| bool matches(const char* const matchString, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
| bool matches(const char* const matchString, const int start, |
| const int end, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
| |
| bool matches(const XMLCh* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
| bool matches(const XMLCh* const matchString, const int start, |
| const int end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
| bool matches(const XMLCh* const matchString, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
| bool matches(const XMLCh* const matchString, const int start, |
| const int end, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
| |
| // ----------------------------------------------------------------------- |
| // Tokenize methods |
| // ----------------------------------------------------------------------- |
| // Note: The caller owns the string vector that is returned, and is responsible |
| // for deleting it. |
| RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString); |
| RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, const int start, |
| const int end); |
| |
| RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString); |
| RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, |
| const int start, const int end); |
| |
| // ----------------------------------------------------------------------- |
| // Replace methods |
| // ----------------------------------------------------------------------- |
| // Note: The caller owns the XMLCh* that is returned, and is responsible for |
| // deleting it. |
| XMLCh *replace(const char* const matchString, const char* const replaceString); |
| XMLCh *replace(const char* const matchString, const char* const replaceString, |
| const int start, const int end); |
| |
| XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString); |
| XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString, |
| const int start, const int end); |
| |
| // ----------------------------------------------------------------------- |
| // Static initialize and cleanup methods |
| // ----------------------------------------------------------------------- |
| static void |
| staticInitialize(MemoryManager* memoryManager); |
| |
| static void |
| staticCleanup(); |
| |
| static bool isSet(const int options, const int flag); |
| |
| private: |
| // ----------------------------------------------------------------------- |
| // Private data types |
| // ----------------------------------------------------------------------- |
| class XMLUTIL_EXPORT Context : public XMemory |
| { |
| public : |
| Context(MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
| Context(Context* src); |
| ~Context(); |
| |
| Context& operator= (const Context& other); |
| inline const XMLCh* getString() const { return fString; } |
| void reset(const XMLCh* const string, const int stringLen, |
| const int start, const int limit, const int noClosures); |
| bool nextCh(XMLInt32& ch, int& offset, const short direction); |
| |
| bool fAdoptMatch; |
| int fStart; |
| int fLimit; |
| int fLength; // fLimit - fStart |
| int fSize; |
| int fStringMaxLen; |
| int* fOffsets; |
| Match* fMatch; |
| const XMLCh* fString; |
| MemoryManager* fMemoryManager; |
| }; |
| |
| // ----------------------------------------------------------------------- |
| // Unimplemented constructors and operators |
| // ----------------------------------------------------------------------- |
| RegularExpression(const RegularExpression&); |
| RegularExpression& operator=(const RegularExpression&); |
| |
| // ----------------------------------------------------------------------- |
| // Cleanup methods |
| // ----------------------------------------------------------------------- |
| void cleanUp(); |
| |
| // ----------------------------------------------------------------------- |
| // Setter methods |
| // ----------------------------------------------------------------------- |
| void setPattern(const XMLCh* const pattern, const XMLCh* const options=0); |
| |
| // ----------------------------------------------------------------------- |
| // Private Helper methods |
| // ----------------------------------------------------------------------- |
| void prepare(); |
| int parseOptions(const XMLCh* const options); |
| unsigned short getWordType(const XMLCh* const target, const int begin, |
| const int end, const int offset); |
| unsigned short getCharType(const XMLCh ch); |
| unsigned short getPreviousWordType(const XMLCh* const target, |
| const int start, const int end, |
| int offset); |
| |
| /** |
| * Matching helpers |
| */ |
| int match(Context* const context, const Op* const operations, int offset, |
| const short direction); |
| bool matchIgnoreCase(const XMLInt32 ch1, const XMLInt32 ch2); |
| |
| /** |
| * Helper methods used by match(Context* ...) |
| */ |
| bool matchChar(Context* const context, const XMLInt32 ch, int& offset, |
| const short direction, const bool ignoreCase); |
| bool matchDot(Context* const context, int& offset, const short direction); |
| bool matchRange(Context* const context, const Op* const op, |
| int& offset, const short direction, const bool ignoreCase); |
| bool matchAnchor(Context* const context, const XMLInt32 ch, |
| const int offset); |
| bool matchBackReference(Context* const context, const XMLInt32 ch, |
| int& offset, const short direction, |
| const bool ignoreCase); |
| bool matchString(Context* const context, const XMLCh* const literal, |
| int& offset, const short direction, const bool ignoreCase); |
| int matchUnion(Context* const context, const Op* const op, int offset, |
| const short direction); |
| int matchCapture(Context* const context, const Op* const op, int offset, |
| const short direction); |
| bool matchCondition(Context* const context, const Op* const op, int offset, |
| const short direction); |
| int matchModifier(Context* const context, const Op* const op, int offset, |
| const short direction); |
| |
| /** |
| * Tokenize helper |
| * |
| * This overloaded tokenize is for internal use only. It provides a way to |
| * keep track of the sub-expressions in each match of the pattern. |
| * |
| * It is called by the other tokenize methods, and by the replace method. |
| * The caller is responsible for the deletion of the returned |
| * RefArrayVectorOf<XMLCh*> |
| */ |
| RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, |
| const int start, const int end, |
| RefVectorOf<Match> *subEx); |
| /** |
| * Replace helpers |
| * |
| * Note: the caller owns the XMLCh* that is returned |
| */ |
| const XMLCh *subInExp(const XMLCh* const repString, |
| const XMLCh* const origString, |
| const Match* subEx); |
| /** |
| * Converts a token tree into an operation tree |
| */ |
| void compile(const Token* const token); |
| Op* compile(const Token* const token, Op* const next, |
| const bool reverse); |
| /** |
| * Helper methods used by compile |
| */ |
| Op* compileSingle(const Token* const token, Op* const next, |
| const unsigned short tokType); |
| Op* compileUnion(const Token* const token, Op* const next, |
| const bool reverse); |
| Op* compileCondition(const Token* const token, Op* const next, |
| const bool reverse); |
| Op* compileParenthesis(const Token* const token, Op* const next, |
| const bool reverse); |
| Op* compileLook(const Token* const token, const Op* const next, |
| const bool reverse, const unsigned short tokType); |
| Op* compileConcat(const Token* const token, Op* const next, |
| const bool reverse); |
| Op* compileClosure(const Token* const token, Op* const next, |
| const bool reverse, const unsigned short tokType); |
| |
| // ----------------------------------------------------------------------- |
| // Private data members |
| // ----------------------------------------------------------------------- |
| bool fHasBackReferences; |
| bool fFixedStringOnly; |
| int fNoGroups; |
| int fMinLength; |
| int fNoClosures; |
| unsigned int fOptions; |
| BMPattern* fBMPattern; |
| XMLCh* fPattern; |
| XMLCh* fFixedString; |
| Op* fOperations; |
| Token* fTokenTree; |
| RangeToken* fFirstChar; |
| static RangeToken* fWordRange; |
| OpFactory fOpFactory; |
| TokenFactory* fTokenFactory; |
| MemoryManager* fMemoryManager; |
| }; |
| |
| |
| |
| // ----------------------------------------------------------------------- |
| // RegularExpression: Static initialize and cleanup methods |
| // ----------------------------------------------------------------------- |
| inline void RegularExpression::staticCleanup() |
| { |
| fWordRange = 0; |
| } |
| |
| // --------------------------------------------------------------------------- |
| // RegularExpression: Cleanup methods |
| // --------------------------------------------------------------------------- |
| inline void RegularExpression::cleanUp() { |
| |
| fMemoryManager->deallocate(fPattern);//delete [] fPattern; |
| fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; |
| delete fBMPattern; |
| delete fTokenFactory; |
| } |
| |
| // --------------------------------------------------------------------------- |
| // RegularExpression: Helper methods |
| // --------------------------------------------------------------------------- |
| inline bool RegularExpression::isSet(const int options, const int flag) { |
| |
| return (options & flag) == flag; |
| } |
| |
| inline Op* RegularExpression::compileLook(const Token* const token, |
| const Op* const next, |
| const bool reverse, |
| const unsigned short tokType) { |
| |
| Op* ret = 0; |
| Op* result = compile(token->getChild(0), 0, reverse); |
| |
| switch(tokType) { |
| case Token::T_LOOKAHEAD: |
| ret = fOpFactory.createLookOp(Op::O_LOOKAHEAD, next, result); |
| break; |
| case Token::T_NEGATIVELOOKAHEAD: |
| ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKAHEAD, next, result); |
| break; |
| case Token::T_LOOKBEHIND: |
| ret = fOpFactory.createLookOp(Op::O_LOOKBEHIND, next, result); |
| break; |
| case Token::T_NEGATIVELOOKBEHIND: |
| ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKBEHIND, next, result); |
| break; |
| case Token::T_INDEPENDENT: |
| ret = fOpFactory.createIndependentOp(next, result); |
| break; |
| case Token::T_MODIFIERGROUP: |
| ret = fOpFactory.createModifierOp(next, result, |
| ((const ModifierToken *) token)->getOptions(), |
| ((const ModifierToken *) token)->getOptionsMask()); |
| break; |
| } |
| |
| |
| return ret; |
| } |
| |
| inline Op* RegularExpression::compileSingle(const Token* const token, |
| Op* const next, |
| const unsigned short tokType) { |
| |
| Op* ret = 0; |
| |
| switch (tokType) { |
| case Token::T_DOT: |
| ret = fOpFactory.createDotOp(); |
| break; |
| case Token::T_CHAR: |
| ret = fOpFactory.createCharOp(token->getChar()); |
| break; |
| case Token::T_ANCHOR: |
| ret = fOpFactory.createAnchorOp(token->getChar()); |
| break; |
| case Token::T_RANGE: |
| case Token::T_NRANGE: |
| ret = fOpFactory.createRangeOp(token); |
| break; |
| case Token::T_EMPTY: |
| ret = next; |
| break; |
| case Token::T_STRING: |
| ret = fOpFactory.createStringOp(token->getString()); |
| break; |
| case Token::T_BACKREFERENCE: |
| ret = fOpFactory.createBackReferenceOp(token->getReferenceNo()); |
| break; |
| } |
| |
| if (tokType != Token::T_EMPTY) |
| ret->setNextOp(next); |
| |
| return ret; |
| } |
| |
| |
| inline Op* RegularExpression::compileUnion(const Token* const token, |
| Op* const next, |
| const bool reverse) { |
| |
| int tokSize = token->size(); |
| UnionOp* uniOp = fOpFactory.createUnionOp(tokSize); |
| |
| for (int i=0; i<tokSize; i++) { |
| |
| uniOp->addElement(compile(token->getChild(i), next, reverse)); |
| } |
| |
| return uniOp; |
| } |
| |
| |
| inline Op* RegularExpression::compileCondition(const Token* const token, |
| Op* const next, |
| const bool reverse) { |
| |
| Token* condTok = ((const ConditionToken*) token)->getConditionToken(); |
| Token* yesTok = token->getChild(0); |
| Token* noTok = token->getChild(1); |
| int refNo = token->getReferenceNo(); |
| Op* condOp = (condTok == 0) ? 0 : compile(condTok, 0, reverse); |
| Op* yesOp = compile(yesTok, next, reverse); |
| Op* noOp = (noTok == 0) ? 0 : compile(noTok, next, reverse); |
| |
| return fOpFactory.createConditionOp(next, refNo, condOp, yesOp, noOp); |
| } |
| |
| |
| inline Op* RegularExpression::compileParenthesis(const Token* const token, |
| Op* const next, |
| const bool reverse) { |
| |
| if (token->getNoParen() == 0) |
| return compile(token->getChild(0), next, reverse); |
| |
| Op* captureOp = 0; |
| |
| if (reverse) { |
| |
| captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next); |
| captureOp = compile(token->getChild(0), captureOp, reverse); |
| |
| return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp); |
| } |
| |
| captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next); |
| captureOp = compile(token->getChild(0), captureOp, reverse); |
| |
| return fOpFactory.createCaptureOp(token->getNoParen(), captureOp); |
| } |
| |
| inline Op* RegularExpression::compileConcat(const Token* const token, |
| Op* const next, |
| const bool reverse) { |
| |
| Op* ret = next; |
| int tokSize = token->size(); |
| |
| if (!reverse) { |
| |
| for (int i= tokSize - 1; i>=0; i--) { |
| ret = compile(token->getChild(i), ret, false); |
| } |
| } |
| else { |
| |
| for (int i= 0; i< tokSize; i++) { |
| ret = compile(token->getChild(i), ret, true); |
| } |
| } |
| |
| return ret; |
| } |
| |
| inline Op* RegularExpression::compileClosure(const Token* const token, |
| Op* const next, |
| const bool reverse, |
| const unsigned short tokType) { |
| |
| Op* ret = 0; |
| Token* childTok = token->getChild(0); |
| int min = token->getMin(); |
| int max = token->getMax(); |
| |
| if (min >= 0 && min == max) { |
| |
| ret = next; |
| for (int i=0; i< min; i++) { |
| ret = compile(childTok, ret, reverse); |
| } |
| |
| return ret; |
| } |
| |
| if (min > 0 && max > 0) |
| max -= min; |
| |
| if (max > 0) { |
| |
| ret = next; |
| for (int i=0; i<max; i++) { |
| |
| ChildOp* childOp = fOpFactory.createQuestionOp( |
| tokType == Token::T_NONGREEDYCLOSURE); |
| |
| childOp->setNextOp(next); |
| childOp->setChild(compile(childTok, ret, reverse)); |
| ret = childOp; |
| } |
| } |
| else { |
| |
| ChildOp* childOp = 0; |
| |
| if (tokType == Token::T_NONGREEDYCLOSURE) { |
| childOp = fOpFactory.createNonGreedyClosureOp(); |
| } |
| else { |
| |
| if (childTok->getMinLength() == 0) |
| childOp = fOpFactory.createClosureOp(fNoClosures++); |
| else |
| childOp = fOpFactory.createClosureOp(-1); |
| } |
| |
| childOp->setNextOp(next); |
| childOp->setChild(compile(childTok, childOp, reverse)); |
| ret = childOp; |
| } |
| |
| if (min > 0) { |
| |
| for (int i=0; i< min; i++) { |
| ret = compile(childTok, ret, reverse); |
| } |
| } |
| |
| return ret; |
| } |
| |
| inline int RegularExpression::matchModifier(Context* const context, |
| const Op* const op, int offset, |
| const short direction) |
| { |
| int saveOptions = fOptions; |
| fOptions |= (int) op->getData(); |
| fOptions &= (int) ~op->getData2(); |
| |
| int ret = match(context, op->getChild(), offset, direction); |
| |
| fOptions = saveOptions; |
| |
| return ret; |
| } |
| |
| inline unsigned short RegularExpression::getWordType(const XMLCh* const target |
| , const int begin |
| , const int end |
| , const int offset) |
| { |
| if (offset < begin || offset >= end) |
| return WT_OTHER; |
| |
| return getCharType(target[offset]); |
| } |
| |
| inline |
| unsigned short RegularExpression::getPreviousWordType(const XMLCh* const target |
| , const int start |
| , const int end |
| , int offset) |
| { |
| unsigned short ret = getWordType(target, start, end, --offset); |
| |
| while (ret == WT_IGNORE) { |
| ret = getWordType(target, start, end, --offset); |
| } |
| |
| return ret; |
| } |
| |
| XERCES_CPP_NAMESPACE_END |
| |
| #endif |
| /** |
| * End of file RegularExpression.hpp |
| */ |
| |