blob: 159915b26aeb09acc026fd8421bcc0dd98fe40a7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Id: UnicodeRangeFactory.cpp 568078 2007-08-21 11:43:25Z amassari $
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/regx/UnicodeRangeFactory.hpp>
#include <xercesc/util/regx/TokenFactory.hpp>
#include <xercesc/util/regx/RangeToken.hpp>
#include <xercesc/util/regx/RangeTokenMap.hpp>
#include <xercesc/util/regx/RegxDefs.hpp>
#include <xercesc/util/regx/XMLUniCharacter.hpp>
XERCES_CPP_NAMESPACE_BEGIN
// ---------------------------------------------------------------------------
// Local data
// ---------------------------------------------------------------------------
const int UNICATEGSIZE = 37;
const unsigned short CHAR_LETTER = 30;
const unsigned short CHAR_MARK = 31;
const unsigned short CHAR_NUMBER = 32;
const unsigned short CHAR_SEPARATOR = 33;
const unsigned short CHAR_OTHER = 34;
const unsigned short CHAR_PUNCTUATION = 35;
const unsigned short CHAR_SYMBOL = 36;
const XMLCh uniCategNames[][3] =
{
{chLatin_C, chLatin_n, chNull},
{chLatin_L, chLatin_u, chNull},
{chLatin_L, chLatin_l, chNull},
{chLatin_L, chLatin_t, chNull},
{chLatin_L, chLatin_m, chNull},
{chLatin_L, chLatin_o, chNull},
{chLatin_M, chLatin_n, chNull},
{chLatin_M, chLatin_e, chNull},
{chLatin_M, chLatin_c, chNull},
{chLatin_N, chLatin_d, chNull},
{chLatin_N, chLatin_l, chNull},
{chLatin_N, chLatin_o, chNull},
{chLatin_Z, chLatin_s, chNull},
{chLatin_Z, chLatin_l, chNull},
{chLatin_Z, chLatin_p, chNull},
{chLatin_C, chLatin_c, chNull},
{chLatin_C, chLatin_f, chNull},
{chLatin_C, chLatin_o, chNull},
{chLatin_C, chLatin_s, chNull},
{chLatin_P, chLatin_d, chNull},
{chLatin_P, chLatin_s, chNull},
{chLatin_P, chLatin_e, chNull},
{chLatin_P, chLatin_c, chNull},
{chLatin_P, chLatin_o, chNull},
{chLatin_S, chLatin_m, chNull},
{chLatin_S, chLatin_c, chNull},
{chLatin_S, chLatin_k, chNull},
{chLatin_S, chLatin_o, chNull},
{chLatin_P, chLatin_i, chNull},
{chLatin_P, chLatin_f, chNull},
{chLatin_L, chNull},
{chLatin_M, chNull},
{chLatin_N, chNull},
{chLatin_Z, chNull},
{chLatin_C, chNull},
{chLatin_P, chNull},
{chLatin_S, chNull},
};
// ---------------------------------------------------------------------------
// UnicodeRangeFactory: Constructors and Destructor
// ---------------------------------------------------------------------------
UnicodeRangeFactory::UnicodeRangeFactory()
{
}
UnicodeRangeFactory::~UnicodeRangeFactory() {
}
// ---------------------------------------------------------------------------
// UnicodeRangeFactory: Range creation methods
// ---------------------------------------------------------------------------
void UnicodeRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) {
if (fRangesCreated)
return;
if (!fKeywordsInitialized) {
initializeKeywordMap(rangeTokMap);
}
TokenFactory* tokFactory = rangeTokMap->getTokenFactory();
RangeToken* ranges[UNICATEGSIZE];
RangeToken* tok;
for (int i=0; i < UNICATEGSIZE; i++) {
ranges[i] = tokFactory->createRange();
}
for (int j=0; j < 0x10000; j++) {
unsigned short charType = XMLUniCharacter::getType(j);
ranges[charType]->addRange(j, j);
charType = getUniCategory(charType);
ranges[charType]->addRange(j, j);
}
ranges[XMLUniCharacter::UNASSIGNED]->addRange(0x10000, Token::UTF16_MAX);
for (int k=0; k < UNICATEGSIZE; k++) {
tok = (RangeToken*) RangeToken::complementRanges(ranges[k], tokFactory);
// build the internal map.
tok->createMap();
rangeTokMap->setRangeToken(uniCategNames[k], ranges[k]);
rangeTokMap->setRangeToken(uniCategNames[k], tok , true);
}
// Create all range
tok = tokFactory->createRange();
tok->addRange(0, Token::UTF16_MAX);
// build the internal map.
tok->createMap();
rangeTokMap->setRangeToken(fgUniAll, tok);
// Create alpha range
tok = tokFactory->createRange();
tok->mergeRanges(ranges[XMLUniCharacter::UPPERCASE_LETTER]);
tok->mergeRanges(ranges[XMLUniCharacter::LOWERCASE_LETTER]);
tok->mergeRanges(ranges[XMLUniCharacter::OTHER_LETTER]);
// build the internal map.
tok->createMap();
rangeTokMap->setRangeToken(fgUniIsAlpha, tok);
// Create alpha-num range
RangeToken* alnumTok = tokFactory->createRange();
alnumTok->mergeRanges(tok);
alnumTok->mergeRanges(ranges[XMLUniCharacter::DECIMAL_DIGIT_NUMBER]);
// build the internal map.
alnumTok->createMap();
rangeTokMap->setRangeToken(fgUniIsAlnum, alnumTok);
// Create word range
tok = tokFactory->createRange();
tok->mergeRanges(alnumTok);
tok->addRange(chUnderscore, chUnderscore);
// build the internal map.
tok->createMap();
rangeTokMap->setRangeToken(fgUniIsWord, tok);
tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
// build the internal map.
tok->createMap();
rangeTokMap->setRangeToken(fgUniIsWord, tok , true);
// Create assigned range
tok = (RangeToken*)RangeToken::complementRanges(
ranges[XMLUniCharacter::UNASSIGNED],
tokFactory,
tokFactory->getMemoryManager());
// build the internal map.
tok->createMap();
rangeTokMap->setRangeToken(fgUniAssigned,tok);
// Create space range
tok = tokFactory->createRange();
tok->mergeRanges(ranges[XMLUniCharacter::SPACE_SEPARATOR]);
tok->mergeRanges(ranges[XMLUniCharacter::LINE_SEPARATOR]);
//tok->mergeRanges(ranges[XMLUniCharacter::PARAGRAPH_SEPARATOR]);
// build the internal map.
tok->createMap();
rangeTokMap->setRangeToken(fgUniIsSpace, tok);
tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
// build the internal map.
tok->createMap();
rangeTokMap->setRangeToken(fgUniIsSpace, tok , true);
RangeToken* const dummyToken =
tokFactory->createRange();
dummyToken->addRange(-1, -2);
dummyToken->createMap();
// build the internal maps.
for (int l=0; l < UNICATEGSIZE; l++) {
ranges[l]->createMap();
ranges[l]->setCaseInsensitiveToken(dummyToken);
}
fRangesCreated = true;
}
// ---------------------------------------------------------------------------
// UnicodeRangeFactory: Initialization methods
// ---------------------------------------------------------------------------
void UnicodeRangeFactory::initializeKeywordMap(RangeTokenMap *rangeTokMap) {
if (fKeywordsInitialized)
return;
for (int k=0; k < UNICATEGSIZE; k++) {
rangeTokMap->addKeywordMap(uniCategNames[k], fgUnicodeCategory);
}
rangeTokMap->addKeywordMap(fgUniAll, fgUnicodeCategory);
rangeTokMap->addKeywordMap(fgUniIsAlpha, fgUnicodeCategory);
rangeTokMap->addKeywordMap(fgUniIsAlnum, fgUnicodeCategory);
rangeTokMap->addKeywordMap(fgUniIsWord, fgUnicodeCategory);
rangeTokMap->addKeywordMap(fgUniAssigned, fgUnicodeCategory);
rangeTokMap->addKeywordMap(fgUniIsSpace, fgUnicodeCategory);
fKeywordsInitialized = true;
}
// ---------------------------------------------------------------------------
// UnicodeRangeFactory: Helper methods
// ---------------------------------------------------------------------------
unsigned short UnicodeRangeFactory::getUniCategory(const unsigned short type)
{
switch(type) {
case XMLUniCharacter::UPPERCASE_LETTER:
case XMLUniCharacter::LOWERCASE_LETTER:
case XMLUniCharacter::TITLECASE_LETTER:
case XMLUniCharacter::MODIFIER_LETTER:
case XMLUniCharacter::OTHER_LETTER:
return CHAR_LETTER;
case XMLUniCharacter::NON_SPACING_MARK:
case XMLUniCharacter::COMBINING_SPACING_MARK:
case XMLUniCharacter::ENCLOSING_MARK:
return CHAR_MARK;
case XMLUniCharacter::DECIMAL_DIGIT_NUMBER:
case XMLUniCharacter::LETTER_NUMBER:
case XMLUniCharacter::OTHER_NUMBER:
return CHAR_NUMBER;
case XMLUniCharacter::SPACE_SEPARATOR:
case XMLUniCharacter::LINE_SEPARATOR:
case XMLUniCharacter::PARAGRAPH_SEPARATOR:
return CHAR_SEPARATOR;
case XMLUniCharacter::CONTROL:
case XMLUniCharacter::FORMAT:
case XMLUniCharacter::SURROGATE:
case XMLUniCharacter::PRIVATE_USE:
case XMLUniCharacter::UNASSIGNED:
return CHAR_OTHER;
case XMLUniCharacter::CONNECTOR_PUNCTUATION:
case XMLUniCharacter::DASH_PUNCTUATION:
case XMLUniCharacter::START_PUNCTUATION:
case XMLUniCharacter::END_PUNCTUATION:
case XMLUniCharacter::OTHER_PUNCTUATION:
case XMLUniCharacter::INITIAL_PUNCTUATION:
case XMLUniCharacter::FINAL_PUNCTUATION:
return CHAR_PUNCTUATION;
case XMLUniCharacter::MATH_SYMBOL:
case XMLUniCharacter::CURRENCY_SYMBOL:
case XMLUniCharacter::MODIFIER_SYMBOL:
case XMLUniCharacter::OTHER_SYMBOL:
return CHAR_SYMBOL;
}
return 0;
}
XERCES_CPP_NAMESPACE_END
/**
* End of file UnicodeRangeFactory.cpp
*/