| /*------------------------------------------------------------------------------ |
| * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team |
| * |
| * Distributable under the terms of either the Apache License (Version 2.0) or |
| * the GNU Lesser General Public License, as specified in the COPYING file. |
| ------------------------------------------------------------------------------*/ |
| #include "CLucene/StdHeader.h" |
| #include "StandardTokenizer.h" |
| |
| CL_NS_USE(analysis) |
| CL_NS_USE(util) |
| CL_NS_DEF2(analysis,standard) |
| |
| const static TCHAR* tokenImageArray[] = { |
| _T("<EOF>"), |
| _T("<UNKNOWN>"), |
| _T("<ALPHANUM>"), |
| _T("<APOSTROPHE>"), |
| _T("<ACRONYM>"), |
| _T("<COMPANY>"), |
| _T("<EMAIL>"), |
| _T("<HOST>"), |
| _T("<NUM>"), |
| _T("<CJK>") |
| }; |
| const TCHAR** tokenImage = tokenImageArray; |
| |
| /* A bunch of shortcut macros, many of which make assumptions about variable |
| ** names. These macros enhance readability, not just convenience! */ |
| #define EOS (ch==-1 || rd->Eos()) |
| #define SPACE (_istspace((TCHAR)ch) != 0) |
| #define ALPHA (_istalpha((TCHAR)ch) != 0) |
| #define ALNUM (_istalnum(ch) != 0) |
| #define DIGIT (_istdigit(ch) != 0) |
| #define UNDERSCORE (ch == '_') |
| |
| #define _CJK ( (ch>=0x3040 && ch<=0x318f) || \ |
| (ch>=0x3300 && ch<=0x337f) || \ |
| (ch>=0x3400 && ch<=0x3d2d) || \ |
| (ch>=0x4e00 && ch<=0x9fff) || \ |
| (ch>=0xf900 && ch<=0xfaff) || \ |
| (ch>=0xac00 && ch<=0xd7af) ) //korean |
| |
| |
| #define DASH (ch == '-') |
| #define NEGATIVE_SIGN_ DASH |
| //#define POSITIVE_SIGN_ (ch == '+') |
| //#define SIGN (NEGATIVE_SIGN_ || POSITIVE_SIGN_) |
| |
| #define DOT (ch == '.') |
| #define DECIMAL DOT |
| |
| |
| //freebsd seems to have a problem with defines over multiple lines, so this has to be one long line |
| #define _CONSUME_AS_LONG_AS(conditionFails) while (true) { ch = readChar(); if (ch==-1 || (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);} |
| |
| #define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA) |
| |
| #define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT) |
| |
| /* otherMatches is a condition (possibly compound) under which a character |
| ** that's not an ALNUM or UNDERSCORE can be considered not to break the |
| ** span. Callers should pass false if only ALNUM/UNDERSCORE are acceptable. */ |
| #define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE) |
| |
| /* |
| ** Consume CJK characters |
| */ |
| #define CONSUME_CJK _CONSUME_AS_LONG_AS(_CJK) |
| |
| |
| /* It is considered that "nothing of value" has been read if: |
| ** a) The "read head" hasn't moved since specialCharPos was established. |
| ** or |
| ** b) The "read head" has moved by one character, but that character was |
| ** either whitespace or not among the characters found in the body of |
| ** a token (deliberately doesn't include the likes of '@'/'&'). */ |
| #define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) ))) |
| |
| #define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1]) |
| #define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c) |
| /* To discard the last character in a StringBuffer, we decrement the buffer's |
| ** length indicator and move the terminator back by one character. */ |
| #define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0') |
| |
| //#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) { TCHAR* sbBuf = sb.getBuffer(); for (int32_t i = sb.len-1; i >= 0; i--) { TCHAR c = sbBuf[i]; if (charMatchesCondition) { sbBuf[--sb.len] = '\0'; } else {break;}}} |
| |
| /* Does StringBuffer sb contain any of the characters in string ofThese? */ |
| #define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len)) |
| |
| |
| StandardTokenizer::StandardTokenizer(Reader* reader): |
| rd(_CLNEW FastCharStream(reader)), |
| /* rdPos is zero-based. It starts at -1, and will advance to the first |
| ** position when readChar() is first called. */ |
| rdPos(-1), |
| tokenStart(-1) |
| { |
| } |
| |
| StandardTokenizer::~StandardTokenizer() { |
| _CLDELETE(rd); |
| } |
| |
| int StandardTokenizer::readChar() { |
| /* Increment by 1 because we're speaking in terms of characters, not |
| ** necessarily bytes: */ |
| rdPos++; |
| return rd->GetNext(); |
| } |
| |
| void StandardTokenizer::unReadChar() { |
| rd->UnGet(); |
| rdPos--; |
| } |
| |
| inline bool StandardTokenizer::setToken(Token* t, StringBuffer* sb, TokenTypes tokenCode) { |
| t->setStartOffset(tokenStart); |
| t->setEndOffset(tokenStart+sb->length()); |
| t->setType(tokenImage[tokenCode]); |
| sb->getBuffer(); //null terminates the buffer |
| t->resetTermTextLen(); |
| return true; |
| } |
| |
| bool StandardTokenizer::next(Token* t) { |
| int ch=0; |
| while (!EOS) { |
| ch = readChar(); |
| |
| if ( ch == 0 || ch == -1 ){ |
| continue; |
| } else if (SPACE) { |
| continue; |
| } else if (ALPHA || UNDERSCORE) { |
| tokenStart = rdPos; |
| return ReadAlphaNum(ch,t); |
| } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) { |
| tokenStart = rdPos; |
| /* ReadNumber returns NULL if it fails to extract a valid number; in |
| ** that case, we just continue. */ |
| if (ReadNumber(NULL, ch,t)) |
| return true; |
| } else if ( _CJK ){ |
| if ( ReadCJK(ch,t) ) |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| bool StandardTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) { |
| /* previousNumber is only non-NULL if this function already read a complete |
| ** number in a previous recursion, yet has been asked to read additional |
| ** numeric segments. For example, in the HOST "192.168.1.3", "192.168" is |
| ** a complete number, but this function will recurse to read the "1.3", |
| ** generating a single HOST token "192.168.1.3". */ |
| t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word |
| StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText |
| TokenTypes tokenType; |
| bool decExhausted; |
| if (previousNumber != NULL) { |
| str.prepend(previousNumber); |
| tokenType = CL_NS2(analysis,standard)::HOST; |
| decExhausted = false; |
| } else { |
| tokenType = CL_NS2(analysis,standard)::NUM; |
| decExhausted = (prev == '.'); |
| } |
| if ( str.len >= LUCENE_MAX_WORD_LEN ){ |
| //if a number is too long, i would say there is no point |
| //storing it, because its going to be the wrong number anyway? |
| //what do people think? |
| return false; |
| } |
| str.appendChar(prev); |
| |
| const bool signExhausted = (prev == '-'); |
| int ch = prev; |
| |
| CONSUME_DIGITS; |
| |
| if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */ |
| && ( |
| (signExhausted && !DECIMAL) |
| || (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */) |
| ) |
| ) |
| { |
| /* We have either: |
| ** a) a negative sign that's not followed by either digit(s) or a decimal |
| ** b) a decimal that's not followed by digit(s) |
| ** so this is not a valid number. */ |
| if (!EOS) { |
| /* Unread the character that stopped CONSUME_DIGITS: */ |
| unReadChar(); |
| } |
| return false; |
| } |
| |
| /* We just read a group of digits. Is it followed by a decimal symbol, |
| ** implying that there might be another group of digits available? */ |
| if (!EOS) { |
| if (DECIMAL) { |
| if ( str.len >= LUCENE_MAX_WORD_LEN ) |
| return false; //read above for rationale |
| str.appendChar(ch); |
| } else { |
| unReadChar(); |
| goto SUCCESSFULLY_EXTRACTED_NUMBER; |
| } |
| |
| CONSUME_DIGITS; |
| if (!DIGIT && !DECIMAL) { |
| unReadChar(); |
| } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) { |
| /* We just read the fractional digit group, but it's also followed by |
| ** a decimal symbol and at least one more digit, so this must be a |
| ** HOST rather than a real number. */ |
| return ReadNumber(str.getBuffer(), '.',t); |
| } |
| } |
| |
| SUCCESSFULLY_EXTRACTED_NUMBER: |
| TCHAR rightmost = RIGHTMOST(str); |
| /* Don't including a trailing decimal point. */ |
| if (rightmost == '.') { |
| SHAVE_RIGHTMOST(str); |
| unReadChar(); |
| rightmost = RIGHTMOST(str); |
| } |
| /* If all we have left is a negative sign, it's not a valid number. */ |
| if (rightmost == '-') { |
| CND_PRECONDITION (str.len == 1, "Number is invalid"); |
| return false; |
| } |
| |
| return setToken(t,&str,tokenType); |
| } |
| |
| bool StandardTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) { |
| t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word |
| StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText |
| if ( str.len < LUCENE_MAX_WORD_LEN ){ |
| str.appendChar(prev); |
| int ch = prev; |
| |
| CONSUME_WORD; |
| if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character? |
| switch(ch) { /* What follows the first alphanum segment? */ |
| case '.': |
| str.appendChar('.'); |
| return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t); |
| case '\'': |
| str.appendChar('\''); |
| return ReadApostrophe(&str,t); |
| case '@': |
| str.appendChar('@'); |
| return ReadAt(&str,t); |
| case '&': |
| str.appendChar('&'); |
| return ReadCompany(&str,t); |
| /* default: fall through to end of this function. */ |
| } |
| } |
| } |
| return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM); |
| } |
| |
| bool StandardTokenizer::ReadCJK(const TCHAR prev, Token* t) { |
| t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word |
| StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText |
| if ( str.len < LUCENE_MAX_WORD_LEN ){ |
| str.appendChar(prev); |
| int ch = prev; |
| |
| CONSUME_CJK; |
| } |
| return setToken(t,&str,CL_NS2(analysis,standard)::CJK); |
| } |
| |
| |
| bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) { |
| const int32_t specialCharPos = rdPos; |
| StringBuffer& str=*_str; |
| |
| /* A segment of a "dotted" is not allowed to begin with another dot or a dash. |
| ** Even though hosts, e-mail addresses, etc., could have a dotted-segment |
| ** that begins with a dot or a dash, it's far more common in source text |
| ** for a pattern like "abc.--def" to be intended as two tokens. */ |
| int ch = rd->Peek(); |
| if (!(DOT || DASH)) { |
| bool prevWasDot; |
| bool prevWasDash; |
| if (str.len == 0) { |
| prevWasDot = false; |
| prevWasDash = false; |
| } else { |
| prevWasDot = RIGHTMOST(str) == '.'; |
| prevWasDash = RIGHTMOST(str) == '-'; |
| } |
| while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { |
| ch = readChar(); |
| const bool dot = ch == '.'; |
| const bool dash = ch == '-'; |
| |
| if (!(ALNUM || UNDERSCORE || dot || dash)) { |
| break; |
| } |
| /* Multiple dots or dashes in succession end the token. |
| ** Consider the following inputs: |
| ** "Visit windowsupdate.microsoft.com--update today!" |
| ** "In the U.S.A.--yes, even there!" */ |
| if ((dot || dash) && (prevWasDot || prevWasDash)) { |
| /* We're not going to append the character we just read, in any case. |
| ** As to the character before it (which is currently RIGHTMOST(str)): |
| ** Unless RIGHTMOST(str) is a dot, in which we need to save it so the |
| ** acronym-versus-host detection can work, we want to get rid of it. */ |
| if (!prevWasDot) { |
| SHAVE_RIGHTMOST(str); |
| } |
| break; |
| } |
| |
| str.appendChar(ch); |
| |
| prevWasDot = dot; |
| prevWasDash = dash; |
| } |
| } |
| |
| /* There's a potential StringBuffer.append call in the code above, which |
| ** could cause str to reallocate its internal buffer. We must wait to |
| ** obtain the optimization-oriented strBuf pointer until after the initial |
| ** potentially realloc-triggering operations on str. |
| ** Because there can be other such ops much later in this function, strBuf |
| ** is guarded within a block to prevent its use during or after the calls |
| ** that would potentially invalidate it. */ |
| { /* Begin block-guard of strBuf */ |
| TCHAR* strBuf = str.getBuffer(); |
| |
| bool rightmostIsDot = RIGHTMOST_IS(str, '.'); |
| if (CONSUMED_NOTHING_OF_VALUE) { |
| /* No more alphanums available for this token; shave trailing dot, if any. */ |
| if (rightmostIsDot) { |
| SHAVE_RIGHTMOST(str); |
| } |
| /* If there are no dots remaining, this is a generic ALPHANUM. */ |
| if (_tcschr(strBuf, '.') == NULL) { |
| forcedType = CL_NS2(analysis,standard)::ALPHANUM; |
| } |
| |
| /* Check the token to see if it's an acronym. An acronym must have a |
| ** letter in every even slot and a dot in every odd slot, including the |
| ** last slot (for example, "U.S.A."). */ |
| } else if (rightmostIsDot) { |
| bool isAcronym = true; |
| const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */ |
| |
| for (int32_t i = 0; i < upperCheckLimit; i++) { |
| const bool even = (i % 2 == 0); |
| ch = strBuf[i]; |
| if ( (even && !ALPHA) || (!even && !DOT) ) { |
| isAcronym = false; |
| break; |
| } |
| } |
| if (isAcronym) { |
| forcedType = CL_NS2(analysis,standard)::ACRONYM; |
| } else { |
| /* If it's not an acronym, we don't want the trailing dot. */ |
| SHAVE_RIGHTMOST(str); |
| /* If there are no dots remaining, this is a generic ALPHANUM. */ |
| if (_tcschr(strBuf, '.') == NULL) { |
| forcedType = CL_NS2(analysis,standard)::ALPHANUM; |
| } |
| } |
| } |
| } /* End block-guard of strBuf */ |
| |
| if (!EOS) { |
| if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) { |
| str.appendChar('@'); |
| return ReadAt(&str,t); |
| } else { |
| unReadChar(); |
| } |
| } |
| |
| return setToken(t,&str,CL_NS2(analysis,standard)::UNKNOWN |
| ? forcedType : CL_NS2(analysis,standard)::HOST); |
| } |
| |
| bool StandardTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) { |
| StringBuffer& str=*_str; |
| |
| TokenTypes tokenType = CL_NS2(analysis,standard)::APOSTROPHE; |
| const int32_t specialCharPos = rdPos; |
| int ch=0; |
| |
| CONSUME_ALPHAS; |
| if (RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE) { |
| /* After the apostrophe, no more alphanums were available within this |
| ** token; shave trailing apostrophe and revert to generic ALPHANUM. */ |
| SHAVE_RIGHTMOST(str); |
| tokenType = CL_NS2(analysis,standard)::ALPHANUM; |
| } |
| if (!EOS) { |
| unReadChar(); |
| } |
| |
| return setToken(t,&str,tokenType); |
| } |
| |
| bool StandardTokenizer::ReadAt(StringBuffer* str, Token* t) { |
| ReadDotted(str, CL_NS2(analysis,standard)::EMAIL,t); |
| /* JLucene grammar indicates dots/digits not allowed in company name: */ |
| if (!CONTAINS_ANY((*str), ".0123456789")) { |
| setToken(t,str,CL_NS2(analysis,standard)::COMPANY); |
| } |
| return true; |
| } |
| |
| bool StandardTokenizer::ReadCompany(StringBuffer* _str, Token* t) { |
| StringBuffer& str = *_str; |
| const int32_t specialCharPos = rdPos; |
| int ch=0; |
| |
| CONSUME_WORD; |
| if (CONSUMED_NOTHING_OF_VALUE) { |
| /* After the ampersand, no more alphanums were available within this |
| ** token; shave trailing ampersand and revert to ALPHANUM. */ |
| CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed"); |
| SHAVE_RIGHTMOST(str); |
| |
| |
| return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM); |
| } |
| if (!EOS) { |
| unReadChar(); |
| } |
| |
| return setToken(t,&str,CL_NS2(analysis,standard)::COMPANY); |
| } |
| |
| CL_NS_END2 |