Mac-4.7.4/src/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp - platform/external/qt - Git at Google

 /*------------------------------------------------------------------------------
 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
 *
 * Distributable under the terms of either the Apache License (Version 2.0) or
 * the GNU Lesser General Public License, as specified in the COPYING file.
 ------------------------------------------------------------------------------*/
 #include "CLucene/StdHeader.h"
 #include "StandardTokenizer.h"

 CL_NS_USE(analysis)
 CL_NS_USE(util)
 CL_NS_DEF2(analysis,standard)

   /* Printable labels for the token types.  setToken() looks an entry up with
   ** tokenImage[tokenCode], so the position of each label in this table is
   ** significant. */
   static const TCHAR* tokenImageArray[] = {
     _T("<EOF>"),          /* slot 0 */
     _T("<UNKNOWN>"),      /* slot 1 */
     _T("<ALPHANUM>"),     /* slot 2 */
     _T("<APOSTROPHE>"),   /* slot 3 */
     _T("<ACRONYM>"),      /* slot 4 */
     _T("<COMPANY>"),      /* slot 5 */
     _T("<EMAIL>"),        /* slot 6 */
     _T("<HOST>"),         /* slot 7 */
     _T("<NUM>"),          /* slot 8 */
     _T("<CJK>")           /* slot 9 */
   };
   /* Non-static alias through which the table is reached. */
   const TCHAR** tokenImage = tokenImageArray;

   /* A bunch of shortcut macros, many of which make assumptions about variable
   ** names.  These macros enhance readability, not just convenience!
   **
   ** Names the expansions expect to find in scope at the point of use:
   **   ch             - int holding the character most recently read/peeked
   **   rd             - the tokenizer's character stream
   **   str            - the StringBuffer the current token is built in
   **   rdPos          - the tokenizer's read position
   **   specialCharPos - read position captured on entry to a Read* helper */

   /* True once -1 has been read or the stream itself reports end of input. */
   #define EOS           (ch==-1 || rd->Eos())
   /* Character-class tests on ch. */
   #define SPACE         (_istspace((TCHAR)ch) != 0)
   #define ALPHA         (_istalpha((TCHAR)ch) != 0)
   #define ALNUM         (_istalnum(ch) != 0)
   #define DIGIT         (_istdigit(ch) != 0)
   #define UNDERSCORE    (ch == '_')

   /* True when ch lies in one of the code-point ranges this tokenizer treats
   ** as CJK text (the last range is flagged as Korean by the original author). */
   #define _CJK			(  (ch>=0x3040 && ch<=0x318f) || \
   						   (ch>=0x3300 && ch<=0x337f) || \
   						   (ch>=0x3400 && ch<=0x3d2d) || \
   						   (ch>=0x4e00 && ch<=0x9fff) || \
   						   (ch>=0xf900 && ch<=0xfaff) || \
   						   (ch>=0xac00 && ch<=0xd7af) ) //korean


   #define DASH          (ch == '-')
   /* A leading '-' is the only sign recognised; '+' support is disabled below. */
   #define NEGATIVE_SIGN_ DASH
   //#define POSITIVE_SIGN_ (ch == '+')
   //#define SIGN          (NEGATIVE_SIGN_ || POSITIVE_SIGN_)

   #define DOT             (ch == '.')
   /* The decimal separator is the same character as DOT. */
   #define DECIMAL         DOT


   /* Core consumption loop.  Despite the parameter's name, `conditionFails` is
   ** the condition that must HOLD for a character to be consumed.  Each pass
   ** reads one character into ch and stops, without appending it, when:
   **   - the character is -1 (end of input), or
   **   - the condition is false for it, or
   **   - str already holds LUCENE_MAX_WORD_LEN characters.
   ** Otherwise the character is appended to str.  The character that ended
   ** the loop has been read from the stream and is left in ch; the macro does
   ** not push it back. */
   //freebsd seems to have a problem with defines over multiple lines, so this has to be one long line
   #define _CONSUME_AS_LONG_AS(conditionFails) while (true) { ch = readChar(); if (ch==-1 || (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);}

   /* Consume a run of alphabetic characters. */
   #define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)

   /* Consume a run of digits. */
   #define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)

   /* Consume a run of word characters, i.e. characters that are ALNUM or
   ** UNDERSCORE.  The macro takes no arguments; nothing else extends the
   ** run. */
   #define CONSUME_WORD                  _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE)

   /*
   ** Consume CJK characters
   */
   #define CONSUME_CJK                   _CONSUME_AS_LONG_AS(_CJK)


   /* It is considered that "nothing of value" has been read if:
   ** a) The "read head" hasn't moved since specialCharPos was established.
   ** or
   ** b) The "read head" has moved by one character, but that character was
   **    either whitespace or not among the characters found in the body of
   **    a token (deliberately doesn't include the likes of '@'/'&'). */
   #define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) )))

   /* Last character of StringBuffer sb.  Indexes sb.len-1, so sb must not be
   ** empty when this is evaluated. */
   #define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
   #define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)
   /* To discard the last character in a StringBuffer, we decrement the buffer's
   ** length indicator and move the terminator back by one character. */
   #define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')

   /* Disabled helper, kept for reference: */
   //#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) { TCHAR* sbBuf = sb.getBuffer(); for (int32_t i = sb.len-1; i >= 0; i--) { TCHAR c = sbBuf[i]; if (charMatchesCondition) { sbBuf[--sb.len] = '\0'; } else {break;}}}

   /* Does StringBuffer sb contain any of the characters in string ofThese?
   ** ofThese must be a string literal because it is wrapped in _T(). */
   #define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))


   /* Builds a tokenizer over `reader` by wrapping it in a newly allocated
   ** FastCharStream.  rdPos is zero-based and begins at -1 so that the first
   ** readChar() call moves it onto position 0; tokenStart is likewise -1
   ** until next() records where a token begins. */
   StandardTokenizer::StandardTokenizer(Reader* reader)
     : rd(_CLNEW FastCharStream(reader))
     , rdPos(-1)
     , tokenStart(-1)
   {}

   /* Disposes of the FastCharStream that the constructor allocated. */
   StandardTokenizer::~StandardTokenizer()
   {
     _CLDELETE(rd);
   }

   /* Advances the read position by one and returns the next character from
   ** the stream.  The position counts characters, which are not necessarily
   ** bytes.  Callers in this file treat a return value of -1 as end of input. */
   int StandardTokenizer::readChar() {
     ++rdPos;
     const int next = rd->GetNext();
     return next;
   }

   /* Undoes one readChar(): hands the last character back to the stream and
   ** moves the read position back by one. */
   void StandardTokenizer::unReadChar() {
     rd->UnGet();
     --rdPos;
   }

   /* Finalises token t from the text accumulated in sb.
   **   t         - token to fill in
   **   sb        - buffer holding the token text
   **   tokenCode - token type; used as an index into tokenImage
   ** Always returns true, so callers can write `return setToken(...)`. */
   inline bool StandardTokenizer::setToken(Token* t, StringBuffer* sb, TokenTypes tokenCode) {
     /* The token starts where next() recorded tokenStart and spans
     ** sb->length() characters. */
     t->setStartOffset(tokenStart);
     t->setEndOffset(tokenStart + sb->length());

     t->setType(tokenImage[tokenCode]);

     /* getBuffer() null-terminates the buffer; do that before asking the
     ** token to recompute its term length. */
     sb->getBuffer();
     t->resetTermTextLen();

     return true;
   }

   /* Scans forward for the next token and stores it in t.
   **
   ** Characters are read one at a time.  NUL, -1 and whitespace are skipped.
   ** The first other character decides which reader handles the token:
   **   letter or '_'         -> ReadAlphaNum
   **   digit, '-' or '.'     -> ReadNumber (scanning continues if it fails)
   **   character in _CJK     -> ReadCJK
   ** Any other character is skipped.
   **
   ** tokenStart is set to the position of that first character before the
   ** reader is called, because setToken() derives both token offsets from it.
   **
   ** Returns true when a token was produced, false once input is exhausted. */
   bool StandardTokenizer::next(Token* t) {
     int ch=0;
     while (!EOS) {
       ch = readChar();

       if ( ch == 0 || ch == -1 ){
         continue;
       } else if (SPACE) {
         continue;
       } else if (ALPHA || UNDERSCORE) {
         tokenStart = rdPos;
         return ReadAlphaNum(ch,t);
       } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
         tokenStart = rdPos;
         /* ReadNumber returns false if it fails to extract a valid number; in
         ** that case, we just continue. */
         if (ReadNumber(NULL, ch,t))
           return true;
       } else if ( _CJK ){
         /* Record the start of the CJK run; without this the token would be
         ** reported with the offsets of an earlier token (or -1). */
         tokenStart = rdPos;
         if ( ReadCJK(ch,t) )
           return true;
       }
     }
     return false;
   }

   /* Reads a numeric token whose first character, prev, has already been
   ** taken from the stream by the caller.
   **   previousNumber - NULL for a fresh number; otherwise the text gathered
   **                    so far, in which case the token is typed HOST
   **   prev           - the already-read first character ('-', '.' or a digit
   **                    when called from next(); '.' when recursing)
   **   t              - token to fill in
   ** Returns true after setToken() with type NUM or HOST.  Returns false when
   ** the text is not a valid number or would reach LUCENE_MAX_WORD_LEN. */
   bool StandardTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
     /* previousNumber is only non-NULL if this function already read a complete
     ** number in a previous recursion, yet has been asked to read additional
     ** numeric segments.  For example, in the HOST "192.168.1.3", "192.168" is
     ** a complete number, but this function will recurse to read the "1.3",
     ** generating a single HOST token "192.168.1.3". */
     t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
     StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
     TokenTypes tokenType;
     /* decExhausted: the decimal point has already been used, so a digit must
     ** follow for this to be a number. */
     bool decExhausted;
     if (previousNumber != NULL) {
       /* NOTE(review): on recursion previousNumber is str.getBuffer() of the
       ** calling frame, and this frame's str is built over t->_termText as
       ** well -- confirm that StringBuffer::prepend copes with that. */
       str.prepend(previousNumber);
       tokenType = CL_NS2(analysis,standard)::HOST;
       decExhausted = false;
     } else {
       tokenType = CL_NS2(analysis,standard)::NUM;
       decExhausted = (prev == '.');
     }
 	if (  str.len >= LUCENE_MAX_WORD_LEN ){
 		//if a number is too long, i would say there is no point
 		//storing it, because its going to be the wrong number anyway?
 		//what do people think?
 		return false;
 	}
     str.appendChar(prev);

     /* signExhausted: the token began with '-', so no further sign is allowed. */
     const bool signExhausted = (prev == '-');
     int ch = prev;

     /* First digit group.  Afterwards ch holds the character that ended it. */
     CONSUME_DIGITS;

     if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */
         && (
                 (signExhausted && !DECIMAL)
              || (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */)
            )
        )
     {
       /* We have either:
       **   a) a negative sign that's not followed by either digit(s) or a decimal
       **   b) a decimal that's not followed by digit(s)
       ** so this is not a valid number. */
       if (!EOS) {
         /* Unread the character that stopped CONSUME_DIGITS: */
         unReadChar();
       }
       return false;
     }

     /* We just read a group of digits.  Is it followed by a decimal symbol,
     ** implying that there might be another group of digits available? */
     if (!EOS) {
       if (DECIMAL) {
 		if (  str.len >= LUCENE_MAX_WORD_LEN )
 			return false; //read above for rationale
         str.appendChar(ch);
       } else {
         /* Not a decimal point: give the character back and finish. */
         unReadChar();
         goto SUCCESSFULLY_EXTRACTED_NUMBER;
       }

       /* Second (fractional) digit group. */
       CONSUME_DIGITS;
       if (!DIGIT && !DECIMAL) {
         unReadChar();
       } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) {
         /* We just read the fractional digit group, but it's also followed by
         ** a decimal symbol and at least one more digit, so this must be a
         ** HOST rather than a real number. */
         return ReadNumber(str.getBuffer(), '.',t);
       }
     }

     SUCCESSFULLY_EXTRACTED_NUMBER:
     TCHAR rightmost = RIGHTMOST(str);
     /* Don't include a trailing decimal point. */
     if (rightmost == '.') {
       SHAVE_RIGHTMOST(str);
       unReadChar();
       rightmost = RIGHTMOST(str);
     }
     /* If all we have left is a negative sign, it's not a valid number. */
     if (rightmost == '-') {
       CND_PRECONDITION (str.len == 1, "Number is invalid");
       return false;
     }

 	return setToken(t,&str,tokenType);
   }

   /* Reads a token that begins with a word character.
   **   prev - the first character, already taken from the stream by the caller
   **   t    - token to fill in
   ** A run of ALNUM/UNDERSCORE characters is gathered first.  If the character
   ** that ended the run is '.', '\'', '@' or '&' (and there is both more input
   ** and room left in the buffer), that character is appended and the matching
   ** specialised reader finishes the token.  Otherwise the token is a plain
   ** ALPHANUM; in that case the character that ended the run is not pushed
   ** back onto the stream. */
   bool StandardTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) {
     /* Make sure the token can hold the next word, then build the text
     ** directly on top of the token's own term buffer. */
     t->growBuffer(LUCENE_MAX_WORD_LEN+1);
     StringBuffer str(t->_termText,t->bufferLength(),true);

     if (str.len >= LUCENE_MAX_WORD_LEN) {
       return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
     }

     str.appendChar(prev);
     int ch = prev;
     CONSUME_WORD;

     /* Is there more input, and space for one more character? */
     const bool mayContinue = !EOS && str.len < LUCENE_MAX_WORD_LEN-1;
     if (mayContinue) {
       /* What follows the first alphanum segment? */
       if (ch == '.') {
         str.appendChar('.');
         return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
       } else if (ch == '\'') {
         str.appendChar('\'');
         return ReadApostrophe(&str,t);
       } else if (ch == '@') {
         str.appendChar('@');
         return ReadAt(&str,t);
       } else if (ch == '&') {
         str.appendChar('&');
         return ReadCompany(&str,t);
       }
       /* Anything else: plain ALPHANUM, handled below. */
     }

     return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
   }

   /* Reads a run of CJK characters into token t and types it CJK.
   **   prev - the first character, already taken from the stream by the caller
   ** The character that ends the run has been read and is not pushed back.
   ** Always returns true (the result of setToken). */
   bool StandardTokenizer::ReadCJK(const TCHAR prev, Token* t) {
     /* Make sure the token can hold the next word, then build the text
     ** directly on top of the token's own term buffer. */
     t->growBuffer(LUCENE_MAX_WORD_LEN+1);
     StringBuffer str(t->_termText,t->bufferLength(),true);

     if (str.len >= LUCENE_MAX_WORD_LEN) {
       return setToken(t,&str,CL_NS2(analysis,standard)::CJK);
     }

     str.appendChar(prev);
     int ch = prev;
     CONSUME_CJK;

     return setToken(t,&str,CL_NS2(analysis,standard)::CJK);
   }


   /* Continues a token after a '.' (from ReadAlphaNum) or an '@' (from ReadAt)
   ** has been appended to *_str.
   **   _str       - text gathered so far; extended in place
   **   forcedType - type to give the token, or UNKNOWN to let this function
   **                decide; it may be overridden with ALPHANUM or ACRONYM
   **   t          - token to fill in
   ** Word characters, dots and dashes are appended until another character,
   ** two separators in a row, end of input, or the length limit is met.  The
   ** result is then classified:
   **   - nothing useful followed    -> trailing dot removed; ALPHANUM if no
   **                                   dot remains
   **   - text ends in a dot         -> ACRONYM if it is letter/dot pairs
   **                                   throughout, else the dot is removed
   **   - still UNKNOWN at the end   -> HOST
   ** If the character that ended the run is '@', the token is continued by
   ** ReadAt.  Returns true (the result of setToken / ReadAt). */
   bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) {
     const int32_t specialCharPos = rdPos;
     StringBuffer& str=*_str;

     /* A segment of a "dotted" is not allowed to begin with another dot or a dash.
     ** Even though hosts, e-mail addresses, etc., could have a dotted-segment
     ** that begins with a dot or a dash, it's far more common in source text
     ** for a pattern like "abc.--def" to be intended as two tokens. */
     int ch = rd->Peek();
     if (!(DOT || DASH)) {
       bool prevWasDot;
       bool prevWasDash;
       if (str.len == 0) {
         prevWasDot = false;
         prevWasDash = false;
       } else {
         prevWasDot = RIGHTMOST(str) == '.';
         prevWasDash = RIGHTMOST(str) == '-';
       }
       while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
         ch = readChar();
         const bool dot = ch == '.';
         const bool dash = ch == '-';

         if (!(ALNUM || UNDERSCORE || dot || dash)) {
           break;
         }
         /* Multiple dots or dashes in succession end the token.
         ** Consider the following inputs:
         **   "Visit windowsupdate.microsoft.com--update today!"
         **   "In the U.S.A.--yes, even there!"                 */
         if ((dot || dash) && (prevWasDot || prevWasDash)) {
           /* We're not going to append the character we just read, in any case.
           ** As to the character before it (which is currently RIGHTMOST(str)):
           ** Unless RIGHTMOST(str) is a dot, in which case we need to save it
           ** so the acronym-versus-host detection can work, we want to get rid
           ** of it. */
           if (!prevWasDot) {
             SHAVE_RIGHTMOST(str);
           }
           break;
         }

         str.appendChar(ch);

         prevWasDot = dot;
         prevWasDash = dash;
       }
     }

     /* There's a potential StringBuffer.append call in the code above, which
     ** could cause str to reallocate its internal buffer.  We must wait to
     ** obtain the optimization-oriented strBuf pointer until after the initial
     ** potentially realloc-triggering operations on str.
     ** Because there can be other such ops much later in this function, strBuf
     ** is guarded within a block to prevent its use during or after the calls
     ** that would potentially invalidate it. */
     { /* Begin block-guard of strBuf */
     TCHAR* strBuf = str.getBuffer();

     /* RIGHTMOST indexes len-1, so only look at it when there is text. */
     const bool rightmostIsDot = str.len > 0 && RIGHTMOST_IS(str, '.');
     if (CONSUMED_NOTHING_OF_VALUE) {
       /* No more alphanums available for this token; shave trailing dot, if any. */
       if (rightmostIsDot) {
         SHAVE_RIGHTMOST(str);
       }
       /* If there are no dots remaining, this is a generic ALPHANUM. */
       if (_tcschr(strBuf, '.') == NULL) {
         forcedType = CL_NS2(analysis,standard)::ALPHANUM;
       }

     /* Check the token to see if it's an acronym.  An acronym must have a
     ** letter in every even slot and a dot in every odd slot, including the
     ** last slot (for example, "U.S.A."). */
     } else if (rightmostIsDot) {
       bool isAcronym = true;
       const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */

       for (int32_t i = 0; i < upperCheckLimit; i++) {
         const bool even = (i % 2 == 0);
         /* Inspect the buffer through a local rather than through ch: ch must
         ** keep the character that ended the run, because the EOS and '@'
         ** tests below depend on it.  Reusing ch here let an '@' already in
         ** the buffer (e.g. "a@b. c") re-trigger ReadAt. */
         const TCHAR slot = strBuf[i];
         if ( (even && _istalpha(slot) == 0) || (!even && slot != '.') ) {
           isAcronym = false;
           break;
         }
       }
       if (isAcronym) {
         forcedType = CL_NS2(analysis,standard)::ACRONYM;
       } else {
         /* If it's not an acronym, we don't want the trailing dot. */
         SHAVE_RIGHTMOST(str);
         /* If there are no dots remaining, this is a generic ALPHANUM. */
         if (_tcschr(strBuf, '.') == NULL) {
           forcedType = CL_NS2(analysis,standard)::ALPHANUM;
         }
       }
     }
     } /* End block-guard of strBuf */

     if (!EOS) {
       if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
         str.appendChar('@');
         return ReadAt(&str,t);
       } else {
         unReadChar();
       }
     }

     /* A type decided by the caller or by the checks above wins; a token that
     ** is still UNKNOWN at this point is a HOST.  (The condition must compare
     ** forcedType: testing the UNKNOWN constant by itself is always true.) */
     return setToken(t,&str,
         forcedType != CL_NS2(analysis,standard)::UNKNOWN
             ? forcedType : CL_NS2(analysis,standard)::HOST);
   }

   /* Continues a token after an apostrophe has been appended to *_str.
   **   _str - text gathered so far, ending in '\''; extended in place
   **   t    - token to fill in
   ** Alphabetic characters following the apostrophe are appended.  If none
   ** followed, the apostrophe is removed again and the token is a plain
   ** ALPHANUM; otherwise it is an APOSTROPHE token.  Unless input is
   ** exhausted, the character that ended the run is pushed back.
   ** Always returns true (the result of setToken). */
   bool StandardTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
     StringBuffer& str = *_str;
     const int32_t specialCharPos = rdPos;
     int ch = 0;

     CONSUME_ALPHAS;

     /* After the apostrophe, were any more alphas available within this token? */
     const bool danglingApostrophe =
         RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE;
     if (danglingApostrophe) {
       /* No: shave the trailing apostrophe. */
       SHAVE_RIGHTMOST(str);
     }

     if (!EOS) {
       unReadChar();
     }

     return setToken(t,&str,
         danglingApostrophe ? CL_NS2(analysis,standard)::ALPHANUM
                            : CL_NS2(analysis,standard)::APOSTROPHE);
   }

   /* Continues a token after an '@' has been appended to *str.
   ** The remainder is read by ReadDotted with EMAIL as the requested type.  If
   ** the resulting text contains neither a dot nor a digit, the token is
   ** re-labelled COMPANY.  Always returns true. */
   bool StandardTokenizer::ReadAt(StringBuffer* str, Token* t) {
     ReadDotted(str, CL_NS2(analysis,standard)::EMAIL,t);

     /* JLucene grammar indicates dots/digits not allowed in company name: */
     StringBuffer& text = *str;
     const bool hasDotOrDigit = CONTAINS_ANY(text, ".0123456789");
     if (hasDotOrDigit) {
       return true;
     }

     setToken(t,str,CL_NS2(analysis,standard)::COMPANY);
     return true;
   }

   /* Continues a token after an '&' has been appended to *_str.
   **   _str - text gathered so far, ending in '&'; extended in place
   **   t    - token to fill in
   ** Word characters following the ampersand are appended and the token is
   ** typed COMPANY; unless input is exhausted, the character that ended the
   ** run is pushed back.  If nothing useful followed the ampersand, it is
   ** removed again and the token is a plain ALPHANUM (in that case nothing is
   ** pushed back).  Always returns true (the result of setToken). */
   bool StandardTokenizer::ReadCompany(StringBuffer* _str, Token* t) {
     StringBuffer& str = *_str;
     const int32_t specialCharPos = rdPos;
     int ch = 0;

     CONSUME_WORD;

     if (!(CONSUMED_NOTHING_OF_VALUE)) {
       if (!EOS) {
         unReadChar();
       }
       return setToken(t,&str,CL_NS2(analysis,standard)::COMPANY);
     }

     /* After the ampersand, no more alphanums were available within this
     ** token; shave trailing ampersand and revert to ALPHANUM. */
     CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed");
     SHAVE_RIGHTMOST(str);
     return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
   }

 CL_NS_END2
	/*------------------------------------------------------------------------------
	* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
	*
	* Distributable under the terms of either the Apache License (Version 2.0) or
	* the GNU Lesser General Public License, as specified in the COPYING file.
	------------------------------------------------------------------------------*/
	#include "CLucene/StdHeader.h"
	#include "StandardTokenizer.h"

	CL_NS_USE(analysis)
	CL_NS_USE(util)
	CL_NS_DEF2(analysis,standard)

	/* Printable labels for the token types.  setToken() looks an entry up with
	** tokenImage[tokenCode], so the position of each label in this table is
	** significant. */
	static const TCHAR* tokenImageArray[] = {
	  _T("<EOF>"),          /* slot 0 */
	  _T("<UNKNOWN>"),      /* slot 1 */
	  _T("<ALPHANUM>"),     /* slot 2 */
	  _T("<APOSTROPHE>"),   /* slot 3 */
	  _T("<ACRONYM>"),      /* slot 4 */
	  _T("<COMPANY>"),      /* slot 5 */
	  _T("<EMAIL>"),        /* slot 6 */
	  _T("<HOST>"),         /* slot 7 */
	  _T("<NUM>"),          /* slot 8 */
	  _T("<CJK>")           /* slot 9 */
	};
	/* Non-static alias through which the table is reached. */
	const TCHAR** tokenImage = tokenImageArray;

	/* A bunch of shortcut macros, many of which make assumptions about variable
	** names. These macros enhance readability, not just convenience!
	**
	** Names the expansions expect to find in scope at the point of use:
	**   ch             - int holding the character most recently read/peeked
	**   rd             - the tokenizer's character stream
	**   str            - the StringBuffer the current token is built in
	**   rdPos          - the tokenizer's read position
	**   specialCharPos - read position captured on entry to a Read* helper */

	/* True once -1 has been read or the stream itself reports end of input. */
	#define EOS (ch==-1 || rd->Eos())
	/* Character-class tests on ch. */
	#define SPACE (_istspace((TCHAR)ch) != 0)
	#define ALPHA (_istalpha((TCHAR)ch) != 0)
	#define ALNUM (_istalnum(ch) != 0)
	#define DIGIT (_istdigit(ch) != 0)
	#define UNDERSCORE (ch == '_')

	/* True when ch lies in one of the code-point ranges this tokenizer treats
	** as CJK text (the last range is flagged as Korean by the original author). */
	#define _CJK ( (ch>=0x3040 && ch<=0x318f) || \
	               (ch>=0x3300 && ch<=0x337f) || \
	               (ch>=0x3400 && ch<=0x3d2d) || \
	               (ch>=0x4e00 && ch<=0x9fff) || \
	               (ch>=0xf900 && ch<=0xfaff) || \
	               (ch>=0xac00 && ch<=0xd7af) ) //korean


	#define DASH (ch == '-')
	/* A leading '-' is the only sign recognised; '+' support is disabled below. */
	#define NEGATIVE_SIGN_ DASH
	//#define POSITIVE_SIGN_ (ch == '+')
	//#define SIGN (NEGATIVE_SIGN_ || POSITIVE_SIGN_)

	#define DOT (ch == '.')
	/* The decimal separator is the same character as DOT. */
	#define DECIMAL DOT


	/* Core consumption loop.  Despite the parameter's name, `conditionFails` is
	** the condition that must HOLD for a character to be consumed.  Each pass
	** reads one character into ch and stops, without appending it, when:
	**   - the character is -1 (end of input), or
	**   - the condition is false for it, or
	**   - str already holds LUCENE_MAX_WORD_LEN characters.
	** Otherwise the character is appended to str.  The character that ended
	** the loop has been read from the stream and is left in ch; the macro does
	** not push it back. */
	//freebsd seems to have a problem with defines over multiple lines, so this has to be one long line
	#define _CONSUME_AS_LONG_AS(conditionFails) while (true) { ch = readChar(); if (ch==-1 || (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);}

	/* Consume a run of alphabetic characters. */
	#define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)

	/* Consume a run of digits. */
	#define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)

	/* Consume a run of word characters, i.e. characters that are ALNUM or
	** UNDERSCORE.  The macro takes no arguments; nothing else extends the
	** run. */
	#define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE)

	/*
	** Consume CJK characters
	*/
	#define CONSUME_CJK _CONSUME_AS_LONG_AS(_CJK)


	/* It is considered that "nothing of value" has been read if:
	** a) The "read head" hasn't moved since specialCharPos was established.
	** or
	** b) The "read head" has moved by one character, but that character was
	** either whitespace or not among the characters found in the body of
	** a token (deliberately doesn't include the likes of '@'/'&'). */
	#define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) )))

	/* Last character of StringBuffer sb.  Indexes sb.len-1, so sb must not be
	** empty when this is evaluated. */
	#define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
	#define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)
	/* To discard the last character in a StringBuffer, we decrement the buffer's
	** length indicator and move the terminator back by one character. */
	#define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')

	/* Disabled helper, kept for reference: */
	//#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) { TCHAR* sbBuf = sb.getBuffer(); for (int32_t i = sb.len-1; i >= 0; i--) { TCHAR c = sbBuf[i]; if (charMatchesCondition) { sbBuf[--sb.len] = '\0'; } else {break;}}}

	/* Does StringBuffer sb contain any of the characters in string ofThese?
	** ofThese must be a string literal because it is wrapped in _T(). */
	#define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))


	/* Builds a tokenizer over `reader` by wrapping it in a newly allocated
	** FastCharStream.  rdPos is zero-based and begins at -1 so that the first
	** readChar() call moves it onto position 0; tokenStart is likewise -1
	** until next() records where a token begins. */
	StandardTokenizer::StandardTokenizer(Reader* reader)
	  : rd(_CLNEW FastCharStream(reader))
	  , rdPos(-1)
	  , tokenStart(-1)
	{}

	/* Disposes of the FastCharStream that the constructor allocated. */
	StandardTokenizer::~StandardTokenizer()
	{
	  _CLDELETE(rd);
	}

	/* Advances the read position by one and returns the next character from
	** the stream.  The position counts characters, which are not necessarily
	** bytes.  Callers in this file treat a return value of -1 as end of input. */
	int StandardTokenizer::readChar() {
	  ++rdPos;
	  const int next = rd->GetNext();
	  return next;
	}

	/* Undoes one readChar(): hands the last character back to the stream and
	** moves the read position back by one. */
	void StandardTokenizer::unReadChar() {
	  rd->UnGet();
	  --rdPos;
	}

	/* Finalises token t from the text accumulated in sb.
	**   t         - token to fill in
	**   sb        - buffer holding the token text
	**   tokenCode - token type; used as an index into tokenImage
	** Always returns true, so callers can write `return setToken(...)`. */
	inline bool StandardTokenizer::setToken(Token* t, StringBuffer* sb, TokenTypes tokenCode) {
	  /* The token starts where next() recorded tokenStart and spans
	  ** sb->length() characters. */
	  t->setStartOffset(tokenStart);
	  t->setEndOffset(tokenStart + sb->length());

	  t->setType(tokenImage[tokenCode]);

	  /* getBuffer() null-terminates the buffer; do that before asking the
	  ** token to recompute its term length. */
	  sb->getBuffer();
	  t->resetTermTextLen();

	  return true;
	}

	/* Scans forward for the next token and stores it in t.
	**
	** Characters are read one at a time.  NUL, -1 and whitespace are skipped.
	** The first other character decides which reader handles the token:
	**   letter or '_'         -> ReadAlphaNum
	**   digit, '-' or '.'     -> ReadNumber (scanning continues if it fails)
	**   character in _CJK     -> ReadCJK
	** Any other character is skipped.
	**
	** tokenStart is set to the position of that first character before the
	** reader is called, because setToken() derives both token offsets from it.
	**
	** Returns true when a token was produced, false once input is exhausted. */
	bool StandardTokenizer::next(Token* t) {
	  int ch=0;
	  while (!EOS) {
	    ch = readChar();

	    if ( ch == 0 || ch == -1 ){
	      continue;
	    } else if (SPACE) {
	      continue;
	    } else if (ALPHA || UNDERSCORE) {
	      tokenStart = rdPos;
	      return ReadAlphaNum(ch,t);
	    } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
	      tokenStart = rdPos;
	      /* ReadNumber returns false if it fails to extract a valid number; in
	      ** that case, we just continue. */
	      if (ReadNumber(NULL, ch,t))
	        return true;
	    } else if ( _CJK ){
	      /* Record the start of the CJK run; without this the token would be
	      ** reported with the offsets of an earlier token (or -1). */
	      tokenStart = rdPos;
	      if ( ReadCJK(ch,t) )
	        return true;
	    }
	  }
	  return false;
	}

	/* Reads a numeric token whose first character, prev, has already been
	** taken from the stream by the caller.
	**   previousNumber - NULL for a fresh number; otherwise the text gathered
	**                    so far, in which case the token is typed HOST
	**   prev           - the already-read first character ('-', '.' or a digit
	**                    when called from next(); '.' when recursing)
	**   t              - token to fill in
	** Returns true after setToken() with type NUM or HOST.  Returns false when
	** the text is not a valid number or would reach LUCENE_MAX_WORD_LEN. */
	bool StandardTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
	  /* previousNumber is only non-NULL if this function already read a complete
	  ** number in a previous recursion, yet has been asked to read additional
	  ** numeric segments. For example, in the HOST "192.168.1.3", "192.168" is
	  ** a complete number, but this function will recurse to read the "1.3",
	  ** generating a single HOST token "192.168.1.3". */
	  t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
	  StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
	  TokenTypes tokenType;
	  /* decExhausted: the decimal point has already been used, so a digit must
	  ** follow for this to be a number. */
	  bool decExhausted;
	  if (previousNumber != NULL) {
	    str.prepend(previousNumber);
	    tokenType = CL_NS2(analysis,standard)::HOST;
	    decExhausted = false;
	  } else {
	    tokenType = CL_NS2(analysis,standard)::NUM;
	    decExhausted = (prev == '.');
	  }
	  if ( str.len >= LUCENE_MAX_WORD_LEN ){
	    //if a number is too long, i would say there is no point
	    //storing it, because its going to be the wrong number anyway?
	    //what do people think?
	    return false;
	  }
	  str.appendChar(prev);

	  /* signExhausted: the token began with '-', so no further sign is allowed. */
	  const bool signExhausted = (prev == '-');
	  int ch = prev;

	  /* First digit group.  Afterwards ch holds the character that ended it. */
	  CONSUME_DIGITS;

	  if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */
	      && (
	              (signExhausted && !DECIMAL)
	           || (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */)
	         )
	     )
	  {
	    /* We have either:
	    ** a) a negative sign that's not followed by either digit(s) or a decimal
	    ** b) a decimal that's not followed by digit(s)
	    ** so this is not a valid number. */
	    if (!EOS) {
	      /* Unread the character that stopped CONSUME_DIGITS: */
	      unReadChar();
	    }
	    return false;
	  }

	  /* We just read a group of digits. Is it followed by a decimal symbol,
	  ** implying that there might be another group of digits available? */
	  if (!EOS) {
	    if (DECIMAL) {
	      if ( str.len >= LUCENE_MAX_WORD_LEN )
	        return false; //read above for rationale
	      str.appendChar(ch);
	    } else {
	      /* Not a decimal point: give the character back and finish. */
	      unReadChar();
	      goto SUCCESSFULLY_EXTRACTED_NUMBER;
	    }

	    /* Second (fractional) digit group. */
	    CONSUME_DIGITS;
	    if (!DIGIT && !DECIMAL) {
	      unReadChar();
	    } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) {
	      /* We just read the fractional digit group, but it's also followed by
	      ** a decimal symbol and at least one more digit, so this must be a
	      ** HOST rather than a real number. */
	      return ReadNumber(str.getBuffer(), '.',t);
	    }
	  }

	  SUCCESSFULLY_EXTRACTED_NUMBER:
	  TCHAR rightmost = RIGHTMOST(str);
	  /* Don't include a trailing decimal point. */
	  if (rightmost == '.') {
	    SHAVE_RIGHTMOST(str);
	    unReadChar();
	    rightmost = RIGHTMOST(str);
	  }
	  /* If all we have left is a negative sign, it's not a valid number. */
	  if (rightmost == '-') {
	    CND_PRECONDITION (str.len == 1, "Number is invalid");
	    return false;
	  }

	  return setToken(t,&str,tokenType);
	}

	/* Reads a token that begins with a word character.
	**   prev - the first character, already taken from the stream by the caller
	**   t    - token to fill in
	** A run of ALNUM/UNDERSCORE characters is gathered first.  If the character
	** that ended the run is '.', '\'', '@' or '&' (and there is both more input
	** and room left in the buffer), that character is appended and the matching
	** specialised reader finishes the token.  Otherwise the token is a plain
	** ALPHANUM; in that case the character that ended the run is not pushed
	** back onto the stream. */
	bool StandardTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) {
	  /* Make sure the token can hold the next word, then build the text
	  ** directly on top of the token's own term buffer. */
	  t->growBuffer(LUCENE_MAX_WORD_LEN+1);
	  StringBuffer str(t->_termText,t->bufferLength(),true);

	  if (str.len >= LUCENE_MAX_WORD_LEN) {
	    return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
	  }

	  str.appendChar(prev);
	  int ch = prev;
	  CONSUME_WORD;

	  /* Is there more input, and space for one more character? */
	  const bool mayContinue = !EOS && str.len < LUCENE_MAX_WORD_LEN-1;
	  if (mayContinue) {
	    /* What follows the first alphanum segment? */
	    if (ch == '.') {
	      str.appendChar('.');
	      return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
	    } else if (ch == '\'') {
	      str.appendChar('\'');
	      return ReadApostrophe(&str,t);
	    } else if (ch == '@') {
	      str.appendChar('@');
	      return ReadAt(&str,t);
	    } else if (ch == '&') {
	      str.appendChar('&');
	      return ReadCompany(&str,t);
	    }
	    /* Anything else: plain ALPHANUM, handled below. */
	  }

	  return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
	}

	/* Reads a run of CJK characters into token t and types it CJK.
	**   prev - the first character, already taken from the stream by the caller
	** The character that ends the run has been read and is not pushed back.
	** Always returns true (the result of setToken). */
	bool StandardTokenizer::ReadCJK(const TCHAR prev, Token* t) {
	  /* Make sure the token can hold the next word, then build the text
	  ** directly on top of the token's own term buffer. */
	  t->growBuffer(LUCENE_MAX_WORD_LEN+1);
	  StringBuffer str(t->_termText,t->bufferLength(),true);

	  if (str.len >= LUCENE_MAX_WORD_LEN) {
	    return setToken(t,&str,CL_NS2(analysis,standard)::CJK);
	  }

	  str.appendChar(prev);
	  int ch = prev;
	  CONSUME_CJK;

	  return setToken(t,&str,CL_NS2(analysis,standard)::CJK);
	}


	/* Continues a token after a '.' (from ReadAlphaNum) or an '@' (from ReadAt)
	** has been appended to *_str.
	**   _str       - text gathered so far; extended in place
	**   forcedType - type to give the token, or UNKNOWN to let this function
	**                decide; it may be overridden with ALPHANUM or ACRONYM
	**   t          - token to fill in
	** Word characters, dots and dashes are appended until another character,
	** two separators in a row, end of input, or the length limit is met.  The
	** result is then classified:
	**   - nothing useful followed    -> trailing dot removed; ALPHANUM if no
	**                                   dot remains
	**   - text ends in a dot         -> ACRONYM if it is letter/dot pairs
	**                                   throughout, else the dot is removed
	**   - still UNKNOWN at the end   -> HOST
	** If the character that ended the run is '@', the token is continued by
	** ReadAt.  Returns true (the result of setToken / ReadAt). */
	bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) {
	  const int32_t specialCharPos = rdPos;
	  StringBuffer& str=*_str;

	  /* A segment of a "dotted" is not allowed to begin with another dot or a dash.
	  ** Even though hosts, e-mail addresses, etc., could have a dotted-segment
	  ** that begins with a dot or a dash, it's far more common in source text
	  ** for a pattern like "abc.--def" to be intended as two tokens. */
	  int ch = rd->Peek();
	  if (!(DOT || DASH)) {
	    bool prevWasDot;
	    bool prevWasDash;
	    if (str.len == 0) {
	      prevWasDot = false;
	      prevWasDash = false;
	    } else {
	      prevWasDot = RIGHTMOST(str) == '.';
	      prevWasDash = RIGHTMOST(str) == '-';
	    }
	    while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
	      ch = readChar();
	      const bool dot = ch == '.';
	      const bool dash = ch == '-';

	      if (!(ALNUM || UNDERSCORE || dot || dash)) {
	        break;
	      }
	      /* Multiple dots or dashes in succession end the token.
	      ** Consider the following inputs:
	      ** "Visit windowsupdate.microsoft.com--update today!"
	      ** "In the U.S.A.--yes, even there!" */
	      if ((dot || dash) && (prevWasDot || prevWasDash)) {
	        /* We're not going to append the character we just read, in any case.
	        ** As to the character before it (which is currently RIGHTMOST(str)):
	        ** Unless RIGHTMOST(str) is a dot, in which case we need to save it
	        ** so the acronym-versus-host detection can work, we want to get rid
	        ** of it. */
	        if (!prevWasDot) {
	          SHAVE_RIGHTMOST(str);
	        }
	        break;
	      }

	      str.appendChar(ch);

	      prevWasDot = dot;
	      prevWasDash = dash;
	    }
	  }

	  /* There's a potential StringBuffer.append call in the code above, which
	  ** could cause str to reallocate its internal buffer. We must wait to
	  ** obtain the optimization-oriented strBuf pointer until after the initial
	  ** potentially realloc-triggering operations on str.
	  ** Because there can be other such ops much later in this function, strBuf
	  ** is guarded within a block to prevent its use during or after the calls
	  ** that would potentially invalidate it. */
	  { /* Begin block-guard of strBuf */
	    TCHAR* strBuf = str.getBuffer();

	    /* RIGHTMOST indexes len-1, so only look at it when there is text. */
	    const bool rightmostIsDot = str.len > 0 && RIGHTMOST_IS(str, '.');
	    if (CONSUMED_NOTHING_OF_VALUE) {
	      /* No more alphanums available for this token; shave trailing dot, if any. */
	      if (rightmostIsDot) {
	        SHAVE_RIGHTMOST(str);
	      }
	      /* If there are no dots remaining, this is a generic ALPHANUM. */
	      if (_tcschr(strBuf, '.') == NULL) {
	        forcedType = CL_NS2(analysis,standard)::ALPHANUM;
	      }

	    /* Check the token to see if it's an acronym. An acronym must have a
	    ** letter in every even slot and a dot in every odd slot, including the
	    ** last slot (for example, "U.S.A."). */
	    } else if (rightmostIsDot) {
	      bool isAcronym = true;
	      const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */

	      for (int32_t i = 0; i < upperCheckLimit; i++) {
	        const bool even = (i % 2 == 0);
	        /* Inspect the buffer through a local rather than through ch: ch must
	        ** keep the character that ended the run, because the EOS and '@'
	        ** tests below depend on it.  Reusing ch here let an '@' already in
	        ** the buffer (e.g. "a@b. c") re-trigger ReadAt. */
	        const TCHAR slot = strBuf[i];
	        if ( (even && _istalpha(slot) == 0) || (!even && slot != '.') ) {
	          isAcronym = false;
	          break;
	        }
	      }
	      if (isAcronym) {
	        forcedType = CL_NS2(analysis,standard)::ACRONYM;
	      } else {
	        /* If it's not an acronym, we don't want the trailing dot. */
	        SHAVE_RIGHTMOST(str);
	        /* If there are no dots remaining, this is a generic ALPHANUM. */
	        if (_tcschr(strBuf, '.') == NULL) {
	          forcedType = CL_NS2(analysis,standard)::ALPHANUM;
	        }
	      }
	    }
	  } /* End block-guard of strBuf */

	  if (!EOS) {
	    if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
	      str.appendChar('@');
	      return ReadAt(&str,t);
	    } else {
	      unReadChar();
	    }
	  }

	  /* A type decided by the caller or by the checks above wins; a token that
	  ** is still UNKNOWN at this point is a HOST.  (The condition must compare
	  ** forcedType: testing the UNKNOWN constant by itself is always true.) */
	  return setToken(t,&str,
	      forcedType != CL_NS2(analysis,standard)::UNKNOWN
	          ? forcedType : CL_NS2(analysis,standard)::HOST);
	}

	/* Continues a token after an apostrophe has been appended to *_str.
	**   _str - text gathered so far, ending in '\''; extended in place
	**   t    - token to fill in
	** Alphabetic characters following the apostrophe are appended.  If none
	** followed, the apostrophe is removed again and the token is a plain
	** ALPHANUM; otherwise it is an APOSTROPHE token.  Unless input is
	** exhausted, the character that ended the run is pushed back.
	** Always returns true (the result of setToken). */
	bool StandardTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
	  StringBuffer& str=*_str;

	  TokenTypes tokenType = CL_NS2(analysis,standard)::APOSTROPHE;
	  const int32_t specialCharPos = rdPos;
	  int ch=0;

	  CONSUME_ALPHAS;
	  if (RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE) {
	    /* After the apostrophe, no more alphanums were available within this
	    ** token; shave trailing apostrophe and revert to generic ALPHANUM. */
	    SHAVE_RIGHTMOST(str);
	    tokenType = CL_NS2(analysis,standard)::ALPHANUM;
	  }
	  if (!EOS) {
	    unReadChar();
	  }

	  return setToken(t,&str,tokenType);
	}

	/* Continues a token after an '@' has been appended to *str.
	** The remainder is read by ReadDotted with EMAIL as the requested type.  If
	** the resulting text contains neither a dot nor a digit, the token is
	** re-labelled COMPANY.  Always returns true. */
	bool StandardTokenizer::ReadAt(StringBuffer* str, Token* t) {
	  ReadDotted(str, CL_NS2(analysis,standard)::EMAIL,t);

	  /* JLucene grammar indicates dots/digits not allowed in company name: */
	  StringBuffer& text = *str;
	  const bool hasDotOrDigit = CONTAINS_ANY(text, ".0123456789");
	  if (hasDotOrDigit) {
	    return true;
	  }

	  setToken(t,str,CL_NS2(analysis,standard)::COMPANY);
	  return true;
	}

	/* Continues a token after an '&' has been appended to *_str.
	**   _str - text gathered so far, ending in '&'; extended in place
	**   t    - token to fill in
	** Word characters following the ampersand are appended and the token is
	** typed COMPANY; unless input is exhausted, the character that ended the
	** run is pushed back.  If nothing useful followed the ampersand, it is
	** removed again and the token is a plain ALPHANUM (in that case nothing is
	** pushed back).  Always returns true (the result of setToken). */
	bool StandardTokenizer::ReadCompany(StringBuffer* _str, Token* t) {
	  StringBuffer& str = *_str;
	  const int32_t specialCharPos = rdPos;
	  int ch = 0;

	  CONSUME_WORD;

	  if (!(CONSUMED_NOTHING_OF_VALUE)) {
	    if (!EOS) {
	      unReadChar();
	    }
	    return setToken(t,&str,CL_NS2(analysis,standard)::COMPANY);
	  }

	  /* After the ampersand, no more alphanums were available within this
	  ** token; shave trailing ampersand and revert to ALPHANUM. */
	  CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed");
	  SHAVE_RIGHTMOST(str);
	  return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
	}

	CL_NS_END2