// blob: 7daaaa7fe0282f171b0dc0b656193d280caa61b9 [file] [log] [blame]
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "AnalysisHeader.h"
#include "CLucene/util/StringBuffer.h"
CL_NS_USE(util)
CL_NS_DEF(analysis)
// Type label assigned to tokens that are constructed without an explicit type.
const TCHAR* Token::defaultType=_T("word");
/**
 * Builds an empty token: both offsets are zero, the type is defaultType,
 * the position increment is one and the term text is empty.
 */
Token::Token():
    _startOffset (0),
    _endOffset (0),
    _type ( defaultType ),
    positionIncrement (1)
{
#ifdef LUCENE_TOKEN_WORD_LENGTH
    // Fixed-size buffer build: record its capacity and start with "".
    bufferTextLen = LUCENE_TOKEN_WORD_LENGTH+1;
    _termText[0] = 0; //make sure null terminated
#else
    // Dynamic buffer build: no storage yet, growBuffer() allocates on demand.
    bufferTextLen = 0;
    _termText = NULL;
#endif
    _termTextLen = 0;
}
/**
 * Releases the term buffer. Only the dynamic-buffer build has anything to
 * free; free(NULL) is a no-op, so a token that never received text is fine.
 */
Token::~Token(){
#if !defined(LUCENE_TOKEN_WORD_LENGTH)
    free(_termText);
#endif
}
/**
 * Builds a token from its term text, offsets and type.
 *
 * @param text   term text; its characters are copied into the token's buffer
 * @param start  value stored as the start offset
 * @param end    value stored as the end offset
 * @param typ    type label; only the pointer is stored, the string is not copied
 */
Token::Token(const TCHAR* text, const int32_t start, const int32_t end, const TCHAR* typ):
    _startOffset (start),
    _endOffset (end),
    _type ( typ ),
    positionIncrement (1)
{
    // Put the buffer into its empty state first; setText() relies on it.
#ifdef LUCENE_TOKEN_WORD_LENGTH
    bufferTextLen = LUCENE_TOKEN_WORD_LENGTH+1;
    _termText[0] = 0; //make sure null terminated
#else
    bufferTextLen = 0;
    _termText = NULL;
#endif
    _termTextLen = 0;

    setText(text);
}
void Token::set(const TCHAR* text, const int32_t start, const int32_t end, const TCHAR* typ){
_startOffset = start;
_endOffset = end;
_type = typ;
positionIncrement = 1;
setText(text);
}
/**
 * Replaces the term text with a copy of the given string.
 *
 * In the fixed-buffer build (LUCENE_TOKEN_WORD_LENGTH defined) text longer
 * than the buffer is truncated; otherwise the buffer is grown to fit.
 *
 * @param text  null-terminated string to copy
 */
void Token::setText(const TCHAR* text){
    _termTextLen = _tcslen(text);
#ifdef LUCENE_TOKEN_WORD_LENGTH
    // On truncation the endOffset is deliberately left alone, because the
    // actual word still occupies that space.
    if ( _termTextLen > LUCENE_TOKEN_WORD_LENGTH )
        _termTextLen = LUCENE_TOKEN_WORD_LENGTH;
#else
    growBuffer(_termTextLen+1);
#endif
    // Both builds end with the same copy followed by an explicit terminator.
    _tcsncpy(_termText, text, _termTextLen+1);
    _termText[_termTextLen] = 0; //make sure null terminated
}
void Token::growBuffer(size_t size){
if(bufferTextLen>=size)
return;
#ifndef LUCENE_TOKEN_WORD_LENGTH
if ( _termText == NULL )
_termText = (TCHAR*)malloc( size * sizeof(TCHAR) );
else
_termText = (TCHAR*)realloc( _termText, size * sizeof(TCHAR) );
bufferTextLen = size;
#else
_CLTHROWA(CL_ERR_TokenMgr,"Couldn't grow Token buffer");
#endif
}
/**
 * Sets the position increment of this token.
 *
 * @param posIncr  new increment; must not be negative
 * @throws CL_ERR_IllegalArgument  when posIncr is negative
 */
void Token::setPositionIncrement(int32_t posIncr) {
    if (posIncr >= 0) {
        positionIncrement = posIncr;
        return;
    }
    _CLTHROWA(CL_ERR_IllegalArgument,"positionIncrement must be >= 0");
}
/** Returns the position increment currently stored in this token. */
int32_t Token::getPositionIncrement() const
{
    return this->positionIncrement;
}
/**
 * Returns a read-only pointer to the token's internal term text buffer.
 * No copy is made.
 */
const TCHAR* Token::termText() const{
    const TCHAR* text = _termText;
    return text;
}
/**
 * Returns the length of the term text.
 *
 * A cached length of -1 is the marker written by resetTermTextLen(); when
 * it is seen, the length is recomputed from the buffer and cached again.
 */
size_t Token::termTextLength() {
    const bool lengthIsStale = ( _termTextLen == -1 );
    if ( lengthIsStale ) {
        _termTextLen = _tcslen(_termText);
    }
    return _termTextLen;
}
void Token::resetTermTextLen(){
_termTextLen=-1;
}
/**
 * Orders tokens by ascending start offset.
 *
 * Returns true only when t1 starts strictly before t2. The comparison is
 * irreflexive (comp(a, a) is false), which standard sorting algorithms and
 * ordered containers require of a comparator; returning true for equal
 * offsets would make their behaviour undefined. Tokens with the same start
 * offset compare as equivalent.
 *
 * @param t1  left-hand token; must not be NULL
 * @param t2  right-hand token; must not be NULL
 * @return true when t1's start offset is less than t2's
 */
bool Token::OrderCompare::operator()( Token* t1, Token* t2 ) const{
    return t1->startOffset() < t2->startOffset();
}
/**
 * Builds a textual description of the token in the form
 * "(text,start,end[,type=T][,posIncr=N])".
 *
 * The type is included only when it differs from "word", and the position
 * increment only when it differs from 1.
 *
 * @return the string produced by StringBuffer::toString()
 *         (NOTE(review): presumably a newly allocated buffer owned by the
 *         caller - confirm against StringBuffer)
 */
TCHAR* Token::toString() const{
    StringBuffer sb;
    sb.append(_T("("));

    // A default-constructed token has no term buffer yet in the
    // dynamic-buffer build, so do not hand a NULL pointer to append().
    const TCHAR* text = termText();
    if ( text != NULL ){
        sb.append( text );
    }

    sb.append(_T(","));
    sb.appendInt( _startOffset );
    sb.append(_T(","));
    sb.appendInt( _endOffset );

    // Written as an explicit "!= 0" test: the type differs from "word".
    if ( _tcscmp( _type, _T("word") ) != 0 ){
        sb.append(_T(",type="));
        sb.append(_type);
    }
    if (positionIncrement != 1){
        sb.append(_T(",posIncr="));
        sb.appendInt(positionIncrement);
    }
    sb.append(_T(")"));
    return sb.toString();
}
/**
 * Convenience overload that allocates a Token and fills it through
 * next(Token*). Marked as deprecated by the original author.
 *
 * @return the newly allocated token, or the value left in the pointer by
 *         _CLDELETE when next(Token*) reports failure
 *         (NOTE(review): presumably NULL - confirm the macro nulls it)
 */
Token* TokenStream::next(){
    Token* fresh = _CLNEW Token; //deprecated
    if ( !next(fresh) ) {
        _CLDELETE(fresh);
    }
    return fresh;
}
/**
 * Wraps an input token stream.
 *
 * @param in        the stream this filter reads from
 * @param deleteTS  when true, close() deletes the input stream as well
 */
TokenFilter::TokenFilter(TokenStream* in, bool deleteTS)
    : input(in)
    , deleteTokenStream(deleteTS)
{}
/** Closes the filter, and with it the input stream, on destruction. */
TokenFilter::~TokenFilter()
{
    this->close();
}
/**
 * Closes the input TokenStream, deleting it when this filter was created
 * with deleteTS set, and then forgets it. Calling close() again afterwards
 * does nothing.
 */
void TokenFilter::close() {
    if ( input == NULL )
        return;

    input->close();
    if ( deleteTokenStream ) {
        _CLDELETE( input );
    }
    input = NULL;
}
/** Builds a tokenizer that has no input reader attached. */
Tokenizer::Tokenizer():
    input(NULL)
{
}
/**
 * Builds a tokenizer that reads from the given reader.
 *
 * @param _input  reader to tokenize; only the pointer is stored
 */
Tokenizer::Tokenizer(CL_NS(util)::Reader* _input)
    : input(_input)
{}
/**
 * Detaches the tokenizer from its reader. The reader itself is not deleted
 * here (the original author left "? delete input;" as an open question).
 */
void Tokenizer::close(){
    // ? delete input;
    input = NULL;
}
/** Detaches from the reader via close() on destruction. */
Tokenizer::~Tokenizer()
{
    this->close();
}
/**
 * Returns the position increment gap for a field.
 *
 * This implementation ignores the field name and always returns 0.
 */
int32_t Analyzer::getPositionIncrementGap(const TCHAR* /*fieldName*/)
{
    const int32_t noGap = 0;
    return noGap;
}
CL_NS_END