/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_analysis_Analyzers_ | |
#define _lucene_analysis_Analyzers_ | |
#if defined(_LUCENE_PRAGMA_ONCE) | |
# pragma once | |
#endif | |
#include "CLucene/util/Reader.h" | |
#include "AnalysisHeader.h" | |
#include "CLucene/util/Misc.h" | |
CL_NS_DEF(analysis) | |
/** An abstract base class for simple, character-oriented tokenizers.*/ | |
class CharTokenizer:public Tokenizer { | |
private: | |
int32_t offset, bufferIndex, dataLen; | |
TCHAR buffer[LUCENE_MAX_WORD_LEN+1]; | |
const TCHAR* ioBuffer; | |
protected: | |
/** Returns true iff a character should be included in a token. This | |
* tokenizer generates as tokens adjacent sequences of characters which | |
* satisfy this predicate. Characters for which this is false are used to | |
* define token boundaries and are not included in tokens. */ | |
virtual bool isTokenChar(const TCHAR c) const = 0; | |
/** Called on each token character to normalize it before it is added to the | |
* token. The default implementation does nothing. Subclasses may use this | |
* to, e.g., lowercase tokens. */ | |
virtual TCHAR normalize(const TCHAR c) const; | |
public: | |
CharTokenizer(CL_NS(util)::Reader* in); | |
virtual ~CharTokenizer(){ | |
} | |
bool next(Token* token); | |
}; | |
/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's | |
to say, it defines tokens as maximal strings of adjacent letters, as defined | |
by java.lang.Character.isLetter() predicate. | |
Note: this does a decent job for most European languages, but does a terrible | |
job for some Asian languages, where words are not separated by spaces. */ | |
class LetterTokenizer:public CharTokenizer { | |
public: | |
// Construct a new LetterTokenizer. | |
LetterTokenizer(CL_NS(util)::Reader* in): | |
CharTokenizer(in) {} | |
~LetterTokenizer(){} | |
protected: | |
/** Collects only characters which satisfy _istalpha.*/ | |
bool isTokenChar(const TCHAR c) const; | |
}; | |
/** | |
* LowerCaseTokenizer performs the function of LetterTokenizer | |
* and LowerCaseFilter together. It divides text at non-letters and converts | |
* them to lower case. While it is functionally equivalent to the combination | |
* of LetterTokenizer and LowerCaseFilter, there is a performance advantage | |
* to doing the two tasks at once, hence this (redundant) implementation. | |
* <P> | |
* Note: this does a decent job for most European languages, but does a terrible | |
* job for some Asian languages, where words are not separated by spaces. | |
*/ | |
class LowerCaseTokenizer:public LetterTokenizer { | |
public: | |
/** Construct a new LowerCaseTokenizer. */ | |
LowerCaseTokenizer(CL_NS(util)::Reader* in): | |
LetterTokenizer(in) {} | |
~LowerCaseTokenizer(){} | |
protected: | |
/** Collects only characters which satisfy _totlower. */ | |
TCHAR normalize(const TCHAR chr) const; | |
}; | |
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace. | |
* Adjacent sequences of non-Whitespace characters form tokens. */ | |
class WhitespaceTokenizer: public CharTokenizer { | |
public: | |
/** Construct a new WhitespaceTokenizer. */ | |
WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {} | |
~WhitespaceTokenizer(){} | |
protected: | |
/** Collects only characters which do not satisfy _istspace. | |
*/ | |
bool isTokenChar(const TCHAR c) const; | |
}; | |
/** An Analyzer that uses WhitespaceTokenizer. */ | |
class WhitespaceAnalyzer: public Analyzer { | |
public: | |
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); | |
~WhitespaceAnalyzer(){} | |
}; | |
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */ | |
class SimpleAnalyzer: public Analyzer { | |
public: | |
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); | |
~SimpleAnalyzer(){} | |
}; | |
/** | |
* Normalizes token text to lower case. | |
*/ | |
class LowerCaseFilter: public TokenFilter { | |
public: | |
LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {} | |
~LowerCaseFilter(){} | |
bool next(Token* token); | |
}; | |
/** | |
* Removes stop words from a token stream. | |
*/ | |
class StopFilter: public TokenFilter { | |
private: | |
//bvk: i found this to work faster with a non-hash table. the number of items | |
//in the stop table is not like to make it worth having hashing. | |
CL_NS(util)::CLSetList<const TCHAR*>* table; | |
public: | |
// Constructs a filter which removes words from the input | |
// TokenStream that are named in the array of words. | |
StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords); | |
~StopFilter(){} | |
/** Constructs a filter which removes words from the input | |
* TokenStream that are named in the CLSetList. | |
*/ | |
StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList<const TCHAR*>* stopTable): | |
TokenFilter(in, deleteTokenStream), | |
table(stopTable) | |
{} | |
/** | |
* Builds a Hashtable from an array of stop words, appropriate for passing | |
* into the StopFilter constructor. This permits this table construction to | |
* be cached once when an Analyzer is constructed. | |
* Note: the stopWords list must be a static list because the strings are not copied | |
*/ | |
static void fillStopTable(CL_NS(util)::CLSetList<const TCHAR*>* stopTable, | |
const TCHAR** stopWords); | |
/** | |
* Returns the next input Token whose termText() is not a stop word. | |
*/ | |
bool next(Token* token); | |
}; | |
/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */ | |
class StopAnalyzer: public Analyzer { | |
CL_NS(util)::CLSetList<const TCHAR*> stopTable; | |
public: | |
/** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */ | |
StopAnalyzer(); | |
~StopAnalyzer(); | |
/** Builds an analyzer which removes words in the provided array. */ | |
StopAnalyzer( const TCHAR** stopWords ); | |
/** Filters LowerCaseTokenizer with StopFilter. */ | |
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); | |
/** An array containing some common English words that are not usually useful | |
for searching. */ | |
static const TCHAR* ENGLISH_STOP_WORDS[]; | |
}; | |
/** | |
* This analyzer is used to facilitate scenarios where different | |
* fields require different analysis techniques. Use {@link #addAnalyzer} | |
* to add a non-default analyzer on a field name basis. | |
* | |
* <p>Example usage: | |
* | |
* <pre> | |
* PerFieldAnalyzerWrapper aWrapper = | |
* new PerFieldAnalyzerWrapper(new StandardAnalyzer()); | |
* aWrapper.addAnalyzer("firstname", new KeywordAnalyzer()); | |
* aWrapper.addAnalyzer("lastname", new KeywordAnalyzer()); | |
* </pre> | |
* | |
* <p>In this example, StandardAnalyzer will be used for all fields except "firstname" | |
* and "lastname", for which KeywordAnalyzer will be used. | |
* | |
* <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing | |
* and query parsing. | |
*/ | |
class PerFieldAnalyzerWrapper : public Analyzer { | |
private: | |
Analyzer* defaultAnalyzer; | |
CL_NS(util)::CLHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar, | |
CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap; | |
public: | |
/** | |
* Constructs with default analyzer. | |
* | |
* @param defaultAnalyzer Any fields not specifically | |
* defined to use a different analyzer will use the one provided here. | |
*/ | |
PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer); | |
~PerFieldAnalyzerWrapper(); | |
/** | |
* Defines an analyzer to use for the specified field. | |
* | |
* @param fieldName field name requiring a non-default analyzer | |
* @param analyzer non-default analyzer to use for field | |
*/ | |
void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer); | |
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); | |
}; | |
/** | |
* A filter that replaces accented characters in the ISO Latin 1 character set | |
* (ISO-8859-1) by their unaccented equivalent. The case will not be altered. | |
* <p> | |
* For instance, 'à' will be replaced by 'a'. | |
* <p> | |
*/ | |
class ISOLatin1AccentFilter: public TokenFilter { | |
public: | |
ISOLatin1AccentFilter(TokenStream* input, bool deleteTs): | |
TokenFilter(input,deleteTs) | |
{ | |
} | |
/** | |
* To replace accented characters in a String by unaccented equivalents. | |
*/ | |
bool next(Token* token); | |
}; | |
/** | |
* Emits the entire input as a single token. | |
*/ | |
class KeywordTokenizer: public Tokenizer { | |
private: | |
LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256); | |
bool done; | |
int bufferSize; | |
public: | |
KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1); | |
virtual ~KeywordTokenizer(); | |
bool next(Token* token); | |
}; | |
/** | |
* "Tokenizes" the entire stream as a single token. This is useful | |
* for data like zip codes, ids, and some product names. | |
*/ | |
class KeywordAnalyzer: public Analyzer { | |
public: | |
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); | |
virtual ~KeywordAnalyzer(){} | |
}; | |
/** | |
* Removes words that are too long and too short from the stream. | |
* | |
*/ | |
class LengthFilter: public TokenFilter { | |
private: | |
int _min; | |
int _max; | |
public: | |
/** | |
* Build a filter that removes words that are too long or too | |
* short from the text. | |
*/ | |
LengthFilter(TokenStream* in, int _min, int _max); | |
/** | |
* Returns the next input Token whose termText() is the right len | |
*/ | |
bool next(Token* token); | |
}; | |
CL_NS_END | |
#endif |