blob: a12bd653f9d9ead902419af51ef588b6053e252f [file] [log] [blame]
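/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/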
#ifndef _lucene_analysis_Analyzers_
#define _lucene_analysis_Analyzers_
// Use #pragma once in addition to the include guard when the build enables it.
#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif
#include "CLucene/util/Reader.h"
#include "AnalysisHeader.h"
#include "CLucene/util/Misc.h"
/** An abstract base class for simple, character-oriented tokenizers.*/
class CharTokenizer:public Tokenizer {
int32_t offset, bufferIndex, dataLen;
const TCHAR* ioBuffer;
/** Returns true iff a character should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
* satisfy this predicate. Characters for which this is false are used to
* define token boundaries and are not included in tokens. */
virtual bool isTokenChar(const TCHAR c) const = 0;
/** Called on each token character to normalize it before it is added to the
* token. The default implementation does nothing. Subclasses may use this
* to, e.g., lowercase tokens. */
virtual TCHAR normalize(const TCHAR c) const;
CharTokenizer(CL_NS(util)::Reader* in);
virtual ~CharTokenizer(){
bool next(Token* token);
/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's
 * to say, it defines tokens as maximal strings of adjacent letters, as defined
 * by java.lang.Character.isLetter() predicate.
 *
 * Note: this does a decent job for most European languages, but does a terrible
 * job for some Asian languages, where words are not separated by spaces. */
class LetterTokenizer: public CharTokenizer {
    // NOTE(review): access specifiers were absent from this declaration; the
    // split below is inferred (LowerCaseTokenizer calls this constructor) -- confirm.
public:
    /** Construct a new LetterTokenizer. */
    LetterTokenizer(CL_NS(util)::Reader* in):
        CharTokenizer(in) {}

protected:
    /** Collects only characters which satisfy _istalpha. */
    bool isTokenChar(const TCHAR c) const;
};
* LowerCaseTokenizer performs the function of LetterTokenizer
* and LowerCaseFilter together. It divides text at non-letters and converts
* them to lower case. While it is functionally equivalent to the combination
* of LetterTokenizer and LowerCaseFilter, there is a performance advantage
* to doing the two tasks at once, hence this (redundant) implementation.
* <P>
* Note: this does a decent job for most European languages, but does a terrible
* job for some Asian languages, where words are not separated by spaces.
class LowerCaseTokenizer:public LetterTokenizer {
/** Construct a new LowerCaseTokenizer. */
LowerCaseTokenizer(CL_NS(util)::Reader* in):
LetterTokenizer(in) {}
/** Collects only characters which satisfy _totlower. */
TCHAR normalize(const TCHAR chr) const;
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
* Adjacent sequences of non-Whitespace characters form tokens. */
class WhitespaceTokenizer: public CharTokenizer {
/** Construct a new WhitespaceTokenizer. */
WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {}
/** Collects only characters which do not satisfy _istspace.
bool isTokenChar(const TCHAR c) const;
/** An Analyzer that uses WhitespaceTokenizer. */
class WhitespaceAnalyzer: public Analyzer {
public:
    /** Creates the token stream used to analyze `reader` for field `fieldName`.
     * Defined in the implementation file. */
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
};
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
class SimpleAnalyzer: public Analyzer {
public:
    /** Creates the token stream used to analyze `reader` for field `fieldName`.
     * Defined in the implementation file. */
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
};
* Normalizes token text to lower case.
class LowerCaseFilter: public TokenFilter {
LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {}
bool next(Token* token);
* Removes stop words from a token stream.
class StopFilter: public TokenFilter {
//bvk: i found this to work faster with a non-hash table. the number of items
//in the stop table is not like to make it worth having hashing.
CL_NS(util)::CLSetList<const TCHAR*>* table;
// Constructs a filter which removes words from the input
// TokenStream that are named in the array of words.
StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords);
/** Constructs a filter which removes words from the input
* TokenStream that are named in the CLSetList.
StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList<const TCHAR*>* stopTable):
TokenFilter(in, deleteTokenStream),
* Builds a Hashtable from an array of stop words, appropriate for passing
* into the StopFilter constructor. This permits this table construction to
* be cached once when an Analyzer is constructed.
* Note: the stopWords list must be a static list because the strings are not copied
static void fillStopTable(CL_NS(util)::CLSetList<const TCHAR*>* stopTable,
const TCHAR** stopWords);
* Returns the next input Token whose termText() is not a stop word.
bool next(Token* token);
/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
class StopAnalyzer: public Analyzer {
    // NOTE(review): access specifiers were absent from this declaration; the
    // private/public split below is inferred -- confirm.
private:
    CL_NS(util)::CLSetList<const TCHAR*> stopTable;

public:
    /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS.
     *
     * NOTE(review): this doc comment had no declaration after it; the default
     * constructor is restored from the comment -- confirm a matching
     * definition exists in the implementation file. */
    StopAnalyzer();

    /** Builds an analyzer which removes words in the provided array. */
    StopAnalyzer( const TCHAR** stopWords );

    /** Filters LowerCaseTokenizer with StopFilter. */
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);

    /** An array containing some common English words that are not usually useful
     * for searching.
     *
     * NOTE(review): this doc comment had no declaration after it; the name is
     * taken from the constructor comment and the element type from the
     * `stopWords` parameter -- confirm against the implementation file. */
    static const TCHAR* ENGLISH_STOP_WORDS[];
};
* This analyzer is used to facilitate scenarios where different
* fields require different analysis techniques. Use {@link #addAnalyzer}
* to add a non-default analyzer on a field name basis.
* <p>Example usage:
* <pre>
* PerFieldAnalyzerWrapper aWrapper =
* new PerFieldAnalyzerWrapper(new StandardAnalyzer());
* aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
* aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
* </pre>
* <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
* and "lastname", for which KeywordAnalyzer will be used.
* <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
* and query parsing.
class PerFieldAnalyzerWrapper : public Analyzer {
Analyzer* defaultAnalyzer;
CL_NS(util)::CLHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar,
CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap;
* Constructs with default analyzer.
* @param defaultAnalyzer Any fields not specifically
* defined to use a different analyzer will use the one provided here.
PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);
* Defines an analyzer to use for the specified field.
* @param fieldName field name requiring a non-default analyzer
* @param analyzer non-default analyzer to use for field
void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer);
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
* A filter that replaces accented characters in the ISO Latin 1 character set
* (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
* <p>
* For instance, '&agrave;' will be replaced by 'a'.
* <p>
class ISOLatin1AccentFilter: public TokenFilter {
ISOLatin1AccentFilter(TokenStream* input, bool deleteTs):
* To replace accented characters in a String by unaccented equivalents.
bool next(Token* token);
* Emits the entire input as a single token.
class KeywordTokenizer: public Tokenizer {
bool done;
int bufferSize;
KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1);
virtual ~KeywordTokenizer();
bool next(Token* token);
* "Tokenizes" the entire stream as a single token. This is useful
* for data like zip codes, ids, and some product names.
class KeywordAnalyzer: public Analyzer {
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
virtual ~KeywordAnalyzer(){}
* Removes words that are too long and too short from the stream.
class LengthFilter: public TokenFilter {
int _min;
int _max;
* Build a filter that removes words that are too long or too
* short from the text.
LengthFilter(TokenStream* in, int _min, int _max);
* Returns the next input Token whose termText() is the right len
bool next(Token* token);