// Windows-4.7.4/src/3rdparty/clucene/src/CLucene/analysis/Analyzers.h - platform/external/qt - Git at Google

 /*------------------------------------------------------------------------------
 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
 *
 * Distributable under the terms of either the Apache License (Version 2.0) or
 * the GNU Lesser General Public License, as specified in the COPYING file.
 ------------------------------------------------------------------------------*/
 #ifndef _lucene_analysis_Analyzers_
 #define _lucene_analysis_Analyzers_

 #if defined(_LUCENE_PRAGMA_ONCE)
 # pragma once
 #endif

 #include "CLucene/util/Reader.h"
 #include "AnalysisHeader.h"
 #include "CLucene/util/Misc.h"

 CL_NS_DEF(analysis)

 /** An abstract base class for simple, character-oriented tokenizers.
  *
  * Subclasses decide which characters belong to a token by implementing
  * isTokenChar(), and may rewrite each accepted character by overriding
  * normalize().  The member functions declared without a body here are
  * defined outside this header. */
 class CharTokenizer:public Tokenizer {
 private:
 	// Scanning state.  Judging by the names these track the position in the
 	// input and the cursor / fill level of ioBuffer -- TODO confirm against
 	// the implementation file.
 	int32_t offset, bufferIndex, dataLen;
 	// Character storage sized for LUCENE_MAX_WORD_LEN characters plus one
 	// extra slot (presumably for a terminator -- TODO confirm).
 	TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
 	// Pointer to const characters; presumably the chunk of input currently
 	// being scanned -- TODO confirm.
 	const TCHAR* ioBuffer;
 protected:

     /** Returns true iff a character should be included in a token.  This
     * tokenizer generates as tokens adjacent sequences of characters which
     * satisfy this predicate.  Characters for which this is false are used to
     * define token boundaries and are not included in tokens.
     * Pure virtual: every concrete subclass must provide it. */
 	virtual bool isTokenChar(const TCHAR c) const = 0;

     /** Called on each token character to normalize it before it is added to the
     * token.  The default implementation does nothing.  Subclasses may use this
     * to, e.g., lowercase tokens.
     * @param c the character that was accepted by isTokenChar()
     * @return the character to store in the token */
    	virtual TCHAR normalize(const TCHAR c) const;

 public:
 	/** Creates a tokenizer over the given reader.
 	 * @param in source of the characters to tokenize; ownership is not
 	 *           visible from this header */
 	CharTokenizer(CL_NS(util)::Reader* in);
 	/** Virtual so that subclasses can be destroyed through a base pointer;
 	 * the body is intentionally empty. */
 	virtual ~CharTokenizer(){
 	}
 	/** Produces the next token into `token`.
 	 * NOTE(review): presumably returns false once the input is exhausted --
 	 * confirm against the implementation. */
 	bool next(Token* token);
 };


 /** A LetterTokenizer is a tokenizer that divides text at non-letters.  That is
 to say, it defines tokens as maximal strings of adjacent letters.  The wording
 refers to the java.lang.Character.isLetter() predicate; the isTokenChar()
 declaration in this class documents the predicate as _istalpha.

 Note: this does a decent job for most European languages, but does a terrible
 job for some Asian languages, where words are not separated by spaces. */
 class LetterTokenizer:public CharTokenizer {
 public:
 	// Construct a new LetterTokenizer; `in` is forwarded unchanged to the
 	// CharTokenizer constructor.
 	LetterTokenizer(CL_NS(util)::Reader* in):
 	CharTokenizer(in) {}

     // Empty destructor: this class declares no data members of its own.
     ~LetterTokenizer(){}
 protected:
     /** Token-membership predicate: collects only characters which satisfy
      * _istalpha.  Defined outside this header. */
 	bool isTokenChar(const TCHAR c) const;
 };


 /**
 * LowerCaseTokenizer performs the function of LetterTokenizer
 * and LowerCaseFilter together.  It divides text at non-letters and converts
 * the resulting tokens to lower case.  While it is functionally equivalent to
 * the combination of LetterTokenizer and LowerCaseFilter, there is a
 * performance advantage to doing the two tasks at once, hence this
 * (redundant) implementation.
 * <P>
 * Note: this does a decent job for most European languages, but does a terrible
 * job for some Asian languages, where words are not separated by spaces.
 */
 class LowerCaseTokenizer:public LetterTokenizer {
 public:
 	/** Construct a new LowerCaseTokenizer; `in` is forwarded unchanged to the
 	 * LetterTokenizer constructor. */
 	LowerCaseTokenizer(CL_NS(util)::Reader* in):
 	LetterTokenizer(in) {}

     // Empty destructor: this class declares no data members of its own.
     ~LowerCaseTokenizer(){}
 protected:
 	/** Normalization hook: maps each token character to its lower-case form
 	 * (via _totlower per the upstream note; the definition is outside this
 	 * header).  This does not filter characters -- token membership is still
 	 * decided by the inherited isTokenChar(). */
 	TCHAR normalize(const TCHAR chr) const;
 };


 /** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
  * Adjacent sequences of non-whitespace characters form tokens.  It builds on
  * CharTokenizer and only supplies the isTokenChar() predicate. */
 class WhitespaceTokenizer: public CharTokenizer {
 public:
 	/** Construct a new WhitespaceTokenizer.
 	 * @param in reader handed straight to the CharTokenizer constructor */
 	WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {}
 	// Empty destructor: this class declares no data members of its own.
 	~WhitespaceTokenizer(){}
 protected:
 	/** Token-membership predicate: collects only characters which do not
 	 * satisfy _istspace, so whitespace acts as the token separator.
 	 * Defined outside this header.
 	*/
 	bool isTokenChar(const TCHAR c) const;
 };


 /** An Analyzer that uses WhitespaceTokenizer. */
 class WhitespaceAnalyzer: public Analyzer {
  public:
   /** Creates the token stream for a field.
    * @param fieldName name of the field being analyzed (whether it is
    *                  consulted is not visible from this header)
    * @param reader    source of the text to tokenize
    * @return the token stream; NOTE(review): presumably heap-allocated and
    *         owned by the caller -- confirm against the implementation */
   TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
   // Empty destructor: this class declares no data members.
   ~WhitespaceAnalyzer(){}
 };

 /** An Analyzer that filters LetterTokenizer with LowerCaseFilter, i.e. it
  * splits text at non-letters and lower-cases the resulting tokens. */
 class SimpleAnalyzer: public Analyzer {
 public:
 	/** Creates the token stream for a field.
 	 * @param fieldName name of the field being analyzed (whether it is
 	 *                  consulted is not visible from this header)
 	 * @param reader    source of the text to tokenize
 	 * @return the token stream; NOTE(review): presumably heap-allocated and
 	 *         owned by the caller -- confirm against the implementation */
 	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
 	// Empty destructor: this class declares no data members.
 	~SimpleAnalyzer(){}
 };


 /**
 * Normalizes token text to lower case.
 *
 * A TokenFilter that wraps another TokenStream; both constructor arguments
 * are forwarded unchanged to TokenFilter.
 */
 class LowerCaseFilter: public TokenFilter {
 public:
 	/** @param in                the wrapped token stream
 	 *  @param deleteTokenStream passed through to TokenFilter; by its name it
 	 *                           controls whether `in` is deleted with the
 	 *                           filter -- TODO confirm in TokenFilter */
 	LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {}
 	// Empty destructor: this class declares no data members.
 	~LowerCaseFilter(){}
 	/** Fetches the next token into `token` with its text lower-cased.
 	 * NOTE(review): presumably returns false when the wrapped stream is
 	 * exhausted -- confirm against the implementation. */
 	bool next(Token* token);
 };


 /**
  * Removes stop words from a token stream.
  *
  * The stop words are held in a CLSetList of C-string pointers that this
  * filter reaches through the `table` pointer.
  */
 class StopFilter: public TokenFilter {
 private:
 	//bvk: i found this to work faster with a non-hash table. the number of items
 	//in the stop table is not likely to make it worth having hashing.
 	// Pointer to the stop-word set.  The second constructor stores the
 	// caller's pointer as-is, so in that case the set is shared, not copied.
 	CL_NS(util)::CLSetList<const TCHAR*>* table;
 public:
 	// Constructs a filter which removes words from the input
 	//	TokenStream that are named in the array of words.
 	// Defined outside this header.  NOTE(review): the array is presumably
 	// NULL-terminated since no length is passed -- confirm.
 	StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords);

 	// The destructor body is empty, so this class never deletes `table`.
 	// NOTE(review): if the stopWords constructor allocates the table, that
 	// allocation would not be released here -- check the implementation.
 	~StopFilter(){}

 	/** Constructs a filter which removes words from the input
 	*	TokenStream that are named in the CLSetList.
 	*	The list pointer is stored, not copied, so `stopTable` must outlive
 	*	this filter.
 	*/
 	StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList<const TCHAR*>* stopTable):
 		TokenFilter(in, deleteTokenStream),
 		table(stopTable)
 	{}


 	/**
 	* Fills the given CLSetList from an array of stop words, producing a table
 	* appropriate for passing into the StopFilter constructor.  This permits
 	* the table construction to be cached once when an Analyzer is constructed.
 	* Note: the stopWords list must be a static list because the strings are not copied
 	* @param stopTable destination set, modified in place
 	* @param stopWords array of C strings to insert
 	*/
 	static void fillStopTable(CL_NS(util)::CLSetList<const TCHAR*>* stopTable,
                                       const TCHAR** stopWords);

 	/**
 	* Returns the next input Token whose termText() is not a stop word.
 	*/
 	bool next(Token* token);
 };


 /** Filters LetterTokenizer with LowerCaseFilter and StopFilter.  (The
  * tokenStream() declaration describes the same pipeline as
  * LowerCaseTokenizer followed by StopFilter.) */
 class StopAnalyzer: public Analyzer {
     // Stop-word set held by value inside the analyzer; it stores pointers to
     // the words, not copies of the strings.
     CL_NS(util)::CLSetList<const TCHAR*> stopTable;

 public:
     /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
     StopAnalyzer();
     /** Defined outside this header. */
     ~StopAnalyzer();

     /** Builds an analyzer which removes words in the provided array.
      * NOTE(review): the array presumably has to be NULL-terminated and to
      * outlive the analyzer, since only pointers are stored -- confirm. */
     StopAnalyzer( const TCHAR** stopWords );
     /** Filters LowerCaseTokenizer with StopFilter.
      * @param fieldName name of the field being analyzed
      * @param reader    source of the text to tokenize */
     TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);

 	/** An array containing some common English words that are not usually useful
     for searching.  Declared here; the contents are defined elsewhere. */
     static const TCHAR* ENGLISH_STOP_WORDS[];
 };


 /**
  * This analyzer is used to facilitate scenarios where different
  * fields require different analysis techniques.  Use {@link #addAnalyzer}
  * to add a non-default analyzer on a field name basis.
  *
  * <p>Example usage (schematic; field names are TCHAR strings in this API):
  *
  * <pre>
  *   PerFieldAnalyzerWrapper aWrapper =
  *      new PerFieldAnalyzerWrapper(new StandardAnalyzer());
  *   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
  *   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
  * </pre>
  *
  * <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
  * and "lastname", for which KeywordAnalyzer will be used.
  *
  * <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
  * and query parsing.
  */
 class PerFieldAnalyzerWrapper : public Analyzer {
 private:
     // Analyzer used for every field that has no entry in analyzerMap.
     Analyzer* defaultAnalyzer;
     // Field name -> analyzer.  Keys are ordered/compared with Compare::TChar
     // and Equals::TChar; the key and value deletor policies are
     // Deletor::tcArray and Deletor::Void<Analyzer>.
     // NOTE(review): whether the map actually owns keys/values depends on how
     // it is constructed in the implementation -- confirm there.
     CL_NS(util)::CLHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar,
     CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap;
 public:
     /**
     * Constructs with default analyzer.
     *
     * @param defaultAnalyzer Any fields not specifically
     * defined to use a different analyzer will use the one provided here.
     */
     PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);
     /** Defined outside this header. */
     ~PerFieldAnalyzerWrapper();

     /**
     * Defines an analyzer to use for the specified field.
     *
     * @param fieldName field name requiring a non-default analyzer
     * @param analyzer non-default analyzer to use for field
     */
     void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer);
     /**
     * Creates the token stream for a field, using the analyzer registered for
     * `fieldName` or the default analyzer when none is registered.
     *
     * @param fieldName field whose analyzer should be used
     * @param reader source of the text to tokenize
     */
     TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
 };


 /**
  * A filter that replaces accented characters in the ISO Latin 1 character set
  * (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
  * <p>
  * For instance, '&agrave;' (a with grave accent) will be replaced by 'a'.
  */
 class ISOLatin1AccentFilter: public TokenFilter {
 public:
 	/** Wraps `input`; both arguments are forwarded unchanged to TokenFilter.
 	 * @param input    the wrapped token stream
 	 * @param deleteTs passed through to TokenFilter; by its name it controls
 	 *                 whether `input` is deleted with the filter -- TODO
 	 *                 confirm in TokenFilter */
 	ISOLatin1AccentFilter(TokenStream* input, bool deleteTs):
 		TokenFilter(input,deleteTs)
 	{
 	}

 	/**
 	 * Fetches the next token into `token`, replacing accented characters in
 	 * its text by unaccented equivalents.  Defined outside this header.
 	 */
 	bool next(Token* token);
 };


 /**
  * Emits the entire input as a single token.
  */
 class KeywordTokenizer: public Tokenizer {
 private:
     // Class-level constant: default buffer size of 256.
     LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256);
     // Presumably set once the single token has been emitted -- TODO confirm.
     bool done;
     // Buffer size used when reading; see the constructor's bufferSize
     // argument.
     int bufferSize;
 public:
     /** @param input      source of the text
      *  @param bufferSize read buffer size; defaults to -1.
      *         NOTE(review): -1 presumably selects DEFAULT_BUFFER_SIZE --
      *         confirm against the implementation. */
     KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1);
     /** Defined outside this header. */
     virtual ~KeywordTokenizer();
     /** Produces the whole input as one token in `token`.
      * NOTE(review): presumably returns false on subsequent calls -- confirm. */
     bool next(Token* token);
 };

 /**
  * "Tokenizes" the entire stream as a single token. This is useful
  * for data like zip codes, ids, and some product names.
  */
 class KeywordAnalyzer: public Analyzer {
 public:
     /** Creates the token stream for a field.
      * @param fieldName name of the field being analyzed (whether it is
      *                  consulted is not visible from this header)
      * @param reader    source of the text
      * @return the token stream; NOTE(review): presumably a KeywordTokenizer
      *         owned by the caller -- confirm against the implementation */
     TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
     // Empty destructor: this class declares no data members.
     virtual ~KeywordAnalyzer(){}
 };


 /**
  * Removes words that are too long or too short from the stream.
  */
 class LengthFilter: public TokenFilter {
 private:
     // Length bounds for tokens that are let through.  Whether the bounds are
     // inclusive is not visible from this header -- TODO confirm.
     int _min;
     int _max;
 public:
     /**
     * Build a filter that removes words that are too long or too
     * short from the text.
     *
     * @param in   the wrapped token stream
     * @param _min lower length bound
     * @param _max upper length bound
     */
     LengthFilter(TokenStream* in, int _min, int _max);

     /**
     * Returns the next input Token whose termText() has an acceptable length.
     */
     bool next(Token* token);
 };


 CL_NS_END
 #endif
	/*------------------------------------------------------------------------------
	* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
	*
	* Distributable under the terms of either the Apache License (Version 2.0) or
	* the GNU Lesser General Public License, as specified in the COPYING file.
	------------------------------------------------------------------------------*/
	#ifndef _lucene_analysis_Analyzers_
	#define _lucene_analysis_Analyzers_

	#if defined(_LUCENE_PRAGMA_ONCE)
	# pragma once
	#endif

	#include "CLucene/util/Reader.h"
	#include "AnalysisHeader.h"
	#include "CLucene/util/Misc.h"

	CL_NS_DEF(analysis)

	/** An abstract base class for simple, character-oriented tokenizers.
	*
	* Subclasses decide which characters belong to a token by implementing
	* isTokenChar(), and may rewrite each accepted character by overriding
	* normalize().  Member functions declared without a body here are defined
	* outside this header. */
	class CharTokenizer:public Tokenizer {
	private:
	// Scanning state.  Judging by the names these track the position in the
	// input and the cursor / fill level of ioBuffer -- TODO confirm.
	int32_t offset, bufferIndex, dataLen;
	// Character storage sized for LUCENE_MAX_WORD_LEN characters plus one
	// extra slot (presumably for a terminator -- TODO confirm).
	TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
	// Pointer to const characters; presumably the chunk of input currently
	// being scanned -- TODO confirm.
	const TCHAR* ioBuffer;
	protected:

	/** Returns true iff a character should be included in a token. This
	* tokenizer generates as tokens adjacent sequences of characters which
	* satisfy this predicate. Characters for which this is false are used to
	* define token boundaries and are not included in tokens.
	* Pure virtual: every concrete subclass must provide it. */
	virtual bool isTokenChar(const TCHAR c) const = 0;

	/** Called on each token character to normalize it before it is added to the
	* token. The default implementation does nothing. Subclasses may use this
	* to, e.g., lowercase tokens.
	* @param c the character that was accepted by isTokenChar()
	* @return the character to store in the token */
	virtual TCHAR normalize(const TCHAR c) const;

	public:
	/** Creates a tokenizer over the given reader.
	* @param in source of the characters to tokenize; ownership is not
	*           visible from this header */
	CharTokenizer(CL_NS(util)::Reader* in);
	/** Virtual so that subclasses can be destroyed through a base pointer;
	* the body is intentionally empty. */
	virtual ~CharTokenizer(){
	}
	/** Produces the next token into `token`.
	* NOTE(review): presumably returns false once the input is exhausted --
	* confirm against the implementation. */
	bool next(Token* token);
	};


	/** A LetterTokenizer is a tokenizer that divides text at non-letters. That is
	to say, it defines tokens as maximal strings of adjacent letters. The wording
	refers to the java.lang.Character.isLetter() predicate; the isTokenChar()
	declaration in this class documents the predicate as _istalpha.

	Note: this does a decent job for most European languages, but does a terrible
	job for some Asian languages, where words are not separated by spaces. */
	class LetterTokenizer:public CharTokenizer {
	public:
	// Construct a new LetterTokenizer; `in` is forwarded unchanged to the
	// CharTokenizer constructor.
	LetterTokenizer(CL_NS(util)::Reader* in):
	CharTokenizer(in) {}

	// Empty destructor: this class declares no data members of its own.
	~LetterTokenizer(){}
	protected:
	/** Token-membership predicate: collects only characters which satisfy
	* _istalpha. Defined outside this header. */
	bool isTokenChar(const TCHAR c) const;
	};



	/**
	* LowerCaseTokenizer performs the function of LetterTokenizer
	* and LowerCaseFilter together. It divides text at non-letters and converts
	* the resulting tokens to lower case. While it is functionally equivalent to
	* the combination of LetterTokenizer and LowerCaseFilter, there is a
	* performance advantage to doing the two tasks at once, hence this
	* (redundant) implementation.
	* <P>
	* Note: this does a decent job for most European languages, but does a terrible
	* job for some Asian languages, where words are not separated by spaces.
	*/
	class LowerCaseTokenizer:public LetterTokenizer {
	public:
	/** Construct a new LowerCaseTokenizer; `in` is forwarded unchanged to the
	* LetterTokenizer constructor. */
	LowerCaseTokenizer(CL_NS(util)::Reader* in):
	LetterTokenizer(in) {}

	// Empty destructor: this class declares no data members of its own.
	~LowerCaseTokenizer(){}
	protected:
	/** Normalization hook: maps each token character to its lower-case form
	* (via _totlower per the upstream note; the definition is outside this
	* header). This does not filter characters -- token membership is still
	* decided by the inherited isTokenChar(). */
	TCHAR normalize(const TCHAR chr) const;
	};


	/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
	* Adjacent sequences of non-whitespace characters form tokens. It builds on
	* CharTokenizer and only supplies the isTokenChar() predicate. */
	class WhitespaceTokenizer: public CharTokenizer {
	public:
	/** Construct a new WhitespaceTokenizer.
	* @param in reader handed straight to the CharTokenizer constructor */
	WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {}
	// Empty destructor: this class declares no data members of its own.
	~WhitespaceTokenizer(){}
	protected:
	/** Token-membership predicate: collects only characters which do not
	* satisfy _istspace, so whitespace acts as the token separator.
	* Defined outside this header.
	*/
	bool isTokenChar(const TCHAR c) const;
	};


	/** An Analyzer that uses WhitespaceTokenizer. */
	class WhitespaceAnalyzer: public Analyzer {
	public:
	/** Creates the token stream for a field.
	* @param fieldName name of the field being analyzed (whether it is
	*                  consulted is not visible from this header)
	* @param reader    source of the text to tokenize
	* @return the token stream; NOTE(review): presumably heap-allocated and
	*         owned by the caller -- confirm against the implementation */
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	// Empty destructor: this class declares no data members.
	~WhitespaceAnalyzer(){}
	};

	/** An Analyzer that filters LetterTokenizer with LowerCaseFilter, i.e. it
	* splits text at non-letters and lower-cases the resulting tokens. */
	class SimpleAnalyzer: public Analyzer {
	public:
	/** Creates the token stream for a field.
	* @param fieldName name of the field being analyzed (whether it is
	*                  consulted is not visible from this header)
	* @param reader    source of the text to tokenize
	* @return the token stream; NOTE(review): presumably heap-allocated and
	*         owned by the caller -- confirm against the implementation */
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	// Empty destructor: this class declares no data members.
	~SimpleAnalyzer(){}
	};



	/**
	* Normalizes token text to lower case.
	*
	* A TokenFilter that wraps another TokenStream; both constructor arguments
	* are forwarded unchanged to TokenFilter.
	*/
	class LowerCaseFilter: public TokenFilter {
	public:
	/** @param in                the wrapped token stream
	*  @param deleteTokenStream passed through to TokenFilter; by its name it
	*                           controls whether `in` is deleted with the
	*                           filter -- TODO confirm in TokenFilter */
	LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {}
	// Empty destructor: this class declares no data members.
	~LowerCaseFilter(){}
	/** Fetches the next token into `token` with its text lower-cased.
	* NOTE(review): presumably returns false when the wrapped stream is
	* exhausted -- confirm against the implementation. */
	bool next(Token* token);
	};


	/**
	* Removes stop words from a token stream.
	*
	* The stop words are held in a CLSetList of C-string pointers that this
	* filter reaches through the `table` pointer.
	*/
	class StopFilter: public TokenFilter {
	private:
	//bvk: i found this to work faster with a non-hash table. the number of items
	//in the stop table is not likely to make it worth having hashing.
	// Pointer to a set of C-string pointers: the element type must be
	// `const TCHAR*` (a word), not `const TCHAR` (a single character), and the
	// member is a pointer so that a caller-supplied table can be shared.
	CL_NS(util)::CLSetList<const TCHAR*>* table;
	public:
	// Constructs a filter which removes words from the input
	// TokenStream that are named in the array of words.
	// Defined outside this header.  NOTE(review): the array is presumably
	// NULL-terminated since no length is passed -- confirm.
	StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords);

	// The destructor body is empty, so this class never deletes `table`.
	// NOTE(review): if the stopWords constructor allocates the table, that
	// allocation would not be released here -- check the implementation.
	~StopFilter(){}

	/** Constructs a filter which removes words from the input
	* TokenStream that are named in the CLSetList.
	* The list pointer is stored, not copied, so `stopTable` must outlive
	* this filter.
	*/
	StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList<const TCHAR*>* stopTable):
	TokenFilter(in, deleteTokenStream),
	table(stopTable)
	{}


	/**
	* Fills the given CLSetList from an array of stop words, producing a table
	* appropriate for passing into the StopFilter constructor. This permits
	* the table construction to be cached once when an Analyzer is constructed.
	* Note: the stopWords list must be a static list because the strings are not copied
	* @param stopTable destination set, modified in place (hence a pointer
	*                  rather than a by-value copy)
	* @param stopWords array of C strings to insert
	*/
	static void fillStopTable(CL_NS(util)::CLSetList<const TCHAR*>* stopTable,
	const TCHAR** stopWords);

	/**
	* Returns the next input Token whose termText() is not a stop word.
	*/
	bool next(Token* token);
	};




	/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. (The
	* tokenStream() declaration describes the same pipeline as
	* LowerCaseTokenizer followed by StopFilter.) */
	class StopAnalyzer: public Analyzer {
	// Stop-word set held by value inside the analyzer; it stores pointers to
	// the words, not copies of the strings.
	CL_NS(util)::CLSetList<const TCHAR*> stopTable;

	public:
	/** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
	StopAnalyzer();
	/** Defined outside this header. */
	~StopAnalyzer();

	/** Builds an analyzer which removes words in the provided array.
	* NOTE(review): the array presumably has to be NULL-terminated and to
	* outlive the analyzer, since only pointers are stored -- confirm. */
	StopAnalyzer( const TCHAR** stopWords );
	/** Filters LowerCaseTokenizer with StopFilter.
	* @param fieldName name of the field being analyzed
	* @param reader    source of the text to tokenize */
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);

	/** An array containing some common English words that are not usually useful
	for searching. Declared here; the contents are defined elsewhere. */
	static const TCHAR* ENGLISH_STOP_WORDS[];
	};



	/**
	* This analyzer is used to facilitate scenarios where different
	* fields require different analysis techniques. Use {@link #addAnalyzer}
	* to add a non-default analyzer on a field name basis.
	*
	* <p>Example usage (schematic; field names are TCHAR strings in this API):
	*
	* <pre>
	* PerFieldAnalyzerWrapper aWrapper =
	* new PerFieldAnalyzerWrapper(new StandardAnalyzer());
	* aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
	* aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
	* </pre>
	*
	* <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
	* and "lastname", for which KeywordAnalyzer will be used.
	*
	* <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
	* and query parsing.
	*/
	class PerFieldAnalyzerWrapper : public Analyzer {
	private:
	// Analyzer used for every field that has no entry in analyzerMap.
	Analyzer* defaultAnalyzer;
	// Field name -> analyzer.  Both template arguments must be pointer types:
	// the key is a TCHAR string (matching Compare::TChar / Equals::TChar and
	// the tcArray key deletor) and the value is an Analyzer*, which is what
	// addAnalyzer() receives.
	// NOTE(review): whether the map actually owns keys/values depends on how
	// it is constructed in the implementation -- confirm there.
	CL_NS(util)::CLHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar,
	CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap;
	public:
	/**
	* Constructs with default analyzer.
	*
	* @param defaultAnalyzer Any fields not specifically
	* defined to use a different analyzer will use the one provided here.
	*/
	PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);
	/** Defined outside this header. */
	~PerFieldAnalyzerWrapper();

	/**
	* Defines an analyzer to use for the specified field.
	*
	* @param fieldName field name requiring a non-default analyzer
	* @param analyzer non-default analyzer to use for field
	*/
	void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer);
	/**
	* Creates the token stream for a field, using the analyzer registered for
	* `fieldName` or the default analyzer when none is registered.
	*
	* @param fieldName field whose analyzer should be used
	* @param reader source of the text to tokenize
	*/
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	};


	/**
	* A filter that replaces accented characters in the ISO Latin 1 character set
	* (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
	* <p>
	* For instance, 'à' (a with grave accent) will be replaced by 'a'.
	*/
	class ISOLatin1AccentFilter: public TokenFilter {
	public:
	/** Wraps `input`; both arguments are forwarded unchanged to TokenFilter.
	* @param input    the wrapped token stream
	* @param deleteTs passed through to TokenFilter; by its name it controls
	*                 whether `input` is deleted with the filter -- TODO
	*                 confirm in TokenFilter */
	ISOLatin1AccentFilter(TokenStream* input, bool deleteTs):
	TokenFilter(input,deleteTs)
	{
	}

	/**
	* Fetches the next token into `token`, replacing accented characters in
	* its text by unaccented equivalents. Defined outside this header.
	*/
	bool next(Token* token);
	};


	/**
	* Emits the entire input as a single token.
	*/
	class KeywordTokenizer: public Tokenizer {
	private:
	// Class-level constant: default buffer size of 256.
	LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256);
	// Presumably set once the single token has been emitted -- TODO confirm.
	bool done;
	// Buffer size used when reading; see the constructor's bufferSize
	// argument.
	int bufferSize;
	public:
	/** @param input      source of the text
	*  @param bufferSize read buffer size; defaults to -1.
	*         NOTE(review): -1 presumably selects DEFAULT_BUFFER_SIZE --
	*         confirm against the implementation. */
	KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1);
	/** Defined outside this header. */
	virtual ~KeywordTokenizer();
	/** Produces the whole input as one token in `token`.
	* NOTE(review): presumably returns false on subsequent calls -- confirm. */
	bool next(Token* token);
	};

	/**
	* "Tokenizes" the entire stream as a single token. This is useful
	* for data like zip codes, ids, and some product names.
	*/
	class KeywordAnalyzer: public Analyzer {
	public:
	/** Creates the token stream for a field.
	* @param fieldName name of the field being analyzed (whether it is
	*                  consulted is not visible from this header)
	* @param reader    source of the text
	* @return the token stream; NOTE(review): presumably a KeywordTokenizer
	*         owned by the caller -- confirm against the implementation */
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	// Empty destructor: this class declares no data members.
	virtual ~KeywordAnalyzer(){}
	};


	/**
	* Removes words that are too long or too short from the stream.
	*/
	class LengthFilter: public TokenFilter {
	private:
	// Length bounds for tokens that are let through. Whether the bounds are
	// inclusive is not visible from this header -- TODO confirm.
	int _min;
	int _max;
	public:
	/**
	* Build a filter that removes words that are too long or too
	* short from the text.
	*
	* @param in   the wrapped token stream
	* @param _min lower length bound
	* @param _max upper length bound
	*/
	LengthFilter(TokenStream* in, int _min, int _max);

	/**
	* Returns the next input Token whose termText() has an acceptable length.
	*/
	bool next(Token* token);
	};


	CL_NS_END
	#endif