jni/include/matrixsearch.h - platform/packages/inputmethods/PinyinIME - Git at Google

 /*
  * Copyright (C) 2009 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
 #define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__

 #include <stdlib.h>
 #include "./atomdictbase.h"
 #include "./dicttrie.h"
 #include "./searchutility.h"
 #include "./spellingtrie.h"
 #include "./splparser.h"

 namespace ime_pinyin {

 static const size_t kMaxRowNum = kMaxSearchSteps;

 typedef struct {
   // MileStoneHandle objects for the system and user dictionaries.
   MileStoneHandle dict_handles[2];
   // From which DMI node. -1 means it's from root.
   PoolPosType dmi_fr;
   // The spelling id for the Pinyin string from the previous DMI to this node.
   // If it is a half id like Shengmu, the node pointed by dict_node is the first
   // node with this Shengmu,
   uint16 spl_id;
   // What's the level of the dict node. Level of root is 0, but root is never
   // recorded by dict_node.
   unsigned char dict_level:7;
   // If this node is for composing phrase, this bit is 1.
   unsigned char c_phrase:1;
   // Whether the spl_id is parsed with a split character at the end.
   unsigned char splid_end_split:1;
   // What's the length of the spelling string for this match, for the whole
   // word.
   unsigned char splstr_len:7;
   // Used to indicate whether all spelling ids from the root are full spelling
   // ids. This information is useful for keymapping mode(not finished). Because
   // in this mode, there is no clear boundaries, we prefer those results which
   // have full spelling ids.
   unsigned char all_full_id:1;
 } DictMatchInfo, *PDictMatchInfo;

 typedef struct MatrixNode {
   LemmaIdType id;
   float score;
   MatrixNode *from;
   // From which DMI node. Used to trace the spelling segmentation.
   PoolPosType dmi_fr;
   uint16 step;
 } MatrixNode, *PMatrixNode;

 typedef struct {
   // The MatrixNode position in the matrix pool
   PoolPosType mtrx_nd_pos;
   // The DictMatchInfo position in the DictMatchInfo pool.
   PoolPosType dmi_pos;
   uint16 mtrx_nd_num;
   uint16 dmi_num:15;
   // Used to indicate whether there are dmi nodes in this step with full
   // spelling id. This information is used to decide whether a substring of a
   // valid Pinyin should be extended.
   //
   // Example1: shoudao
   // When the last char 'o' is added, the parser will find "dao" is a valid
   // Pinyin, and because all dmi nodes at location 'd' (including those for
   // "shoud", and those for "d") have Shengmu id only, so it is not necessary
   // to extend "ao", otherwise the result may be "shoud ao", that is not
   // reasonable.
   //
   // Example2: hengao
   // When the last 'o' is added, the parser finds "gao" is a valid Pinyin.
   // Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi
   // nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus
   // "heng ao" can also be the result.
   //
   // Similarly, "ganga" is expanded to "gang a".
   //
   // For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi
   // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it
   // is not valid either). If the parser uses break in the loop, the result
   // always be "xian"; but if the parser uses continue in the loop, "xi an" will
   // also be tried. This behaviour can be set via the function
   // set_xi_an_switch().
   uint16 dmi_has_full_id:1;
   // Points to a MatrixNode of the current step to indicate which choice the
   // user selects.
   MatrixNode *mtrx_nd_fixed;
 } MatrixRow, *PMatrixRow;

 // When user inputs and selects candidates, the fixed lemma ids are stored in
 // lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many
 // lemmas from the beginning are fixed. If user deletes Pinyin characters one
 // by one from the end, these fixed lemmas can be unlocked one by one when
 // necessary. Whenever user deletes a Chinese character and its spelling string
 // in these fixed lemmas, all fixed lemmas will be merged together into a unit
 // named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing
 // phrase will be the first lemma in the sentence. Because it contains some
 // modified lemmas (by deleting a character), these merged lemmas are called
 // sub lemmas (sublma), and each of them are represented individually, so that
 // when user deletes Pinyin characters from the end, these sub lemmas can also
 // be unlocked one by one.
 typedef struct {
   uint16 spl_ids[kMaxRowNum];
   uint16 spl_start[kMaxRowNum];
   char16 chn_str[kMaxRowNum];       // Chinese string.
   uint16 sublma_start[kMaxRowNum];  // Counted in Chinese characters.
   size_t sublma_num;
   uint16 length;                    // Counted in Chinese characters.
 } ComposingPhrase, *TComposingPhrase;

 class MatrixSearch {
  private:
   // If it is true, prediction list by string whose length is greater than 1
   // will be limited to a reasonable number.
   static const bool kPredictLimitGt1 = false;

   // If it is true, the engine will prefer long history based prediction,
   // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are
   // based on the two-character history.
   static const bool kPreferLongHistoryPredict = true;

   // If it is true, prediction will only be based on user dictionary. this flag
   // is for debug purpose.
   static const bool kOnlyUserDictPredict = false;

   // The maximum buffer to store LmaPsbItems.
   static const size_t kMaxLmaPsbItems = 1450;

   // How many rows for each step.
   static const size_t kMaxNodeARow = 5;

   // The maximum length of the sentence candidates counted in chinese
   // characters
   static const size_t kMaxSentenceLength = 16;

   // The size of the matrix node pool.
   static const size_t kMtrxNdPoolSize = 200;

   // The size of the DMI node pool.
   static const size_t kDmiPoolSize = 800;

   // Used to indicate whether this object has been initialized.
   bool inited_;

   // Spelling trie.
   const SpellingTrie *spl_trie_;

   // Used to indicate this switcher status: when "xian" is parseed, should
   // "xi an" also be extended. Default is false.
   // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string
   // should be valid for a FULL spelling, or a combination of two spellings,
   // first of which is a FULL id too. So even it is true, "da" will never be
   // split into "d a", because "d" is not a full spelling id.
   bool xi_an_enabled_;

   // System dictionary.
   DictTrie* dict_trie_;

   // User dictionary.
   AtomDictBase* user_dict_;

   // Spelling parser.
   SpellingParser* spl_parser_;

   // The maximum allowed length of spelling string (such as a Pinyin string).
   size_t max_sps_len_;

   // The maximum allowed length of a result Chinese string.
   size_t max_hzs_len_;

   // Pinyin string. Max length: kMaxRowNum - 1
   char pys_[kMaxRowNum];

   // The length of the string that has been decoded successfully.
   size_t pys_decoded_len_;

   // Shared buffer for multiple purposes.
   size_t *share_buf_;

   MatrixNode *mtrx_nd_pool_;
   PoolPosType mtrx_nd_pool_used_;    // How many nodes used in the pool
   DictMatchInfo *dmi_pool_;
   PoolPosType dmi_pool_used_;        // How many items used in the pool

   MatrixRow *matrix_;                // The first row is for starting

   DictExtPara *dep_;                 // Parameter used to extend DMI nodes.

   NPredictItem *npre_items_;         // Used to do prediction
   size_t npre_items_len_;

   // The starting positions and lemma ids for the full sentence candidate.
   size_t lma_id_num_;
   uint16 lma_start_[kMaxRowNum];     // Counted in spelling ids.
   LemmaIdType lma_id_[kMaxRowNum];
   size_t fixed_lmas_;

   // If fixed_lmas_ is bigger than i,  Element i is used to indicate whether
   // the i'th lemma id in lma_id_ is the first candidate for that step.
   // If all candidates are the first one for that step, the whole string can be
   // decoded by the engine automatically, so no need to add it to user
   // dictionary. (We are considering to add it to user dictionary in the
   // future).
   uint8 fixed_lmas_no1_[kMaxRowNum];

   // Composing phrase
   ComposingPhrase c_phrase_;

   // If dmi_c_phrase_ is true, the decoder will try to match the
   // composing phrase (And definitely it will match successfully). If it
   // is false, the decoder will try to match lemmas items in dictionaries.
   bool dmi_c_phrase_;

   // The starting positions and spelling ids for the first full sentence
   // candidate.
   size_t spl_id_num_;                // Number of splling ids
   uint16 spl_start_[kMaxRowNum];     // Starting positions
   uint16 spl_id_[kMaxRowNum];        // Spelling ids
   // Used to remember the last fixed position, counted in Hanzi.
   size_t fixed_hzs_;

   // Lemma Items with possibility score, two purposes:
   // 1. In Viterbi decoding, this buffer is used to get all possible candidates
   // for current step;
   // 2. When the search is done, this buffer is used to get candiates from the
   // first un-fixed step and show them to the user.
   LmaPsbItem lpi_items_[kMaxLmaPsbItems];
   size_t lpi_total_;

   // Assign the pointers with NULL. The caller makes sure that all pointers are
   // not valid before calling it. This function only will be called in the
   // construction function and free_resource().
   void reset_pointers_to_null();

   bool alloc_resource();

   void free_resource();

   // Reset the search space totally.
   bool reset_search0();

   // Reset the search space from ch_pos step. For example, if the original
   // input Pinyin is "an", reset_search(1) will reset the search space to the
   // result of "a". If the given position is out of range, return false.
   // if clear_fixed_this_step is true, and the ch_pos step is a fixed step,
   // clear its fixed status. if clear_dmi_his_step is true, clear the DMI nodes.
   // If clear_mtrx_this_sTep is true, clear the mtrx nodes of this step.
   // The DMI nodes will be kept.
   //
   // Note: this function should not destroy content of pys_.
   bool reset_search(size_t ch_pos, bool clear_fixed_this_step,
                     bool clear_dmi_this_step, bool clear_mtrx_this_step);

   // Delete a part of the content in pys_.
   void del_in_pys(size_t start, size_t len);

   // Delete a spelling id and its corresponding Chinese character, and merge
   // the fixed lemmas into the composing phrase.
   // del_spl_pos indicates which spelling id needs to be delete.
   // This function will update the lemma and spelling segmentation information.
   // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within
   // the fixed lemmas.
   void merge_fixed_lmas(size_t del_spl_pos);

   // Get spelling start posistions and ids. The result will be stored in
   // spl_id_num_, spl_start_[], spl_id_[].
   // fixed_hzs_ will be also assigned.
   void get_spl_start_id();

   // Get all lemma ids with match the given spelling id stream(shorter than the
   // maximum length of a word).
   // If pfullsent is not NULL, means the full sentence candidate may be the
   // same with the coming lemma string, if so, remove that lemma.
   // The result is sorted in descendant order by the frequency score.
   size_t get_lpis(const uint16* splid_str, size_t splid_str_len,
                   LmaPsbItem* lma_buf, size_t max_lma_buf,
                   const char16 *pfullsent, bool sort_by_psb);

   uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);

   uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                           uint16 splids_max, bool arg_valid);


   // Extend a DMI node with a spelling id. ext_len is the length of the rows
   // to extend, actually, it is the size of the spelling string of splid.
   // return value can be 1 or 0.
   // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in
   // the pool).
   // 0 means either the dmi node can not be extended with splid, or the splid
   // is a Shengmu id, which is only used to get lpi_items, or the result node
   // in DictTrie has no son, it is not nccessary to keep the new DMI.
   //
   // This function modifies the content of lpi_items_ and lpi_total_.
   // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size.
   // The function's returned value has no relation with the value of lpi_num.
   //
   // If dmi == NULL, this function will extend the root node of DictTrie
   //
   // This function will not change dmi_nd_pool_used_. Please change it after
   // calling this function if necessary.
   //
   // The caller should guarantees that NULL != dep.
   size_t extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s);

   // Extend dmi for the composing phrase.
   size_t extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s);

   // Extend a MatrixNode with the give LmaPsbItem list.
   // res_row is the destination row number.
   // This function does not change mtrx_nd_pool_used_. Please change it after
   // calling this function if necessary.
   // return 0 always.
   size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
                         size_t lpi_num, PoolPosType dmi_fr, size_t res_row);


   // Try to find a dmi node at step_to position, and the found dmi node should
   // match the given spelling id strings.
   PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num);

   bool add_char(char ch);
   bool prepare_add_char(char ch);

   // Called after prepare_add_char, so the input char has been saved.
   bool add_char_qwerty();

   // Prepare candidates from the last fixed hanzi position.
   void prepare_candidates();

   // Is the character in step pos a splitter character?
   // The caller guarantees that the position is valid.
   bool is_split_at(uint16 pos);

   void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
                 PoolPosType dmi_fr,
                 uint16 spl_id, uint16 node_num, unsigned char dict_level,
                 bool splid_end_split, unsigned char splstr_len,
                 unsigned char all_full_id);

   size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num,
                        char16 predict_buf[][kMaxPredictSize + 1],
                        size_t buf_len);

   // Add the first candidate to the user dictionary.
   bool try_add_cand0_to_userdict();

   // Add a user lemma to the user dictionary. This lemma is a subset of
   // candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the
   // number of lemmas to be combined together as a new lemma. The caller
   // gurantees that the combined new lemma's length is less or equal to
   // kMaxLemmaSize.
   bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score);

   // Update dictionary frequencies.
   void update_dict_freq();

   void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level);

  public:
   MatrixSearch();
   ~MatrixSearch();

   bool init(const char *fn_sys_dict, const char *fn_usr_dict);

   bool init_fd(int sys_fd, long start_offset, long length,
                const char *fn_usr_dict);

   void set_max_lens(size_t max_sps_len, size_t max_hzs_len);

   void close();

   void flush_cache();

   void set_xi_an_switch(bool xi_an_enabled);

   bool get_xi_an_switch();

   // Reset the search space. Equivalent to reset_search(0).
   // If inited, always return true;
   bool reset_search();

   // Search a Pinyin string.
   // Return value is the position successfully parsed.
   size_t search(const char *py, size_t py_len);

   // Used to delete something in the Pinyin string kept by the engine, and do
   // a re-search.
   // Return value is the new length of Pinyin string kept by the engine which
   // is parsed successfully.
   // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin
   // character needs to be deleted. If is_pos_in_splid is true, all Pinyin
   // characters for pos-th spelling id needs to be deleted.
   // If the deleted character(s) is just after a fixed lemma or sub lemma in
   // composing phrase, clear_fixed_this_step indicates whether we needs to
   // unlock the last fixed lemma or sub lemma.
   // If is_pos_in_splid is false, and pos-th character is in the range for the
   // fixed lemmas or composing string, this function will do nothing and just
   // return the result of the previous search.
   size_t delsearch(size_t pos, bool is_pos_in_splid,
                    bool clear_fixed_this_step);

   // Get the number of candiates, called after search().
   size_t get_candidate_num();

   // Get the Pinyin string stored by the engine.
   // *decoded_len returns the length of the successfully decoded string.
   const char* get_pystr(size_t *decoded_len);

   // Get the spelling boundaries for the first sentence candidate.
   // Number of spellings will be returned. The number of valid elements in
   // spl_start is one more than the return value because the last one is used
   // to indicate the beginning of the next un-input speling.
   // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] .
   size_t get_spl_start(const uint16 *&spl_start);

   // Get one candiate string. If full sentence candidate is available, it will
   // be the first one.
   char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len);

   // Get the first candiate, which is a "full sentence".
   // retstr_len is not NULL, it will be used to return the string length.
   // If only_unfixed is true, only unfixed part will be fetched.
   char16* get_candidate0(char16* cand_str, size_t max_len,
                          uint16 *retstr_len, bool only_unfixed);

   // Choose a candidate. The decoder will do a search after the fixed position.
   size_t choose(size_t cand_id);

   // Cancel the last choosing operation, and return the new number of choices.
   size_t cancel_last_choice();

   // Get the length of fixed Hanzis.
   size_t get_fixedlen();

   size_t get_predicts(const char16 fixed_buf[],
                       char16 predict_buf[][kMaxPredictSize + 1],
                       size_t buf_len);
 };
 }

 #endif  // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
	/*
	* Copyright (C) 2009 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
	#define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__

	#include <stdlib.h>
	#include "./atomdictbase.h"
	#include "./dicttrie.h"
	#include "./searchutility.h"
	#include "./spellingtrie.h"
	#include "./splparser.h"

	namespace ime_pinyin {

	static const size_t kMaxRowNum = kMaxSearchSteps;

	typedef struct {
	// MileStoneHandle objects for the system and user dictionaries.
	MileStoneHandle dict_handles[2];
	// From which DMI node. -1 means it's from root.
	PoolPosType dmi_fr;
	// The spelling id for the Pinyin string from the previous DMI to this node.
	// If it is a half id like Shengmu, the node pointed by dict_node is the first
	// node with this Shengmu,
	uint16 spl_id;
	// What's the level of the dict node. Level of root is 0, but root is never
	// recorded by dict_node.
	unsigned char dict_level:7;
	// If this node is for composing phrase, this bit is 1.
	unsigned char c_phrase:1;
	// Whether the spl_id is parsed with a split character at the end.
	unsigned char splid_end_split:1;
	// What's the length of the spelling string for this match, for the whole
	// word.
	unsigned char splstr_len:7;
	// Used to indicate whether all spelling ids from the root are full spelling
	// ids. This information is useful for keymapping mode(not finished). Because
	// in this mode, there is no clear boundaries, we prefer those results which
	// have full spelling ids.
	unsigned char all_full_id:1;
	} DictMatchInfo, *PDictMatchInfo;

	typedef struct MatrixNode {
	LemmaIdType id;
	float score;
	MatrixNode *from;
	// From which DMI node. Used to trace the spelling segmentation.
	PoolPosType dmi_fr;
	uint16 step;
	} MatrixNode, *PMatrixNode;

	typedef struct {
	// The MatrixNode position in the matrix pool
	PoolPosType mtrx_nd_pos;
	// The DictMatchInfo position in the DictMatchInfo pool.
	PoolPosType dmi_pos;
	uint16 mtrx_nd_num;
	uint16 dmi_num:15;
	// Used to indicate whether there are dmi nodes in this step with full
	// spelling id. This information is used to decide whether a substring of a
	// valid Pinyin should be extended.
	//
	// Example1: shoudao
	// When the last char 'o' is added, the parser will find "dao" is a valid
	// Pinyin, and because all dmi nodes at location 'd' (including those for
	// "shoud", and those for "d") have Shengmu id only, so it is not necessary
	// to extend "ao", otherwise the result may be "shoud ao", that is not
	// reasonable.
	//
	// Example2: hengao
	// When the last 'o' is added, the parser finds "gao" is a valid Pinyin.
	// Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi
	// nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus
	// "heng ao" can also be the result.
	//
	// Similarly, "ganga" is expanded to "gang a".
	//
	// For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi
	// nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it
	// is not valid either). If the parser uses break in the loop, the result
	// always be "xian"; but if the parser uses continue in the loop, "xi an" will
	// also be tried. This behaviour can be set via the function
	// set_xi_an_switch().
	uint16 dmi_has_full_id:1;
	// Points to a MatrixNode of the current step to indicate which choice the
	// user selects.
	MatrixNode *mtrx_nd_fixed;
	} MatrixRow, *PMatrixRow;

	// When user inputs and selects candidates, the fixed lemma ids are stored in
	// lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many
	// lemmas from the beginning are fixed. If user deletes Pinyin characters one
	// by one from the end, these fixed lemmas can be unlocked one by one when
	// necessary. Whenever user deletes a Chinese character and its spelling string
	// in these fixed lemmas, all fixed lemmas will be merged together into a unit
	// named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing
	// phrase will be the first lemma in the sentence. Because it contains some
	// modified lemmas (by deleting a character), these merged lemmas are called
	// sub lemmas (sublma), and each of them are represented individually, so that
	// when user deletes Pinyin characters from the end, these sub lemmas can also
	// be unlocked one by one.
	typedef struct {
	uint16 spl_ids[kMaxRowNum];
	uint16 spl_start[kMaxRowNum];
	char16 chn_str[kMaxRowNum]; // Chinese string.
	uint16 sublma_start[kMaxRowNum]; // Counted in Chinese characters.
	size_t sublma_num;
	uint16 length; // Counted in Chinese characters.
	} ComposingPhrase, *TComposingPhrase;

	class MatrixSearch {
	private:
	// If it is true, prediction list by string whose length is greater than 1
	// will be limited to a reasonable number.
	static const bool kPredictLimitGt1 = false;

	// If it is true, the engine will prefer long history based prediction,
	// for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are
	// based on the two-character history.
	static const bool kPreferLongHistoryPredict = true;

	// If it is true, prediction will only be based on user dictionary. this flag
	// is for debug purpose.
	static const bool kOnlyUserDictPredict = false;

	// The maximum buffer to store LmaPsbItems.
	static const size_t kMaxLmaPsbItems = 1450;

	// How many rows for each step.
	static const size_t kMaxNodeARow = 5;

	// The maximum length of the sentence candidates counted in chinese
	// characters
	static const size_t kMaxSentenceLength = 16;

	// The size of the matrix node pool.
	static const size_t kMtrxNdPoolSize = 200;

	// The size of the DMI node pool.
	static const size_t kDmiPoolSize = 800;

	// Used to indicate whether this object has been initialized.
	bool inited_;

	// Spelling trie.
	const SpellingTrie *spl_trie_;

	// Used to indicate this switcher status: when "xian" is parseed, should
	// "xi an" also be extended. Default is false.
	// These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string
	// should be valid for a FULL spelling, or a combination of two spellings,
	// first of which is a FULL id too. So even it is true, "da" will never be
	// split into "d a", because "d" is not a full spelling id.
	bool xi_an_enabled_;

	// System dictionary.
	DictTrie* dict_trie_;

	// User dictionary.
	AtomDictBase* user_dict_;

	// Spelling parser.
	SpellingParser* spl_parser_;

	// The maximum allowed length of spelling string (such as a Pinyin string).
	size_t max_sps_len_;

	// The maximum allowed length of a result Chinese string.
	size_t max_hzs_len_;

	// Pinyin string. Max length: kMaxRowNum - 1
	char pys_[kMaxRowNum];

	// The length of the string that has been decoded successfully.
	size_t pys_decoded_len_;

	// Shared buffer for multiple purposes.
	size_t *share_buf_;

	MatrixNode *mtrx_nd_pool_;
	PoolPosType mtrx_nd_pool_used_; // How many nodes used in the pool
	DictMatchInfo *dmi_pool_;
	PoolPosType dmi_pool_used_; // How many items used in the pool

	MatrixRow *matrix_; // The first row is for starting

	DictExtPara *dep_; // Parameter used to extend DMI nodes.

	NPredictItem *npre_items_; // Used to do prediction
	size_t npre_items_len_;

	// The starting positions and lemma ids for the full sentence candidate.
	size_t lma_id_num_;
	uint16 lma_start_[kMaxRowNum]; // Counted in spelling ids.
	LemmaIdType lma_id_[kMaxRowNum];
	size_t fixed_lmas_;

	// If fixed_lmas_ is bigger than i, Element i is used to indicate whether
	// the i'th lemma id in lma_id_ is the first candidate for that step.
	// If all candidates are the first one for that step, the whole string can be
	// decoded by the engine automatically, so no need to add it to user
	// dictionary. (We are considering to add it to user dictionary in the
	// future).
	uint8 fixed_lmas_no1_[kMaxRowNum];

	// Composing phrase
	ComposingPhrase c_phrase_;

	// If dmi_c_phrase_ is true, the decoder will try to match the
	// composing phrase (And definitely it will match successfully). If it
	// is false, the decoder will try to match lemmas items in dictionaries.
	bool dmi_c_phrase_;

	// The starting positions and spelling ids for the first full sentence
	// candidate.
	size_t spl_id_num_; // Number of splling ids
	uint16 spl_start_[kMaxRowNum]; // Starting positions
	uint16 spl_id_[kMaxRowNum]; // Spelling ids
	// Used to remember the last fixed position, counted in Hanzi.
	size_t fixed_hzs_;

	// Lemma Items with possibility score, two purposes:
	// 1. In Viterbi decoding, this buffer is used to get all possible candidates
	// for current step;
	// 2. When the search is done, this buffer is used to get candiates from the
	// first un-fixed step and show them to the user.
	LmaPsbItem lpi_items_[kMaxLmaPsbItems];
	size_t lpi_total_;

	// Assign the pointers with NULL. The caller makes sure that all pointers are
	// not valid before calling it. This function only will be called in the
	// construction function and free_resource().
	void reset_pointers_to_null();

	bool alloc_resource();

	void free_resource();

	// Reset the search space totally.
	bool reset_search0();

	// Reset the search space from ch_pos step. For example, if the original
	// input Pinyin is "an", reset_search(1) will reset the search space to the
	// result of "a". If the given position is out of range, return false.
	// if clear_fixed_this_step is true, and the ch_pos step is a fixed step,
	// clear its fixed status. if clear_dmi_his_step is true, clear the DMI nodes.
	// If clear_mtrx_this_sTep is true, clear the mtrx nodes of this step.
	// The DMI nodes will be kept.
	//
	// Note: this function should not destroy content of pys_.
	bool reset_search(size_t ch_pos, bool clear_fixed_this_step,
	bool clear_dmi_this_step, bool clear_mtrx_this_step);

	// Delete a part of the content in pys_.
	void del_in_pys(size_t start, size_t len);

	// Delete a spelling id and its corresponding Chinese character, and merge
	// the fixed lemmas into the composing phrase.
	// del_spl_pos indicates which spelling id needs to be delete.
	// This function will update the lemma and spelling segmentation information.
	// The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within
	// the fixed lemmas.
	void merge_fixed_lmas(size_t del_spl_pos);

	// Get spelling start posistions and ids. The result will be stored in
	// spl_id_num_, spl_start_[], spl_id_[].
	// fixed_hzs_ will be also assigned.
	void get_spl_start_id();

	// Get all lemma ids with match the given spelling id stream(shorter than the
	// maximum length of a word).
	// If pfullsent is not NULL, means the full sentence candidate may be the
	// same with the coming lemma string, if so, remove that lemma.
	// The result is sorted in descendant order by the frequency score.
	size_t get_lpis(const uint16* splid_str, size_t splid_str_len,
	LmaPsbItem* lma_buf, size_t max_lma_buf,
	const char16 *pfullsent, bool sort_by_psb);

	uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);

	uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
	uint16 splids_max, bool arg_valid);


	// Extend a DMI node with a spelling id. ext_len is the length of the rows
	// to extend, actually, it is the size of the spelling string of splid.
	// return value can be 1 or 0.
	// 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in
	// the pool).
	// 0 means either the dmi node can not be extended with splid, or the splid
	// is a Shengmu id, which is only used to get lpi_items, or the result node
	// in DictTrie has no son, it is not nccessary to keep the new DMI.
	//
	// This function modifies the content of lpi_items_ and lpi_total_.
	// lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size.
	// The function's returned value has no relation with the value of lpi_num.
	//
	// If dmi == NULL, this function will extend the root node of DictTrie
	//
	// This function will not change dmi_nd_pool_used_. Please change it after
	// calling this function if necessary.
	//
	// The caller should guarantees that NULL != dep.
	size_t extend_dmi(DictExtPara dep, DictMatchInfo dmi_s);

	// Extend dmi for the composing phrase.
	size_t extend_dmi_c(DictExtPara dep, DictMatchInfo dmi_s);

	// Extend a MatrixNode with the give LmaPsbItem list.
	// res_row is the destination row number.
	// This function does not change mtrx_nd_pool_used_. Please change it after
	// calling this function if necessary.
	// return 0 always.
	size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
	size_t lpi_num, PoolPosType dmi_fr, size_t res_row);


	// Try to find a dmi node at step_to position, and the found dmi node should
	// match the given spelling id strings.
	PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num);

	bool add_char(char ch);
	bool prepare_add_char(char ch);

	// Called after prepare_add_char, so the input char has been saved.
	bool add_char_qwerty();

	// Prepare candidates from the last fixed hanzi position.
	void prepare_candidates();

	// Is the character in step pos a splitter character?
	// The caller guarantees that the position is valid.
	bool is_split_at(uint16 pos);

	void fill_dmi(DictMatchInfo dmi, MileStoneHandle handles,
	PoolPosType dmi_fr,
	uint16 spl_id, uint16 node_num, unsigned char dict_level,
	bool splid_end_split, unsigned char splstr_len,
	unsigned char all_full_id);

	size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num,
	char16 predict_buf[][kMaxPredictSize + 1],
	size_t buf_len);

	// Add the first candidate to the user dictionary.
	bool try_add_cand0_to_userdict();

	// Add a user lemma to the user dictionary. This lemma is a subset of
	// candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the
	// number of lemmas to be combined together as a new lemma. The caller
	// gurantees that the combined new lemma's length is less or equal to
	// kMaxLemmaSize.
	bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score);

	// Update dictionary frequencies.
	void update_dict_freq();

	void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level);

	public:
	MatrixSearch();
	~MatrixSearch();

	bool init(const char fn_sys_dict, const char fn_usr_dict);

	bool init_fd(int sys_fd, long start_offset, long length,
	const char *fn_usr_dict);

	void set_max_lens(size_t max_sps_len, size_t max_hzs_len);

	void close();

	void flush_cache();

	void set_xi_an_switch(bool xi_an_enabled);

	bool get_xi_an_switch();

	// Reset the search space. Equivalent to reset_search(0).
	// If inited, always return true;
	bool reset_search();

	// Search a Pinyin string.
	// Return value is the position successfully parsed.
	size_t search(const char *py, size_t py_len);

	// Used to delete something in the Pinyin string kept by the engine, and do
	// a re-search.
	// Return value is the new length of Pinyin string kept by the engine which
	// is parsed successfully.
	// If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin
	// character needs to be deleted. If is_pos_in_splid is true, all Pinyin
	// characters for pos-th spelling id needs to be deleted.
	// If the deleted character(s) is just after a fixed lemma or sub lemma in
	// composing phrase, clear_fixed_this_step indicates whether we needs to
	// unlock the last fixed lemma or sub lemma.
	// If is_pos_in_splid is false, and pos-th character is in the range for the
	// fixed lemmas or composing string, this function will do nothing and just
	// return the result of the previous search.
	size_t delsearch(size_t pos, bool is_pos_in_splid,
	bool clear_fixed_this_step);

	// Get the number of candiates, called after search().
	size_t get_candidate_num();

	// Get the Pinyin string stored by the engine.
	// *decoded_len returns the length of the successfully decoded string.
	const char* get_pystr(size_t *decoded_len);

	// Get the spelling boundaries for the first sentence candidate.
	// Number of spellings will be returned. The number of valid elements in
	// spl_start is one more than the return value because the last one is used
	// to indicate the beginning of the next un-input speling.
	// For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] .
	size_t get_spl_start(const uint16 *&spl_start);

	// Get one candiate string. If full sentence candidate is available, it will
	// be the first one.
	char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len);

	// Get the first candiate, which is a "full sentence".
	// retstr_len is not NULL, it will be used to return the string length.
	// If only_unfixed is true, only unfixed part will be fetched.
	char16* get_candidate0(char16* cand_str, size_t max_len,
	uint16 *retstr_len, bool only_unfixed);

	// Choose a candidate. The decoder will do a search after the fixed position.
	size_t choose(size_t cand_id);

	// Cancel the last choosing operation, and return the new number of choices.
	size_t cancel_last_choice();

	// Get the length of fixed Hanzis.
	size_t get_fixedlen();

	size_t get_predicts(const char16 fixed_buf[],
	char16 predict_buf[][kMaxPredictSize + 1],
	size_t buf_len);
	};
	}

	#endif // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__