dict/dict.h - platform/external/tesseract - Git at Google

 ///////////////////////////////////////////////////////////////////////
 // File:        dict.h
 // Description: dict class.
 // Author:      Samuel Charron
 //
 // (C) Copyright 2006, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 ///////////////////////////////////////////////////////////////////////

 #ifndef TESSERACT_DICT_DICT_H_
 #define TESSERACT_DICT_DICT_H_

 #include "ambigs.h"
 #include "choices.h"
 #include "choicearr.h"
 #include "dawg.h"
 #include "image.h"
 #include "ratngs.h"
 #include "stopper.h"
 #include "trie.h"
 #include "unicharset.h"

 extern STRING_VAR_H(global_user_words_suffix, "user-words",
                     "A list of user-provided words.");
 extern INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");

 #define MAX_WERD_LENGTH        (inT64) 40
 #define NO_RATING               -1
 #define FREQ_WERD               1.0
 #define GOOD_WERD               1.1
 #define OK_WERD                 1.3125

 // Struct used to hold temporary information about fragments.
 struct CHAR_FRAGMENT_INFO {
   UNICHAR_ID unichar_id;
   const CHAR_FRAGMENT *fragment;
   int num_fragments;
   float rating;
   float certainty;
 };

 namespace tesseract {

 typedef GenericVector<Dawg *> DawgVector;

 struct DawgArgs {
   DawgArgs(DawgInfoVector *d, DawgInfoVector *c,
            DawgInfoVector *ud, DawgInfoVector *uc, float r) :
     active_dawgs(d), constraints(c), updated_active_dawgs(ud),
     updated_constraints(uc), rating_margin(r) {
     for (int i = 0; i < MAX_WERD_LENGTH; ++i) {
       rating_array[i] = NO_RATING;
     }
     permuter = NO_PERM;
   }
   DawgInfoVector *active_dawgs;
   DawgInfoVector *constraints;
   DawgInfoVector *updated_active_dawgs;
   DawgInfoVector *updated_constraints;
   PermuterType permuter;
   float rating_margin;  // prunning margin ratio
   float rating_array[MAX_WERD_LENGTH];
 };

 class Dict {
  public:
   Dict(Image* image_ptr);
   ~Dict();
   Image* getImage() {
     return image_ptr_;
   }
   UNICHARSET& getUnicharset() {
     return getImage()->getCCUtil()->unicharset;
   }
   const UnicharAmbigs &getUnicharAmbigs() {
     return getImage()->getCCUtil()->unichar_ambigs;
   }

   /* hyphen.cpp ************************************************************/

   // Returns true if we've recorded the beginning of a hyphenated word.
   inline bool hyphenated() { return !last_word_on_line_ && hyphen_word_; }
   // Size of the base word (the part on the line before) of a hyphenated word.
   inline int hyphen_base_size() {
     return this->hyphenated() ? hyphen_word_->length() : 0;
   }
   // If this word is hyphenated copy the base word (the part on
   // the line before) of a hyphenated word into the given word.
   // This function assumes that word is not NULL.
   inline void copy_hyphen_info(WERD_CHOICE *word) {
     if (this->hyphenated()) {
       *word = *hyphen_word_;
       if (hyphen_debug_level) word->print("copy_hyphen_info: ");
     }
   }
   // Erase the unichar ids corresponding to the portion of the word
   // from the previous line. The word is not changed if it is not
   // split between lines and hyphenated.
   inline void remove_hyphen_head(WERD_CHOICE *word) {
     if (this->hyphenated()) {
       word->remove_unichar_ids(0, hyphen_word_->length());
       if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
     }
   }
   // Check whether the word has a hyphen at the end.
   inline bool has_hyphen_end(const WERD_CHOICE &word) {
     int word_index = word.length() - 1;
     return (last_word_on_line_ && word_index > 0 &&
             word.unichar_id(word_index) == hyphen_unichar_id_);
   }
   // Unless the previous word was the last one on the line, and the current
   // one is not (thus it is the first one on the line), erase hyphen_word_,
   // clear hyphen_active_dawgs_, hyphen_constraints_ update last_word_on_line_.
   void reset_hyphen_vars(bool last_word_on_line);
   // Update hyphen_word_, and copy the given DawgInfoVectors into
   // hyphen_active_dawgs_ and hyphen_constraints_.
   void set_hyphen_word(const WERD_CHOICE &word,
                        const DawgInfoVector &active_dawgs,
                        const DawgInfoVector &constraints);

   /* permdawg.cpp ************************************************************/
   // If new_rating < best_choice->rating(), copy word int best_choice
   // and update rating and permuter of best_choice to the new given values.
   inline void update_best_choice(
       const WERD_CHOICE &word, WERD_CHOICE *best_choice) {
     if (word.rating() < best_choice->rating()) {
       *best_choice = word;
     }
   }
   // Fill the given active_dawgs vector with dawgs that could contain the
   // beginning of the word. If hyphenated() returns true, copy the entries
   // from hyphen_active_dawgs_ instead.
   void init_active_dawgs(DawgInfoVector *active_dawgs);
   // If hyphenated() returns true, copy the entries from hyphen_constraints_
   // into the given constraints vector.
   void init_constraints(DawgInfoVector *constraints);
   // Recursively explore all the possible character combinations in
   // the given char_choices. Use go_deeper_dawg_fxn() to explore all the
   // dawgs in the dawgs_ vector in parallel and discard invalid words.
   //
   // Allocate and return a WERD_CHOICE with the best valid word found.
   WERD_CHOICE *dawg_permute_and_select(
       const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
   void adjust_word(WERD_CHOICE *best_choice,
                    float *certainty_array);
   // If the choice being composed so far could be a dictionary word
   // and we have not reached the end of the word keep exploring the
   // char_choices further.
   // Also:
   // -- set hyphen word if needed
   // -- if word_ending is true and word is better than best_choice
   //    copy word to best_choice log new word choice
   void go_deeper_dawg_fxn(
       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       bool word_ending, WERD_CHOICE *word, float certainties[],
       float *limit, WERD_CHOICE *best_choice, void *void_more_args);

   /* permute.cpp *************************************************************/
   void add_document_word(const WERD_CHOICE &best_choice);
   void init_permute();
   WERD_CHOICE *permute_top_choice(
     const BLOB_CHOICE_LIST_VECTOR &char_choices,
     float* rating_limit,
     WERD_CHOICE *raw_choice,
     BOOL8 *any_alpha);
   const char* choose_il1(const char *first_char,       //first choice
                          const char *second_char,      //second choice
                          const char *third_char,       //third choice
                          const char *prev_char,        //prev in word
                          const char *next_char,        //next in word
                          const char *next_next_char);  //after next next in word
   int valid_word(const WERD_CHOICE &word) {
     return valid_word(word, false);  // return NO_PERM for words with digits
   }
   int valid_word_or_number(const WERD_CHOICE &word) {
     return valid_word(word, true);  // return NUMBER_PERM for valid numbers
   }
   int valid_word(const WERD_CHOICE &word, bool numbers_ok);
   bool valid_punctuation(const WERD_CHOICE &word);
   WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                            float rating_limit,
                            WERD_CHOICE *raw_choice);
   void end_permute();
   void adjust_non_word(WERD_CHOICE *word, float *adjust_factor);
   void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                        float rating_limit,
                        int start,
                        int end,
                        WERD_CHOICE *current_word);
   void permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                           float limit,
                           WERD_CHOICE *best_choice,
                           WERD_CHOICE *raw_choice);
   WERD_CHOICE *permute_compound_words(
       const BLOB_CHOICE_LIST_VECTOR &char_choices,
       float rating_limit);
   // checks if the dominant word script, if there is one, is same as target.
   bool word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                       int target_script_id);
   // Incoporate segmentation cost into word rating
   void incorporate_segcost(WERD_CHOICE* word);
   // checks for script-consistent permutations
   WERD_CHOICE *permute_script_words(
       const BLOB_CHOICE_LIST_VECTOR &char_choices);

   WERD_CHOICE *top_fragments_permute_and_select(
       const BLOB_CHOICE_LIST_VECTOR &char_choices,
       float rating_limit);
   // If the choice being composed so far could be better
   // than best_choice keep exploring char_choices.
   // If we have reached the end of the word and word is better than
   // best_choice, copy word to best_choice and log a new word choice.
   void go_deeper_top_fragments_fxn(
       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       bool word_ending, WERD_CHOICE *word, float certainties[],
       float *limit, WERD_CHOICE *best_choice, void *more_args);

   // Semi-generic functions used by multiple permuters.
   bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
                            float curr_rating, float curr_certainty,
                            const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                            const char *debug, int word_ending,
                            CHAR_FRAGMENT_INFO *char_frag_info);
   void permute_choices(
       const char *debug,
       const BLOB_CHOICE_LIST_VECTOR &char_choices,
       int char_choice_index,
       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       WERD_CHOICE *word,
       float certainties[],
       float *limit,
       WERD_CHOICE *best_choice,
       void *more_args);

   void append_choices(
       const char *debug,
       const BLOB_CHOICE_LIST_VECTOR &char_choices,
       const BLOB_CHOICE &blob_choice,
       int char_choice_index,
       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       WERD_CHOICE *word,
       float certainties[],
       float *limit,
       WERD_CHOICE *best_choice,
       void *more_args);
   // Pointer to go_deeper function that will be modified by various permuters.
   void (Dict::*go_deeper_fxn_)(const char *debug,
                                const BLOB_CHOICE_LIST_VECTOR &char_choices,
                              int char_choice_index,
                              const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                                bool word_ending, WERD_CHOICE *word,
                                float certainties[], float *limit,
                                WERD_CHOICE *best_choice, void *void_more_args);
   /* stopper.cpp *************************************************************/
   int NoDangerousAmbig(WERD_CHOICE *BestChoice,
                        DANGERR *fixpt,
                        bool fix_replaceable,
                        BLOB_CHOICE_LIST_VECTOR *Choices,
                        bool *modified_blobs);
   void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
                     UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
                     BLOB_CHOICE_LIST_VECTOR *blob_choices,
                     bool *modified_blobs);

   inline void DisableChoiceAccum() { keep_word_choices_ = FALSE; }
   inline void EnableChoiceAccum() { keep_word_choices_ = TRUE; }

   int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
   VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice,
                                 FLOAT32 AdjustFactor,
                                 const float Certainties[]);
   void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
   int StringSameAs(const char *String,
                    const char *String_lengths,
                    VIABLE_CHOICE ViableChoice);
   bool StringSameAs(const WERD_CHOICE &WordChoice,
                     VIABLE_CHOICE ViableChoice);
   int AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
                        WERD_CHOICE *BestChoice,
                        const WERD_CHOICE &RawChoice,
                        DANGERR *fixpt,
                        ACCEPTABLE_CHOICE_CALLER caller,
                        bool *modified_blobs);
   int AcceptableResult(const WERD_CHOICE &BestChoice,
                        const WERD_CHOICE &RawChoice);
   int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice);
   void LogNewChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor,
                     const float Certainties[], bool raw_choice);
   void EndDangerousAmbigs();
   int CurrentBestChoiceIs(const WERD_CHOICE &WordChoice);
   FLOAT32 CurrentBestChoiceAdjustFactor();
   int CurrentWordAmbig();
   void DebugWordChoices();
   void PrintAmbigAlternatives(FILE *file, const char *label,
                               int label_num_unichars);
   void FillViableChoice(const WERD_CHOICE &WordChoice,
                         FLOAT32 AdjustFactor, const float Certainties[],
                         bool SameString, VIABLE_CHOICE ViableChoice);
   int AlternativeChoicesWorseThan(FLOAT32 Threshold);
   void FilterWordChoices();
   void FindClassifierErrors(FLOAT32 MinRating,
                           FLOAT32 MaxRating,
                           FLOAT32 RatingMargin,
                           FLOAT32 Thresholds[]);
   void InitChoiceAccum();
   void LogNewSegmentation(PIECES_STATE BlobWidth);
   void LogNewSplit(int Blob);
   void SettupStopperPass1();
   void SettupStopperPass2();
   /* choices.cpp *************************************************************/
   void print_word_string(const char* str);
   void print_word_choice(const char *label, A_CHOICE* choice);
   void print_choices(const char *label,
                      CHOICES rating);   // List of (A_CHOICE*).
   /* permngram.cpp ***********************************************************/
   A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
                                      float rating_limit,
                                      const Dawg *dawg);
   /* dawg.cpp ****************************************************************/

   // Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
   // of the current state the letter at word_index in the given word
   // is allowed according to at least one of the dawgs in dawgs_,
   // otherwise returns NO_PERM.
   //
   // The state is described by void_dawg_args, which are interpreted as
   // DawgArgs and contain two relevant input vectors: active_dawgs and
   // constraints. Each entry in the active_dawgs vector contains an index
   // into the dawgs_ vector and an EDGE_REF that indicates the last edge
   // followed in the dawg. Each entry in the constraints vector contains
   // an index into the dawgs_ vector and an EDGE_REF that indicates an edge
   // in a pattern dawg followed to match a pattern. Currently constraints
   // are used to save the state of punctuation dawgs after leading
   // punctuation was found.
   //
   // Input:
   // At word_index 0 dawg_args->active_dawgs should contain an entry for each
   // dawg whose type has a bit set in kBeginningDawgsType,
   // dawg_args->constraints should be empty. EDGE_REFs in active_dawgs and
   // constraints vectors should be initialized to NO_EDGE. If hyphen state
   // needs to be applied, initial dawg_args->active_dawgs and
   // dawg_args->constrains can be copied from the saved hyphen state
   // (maintained by Dict).
   // For word_index > 0 the corresponding state (active_dawgs and constraints)
   // can be obtained from dawg_args->updated_* passed to def_letter_is_okay
   // for word_index-1.
   // Note: the function assumes that active_dags, constraints and updated_*
   // member variables of dawg_args are not NULL.
   //
   // Output:
   // The function fills in dawg_args->updated_active_dawgs vector with the
   // entries for dawgs that contain the word up to the letter at word_index.
   // The new constraints (if any) are added to dawg_args->updated_constraints,
   // the constraints from dawg_args->constraints are also copied into it.
   //
   // Detailed description:
   // In order to determine whether the word is still valid after considering
   // all the letters up to the one at word_index the following is done for
   // each entry in dawg_args->active_dawgs:
   //
   // -- next starting node is obtained from entry.ref and edge_char_of() is
   //    called to obtain the next edge
   // -- if a valid edge is found, the function returns the updated permuter
   //    code true and an entry [entry.dawg_index, edge] is inserted in
   //    dawg_args->updated_active_dawgs
   //    otherwise:
   //    -- if we are dealing with dawg of type DAWG_TYPE_PUNCTUATION,
   //       edge_char_of() is called again, but now with kPatternUnicharID
   //       as unichar_id; if a valid edge is found it is recorded in
   //       dawg_args->updated_constraints
   //    -- the function checks whether the word can end with the previous
   //       letter
   //    -- each successor of the dawg (e.g. dawgs with type DAWG_TYPE_WORD
   //       could be successors to dawgs with type DAWG_TYPE_PUNCTUATION; the
   //       successors are defined by successors_ vector) is explored and if
   //       a letter is found in the successor dawg, a new entry is inserted
   //       into dawg_args->updated_active_dawgs with EDGE_REF being either
   //       NO_EDGE or an EDGE_REF recorded in constraints vector for the
   //       corresponding dawg index

   //
   int def_letter_is_okay(void* void_dawg_args, int word_index,
                          const void* word, bool word_end);

   int new_letter_is_okay(void* void_dawg_args, int word_index,
                          const void* word, bool word_end);
   int (Dict::*letter_is_okay_)(void* void_dawg_args, int word_index,
                                const void *word, bool word_end);
   // Return the number of dawgs in the dawgs_ vector.
   inline const int NumDawgs() const { return dawgs_.size(); }
   // Return i-th dawg pointer recorded in the dawgs_ vector.
   inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
   // At word ending make sure all the recorded constraints are satisfied.
   // Each constraint signifies that we found a beginning pattern in a
   // pattern dawg. Check that this pattern can end here (e.g. if some
   // leading punctuation is found this would ensure that we are not
   // expecting any particular trailing punctuation after the word).
   inline bool ConstraintsOk(const DawgInfoVector &constraints,
                             int word_end, DawgType current_dawg_type) {
     if (!word_end) return true;
     if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
     for (int c = 0; c < constraints.length(); ++c) {
       const DawgInfo &cinfo = constraints[c];
       Dawg *cdawg = dawgs_[cinfo.dawg_index];
       if (!cdawg->end_of_word(cinfo.ref)) {
         if (dawg_debug_level >= 3) {
           tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n",
                   cinfo.dawg_index, cinfo.ref);
         }
         return false;
       }
     }
     return true;
   }
   // Record the maximum of the two permuters in permuter.
   static inline void UpdatePermuter(PermuterType new_permuter,
                                     PermuterType *permuter) {
     if (dawg_debug_level >= 3) tprintf("Letter found\n");
     if (new_permuter > *permuter) *permuter = new_permuter;
   }

   /* conversion.cpp **********************************************************/
   // TODO(daria): remove these function when conversion.cpp is deprecated
   // and all the code is converted to work with unichar ids.
   void LogNewWordChoice(A_CHOICE *a_choice,
                         FLOAT32 adjust_factor,
                         const float certainties[],
                         const UNICHARSET &unicharset);
   int valid_word(const char *string);

  private:
   // Private member variables.
   Image* image_ptr_;
   // Table that stores ambiguities computed during training
   // (loaded when NoDangerousAmbigs() is called for the first time).
   // Each entry i in the table stores a set of amibiguities whose
   // wrong ngram starts with unichar id i.
   UnicharAmbigs *dang_ambigs_table_;
   // Same as above, but for ambiguities with replace flag set.
   UnicharAmbigs *replace_ambigs_table_;
   // Flag used to disable accumulation of word choices
   // during compound word permutation.
   BOOL8 keep_word_choices_;
   // Additional certainty padding allowed before a word is rejected.
   FLOAT32 reject_offset_;
   // Current word segmentation.
   PIECES_STATE current_segmentation_;
   // Variables to keep track of best/raw word choices.
   VIABLE_CHOICE best_raw_choice_;
   LIST raw_choices_;
   LIST best_choices_;
   // Hyphen-related variables.
   UNICHAR_ID hyphen_unichar_id_;
   WERD_CHOICE *hyphen_word_;
   DawgInfoVector hyphen_active_dawgs_;
   DawgInfoVector hyphen_constraints_;
   bool last_word_on_line_;
   // Dawgs.
   DawgVector dawgs_;
   SuccessorListsVector successors_;
   Dawg *freq_dawg_;
   Trie *pending_words_;
   // The following pointers are only cached for convenience.
   // The dawgs will be deleted when dawgs_ vector is destroyed.
   // TODO(daria): need to support multiple languages in the future,
   // so maybe will need to maintain a list of dawgs of each kind.
   Trie *document_words_;
 };
 }  // namespace tesseract

 #endif  // THIRD_PARTY_TESSERACT_DICT_DICT_H_
	///////////////////////////////////////////////////////////////////////
	// File: dict.h
	// Description: dict class.
	// Author: Samuel Charron
	//
	// (C) Copyright 2006, Google Inc.
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	// http://www.apache.org/licenses/LICENSE-2.0
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	///////////////////////////////////////////////////////////////////////

	#ifndef TESSERACT_DICT_DICT_H_
	#define TESSERACT_DICT_DICT_H_

	#include "ambigs.h"
	#include "choices.h"
	#include "choicearr.h"
	#include "dawg.h"
	#include "image.h"
	#include "ratngs.h"
	#include "stopper.h"
	#include "trie.h"
	#include "unicharset.h"

	extern STRING_VAR_H(global_user_words_suffix, "user-words",
	"A list of user-provided words.");
	extern INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");

	#define MAX_WERD_LENGTH (inT64) 40
	#define NO_RATING -1
	#define FREQ_WERD 1.0
	#define GOOD_WERD 1.1
	#define OK_WERD 1.3125

	// Struct used to hold temporary information about fragments.
	struct CHAR_FRAGMENT_INFO {
	UNICHAR_ID unichar_id;
	const CHAR_FRAGMENT *fragment;
	int num_fragments;
	float rating;
	float certainty;
	};

	namespace tesseract {

	typedef GenericVector<Dawg *> DawgVector;

	struct DawgArgs {
	DawgArgs(DawgInfoVector d, DawgInfoVector c,
	DawgInfoVector ud, DawgInfoVector uc, float r) :
	active_dawgs(d), constraints(c), updated_active_dawgs(ud),
	updated_constraints(uc), rating_margin(r) {
	for (int i = 0; i < MAX_WERD_LENGTH; ++i) {
	rating_array[i] = NO_RATING;
	}
	permuter = NO_PERM;
	}
	DawgInfoVector *active_dawgs;
	DawgInfoVector *constraints;
	DawgInfoVector *updated_active_dawgs;
	DawgInfoVector *updated_constraints;
	PermuterType permuter;
	float rating_margin; // prunning margin ratio
	float rating_array[MAX_WERD_LENGTH];
	};

	class Dict {
	public:
	Dict(Image* image_ptr);
	~Dict();
	Image* getImage() {
	return image_ptr_;
	}
	UNICHARSET& getUnicharset() {
	return getImage()->getCCUtil()->unicharset;
	}
	const UnicharAmbigs &getUnicharAmbigs() {
	return getImage()->getCCUtil()->unichar_ambigs;
	}

	/* hyphen.cpp ************************************************************/

	// Returns true if we've recorded the beginning of a hyphenated word.
	inline bool hyphenated() { return !last_word_on_line_ && hyphen_word_; }
	// Size of the base word (the part on the line before) of a hyphenated word.
	inline int hyphen_base_size() {
	return this->hyphenated() ? hyphen_word_->length() : 0;
	}
	// If this word is hyphenated copy the base word (the part on
	// the line before) of a hyphenated word into the given word.
	// This function assumes that word is not NULL.
	inline void copy_hyphen_info(WERD_CHOICE *word) {
	if (this->hyphenated()) {
	word = hyphen_word_;
	if (hyphen_debug_level) word->print("copy_hyphen_info: ");
	}
	}
	// Erase the unichar ids corresponding to the portion of the word
	// from the previous line. The word is not changed if it is not
	// split between lines and hyphenated.
	inline void remove_hyphen_head(WERD_CHOICE *word) {
	if (this->hyphenated()) {
	word->remove_unichar_ids(0, hyphen_word_->length());
	if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
	}
	}
	// Check whether the word has a hyphen at the end.
	inline bool has_hyphen_end(const WERD_CHOICE &word) {
	int word_index = word.length() - 1;
	return (last_word_on_line_ && word_index > 0 &&
	word.unichar_id(word_index) == hyphen_unichar_id_);
	}
	// Unless the previous word was the last one on the line, and the current
	// one is not (thus it is the first one on the line), erase hyphen_word_,
	// clear hyphen_active_dawgs_, hyphen_constraints_ update last_word_on_line_.
	void reset_hyphen_vars(bool last_word_on_line);
	// Update hyphen_word_, and copy the given DawgInfoVectors into
	// hyphen_active_dawgs_ and hyphen_constraints_.
	void set_hyphen_word(const WERD_CHOICE &word,
	const DawgInfoVector &active_dawgs,
	const DawgInfoVector &constraints);

	/* permdawg.cpp ************************************************************/
	// If new_rating < best_choice->rating(), copy word int best_choice
	// and update rating and permuter of best_choice to the new given values.
	inline void update_best_choice(
	const WERD_CHOICE &word, WERD_CHOICE *best_choice) {
	if (word.rating() < best_choice->rating()) {
	*best_choice = word;
	}
	}
	// Fill the given active_dawgs vector with dawgs that could contain the
	// beginning of the word. If hyphenated() returns true, copy the entries
	// from hyphen_active_dawgs_ instead.
	void init_active_dawgs(DawgInfoVector *active_dawgs);
	// If hyphenated() returns true, copy the entries from hyphen_constraints_
	// into the given constraints vector.
	void init_constraints(DawgInfoVector *constraints);
	// Recursively explore all the possible character combinations in
	// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
	// dawgs in the dawgs_ vector in parallel and discard invalid words.
	//
	// Allocate and return a WERD_CHOICE with the best valid word found.
	WERD_CHOICE *dawg_permute_and_select(
	const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
	void adjust_word(WERD_CHOICE *best_choice,
	float *certainty_array);
	// If the choice being composed so far could be a dictionary word
	// and we have not reached the end of the word keep exploring the
	// char_choices further.
	// Also:
	// -- set hyphen word if needed
	// -- if word_ending is true and word is better than best_choice
	// copy word to best_choice log new word choice
	void go_deeper_dawg_fxn(
	const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
	int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
	bool word_ending, WERD_CHOICE *word, float certainties[],
	float limit, WERD_CHOICE best_choice, void *void_more_args);

	/* permute.cpp *************************************************************/
	void add_document_word(const WERD_CHOICE &best_choice);
	void init_permute();
	WERD_CHOICE *permute_top_choice(
	const BLOB_CHOICE_LIST_VECTOR &char_choices,
	float* rating_limit,
	WERD_CHOICE *raw_choice,
	BOOL8 *any_alpha);
	const char* choose_il1(const char *first_char, //first choice
	const char *second_char, //second choice
	const char *third_char, //third choice
	const char *prev_char, //prev in word
	const char *next_char, //next in word
	const char *next_next_char); //after next next in word
	int valid_word(const WERD_CHOICE &word) {
	return valid_word(word, false); // return NO_PERM for words with digits
	}
	int valid_word_or_number(const WERD_CHOICE &word) {
	return valid_word(word, true); // return NUMBER_PERM for valid numbers
	}
	int valid_word(const WERD_CHOICE &word, bool numbers_ok);
	bool valid_punctuation(const WERD_CHOICE &word);
	WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
	float rating_limit,
	WERD_CHOICE *raw_choice);
	void end_permute();
	void adjust_non_word(WERD_CHOICE word, float adjust_factor);
	void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
	float rating_limit,
	int start,
	int end,
	WERD_CHOICE *current_word);
	void permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
	float limit,
	WERD_CHOICE *best_choice,
	WERD_CHOICE *raw_choice);
	WERD_CHOICE *permute_compound_words(
	const BLOB_CHOICE_LIST_VECTOR &char_choices,
	float rating_limit);
	// checks if the dominant word script, if there is one, is same as target.
	bool word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices,
	int target_script_id);
	// Incoporate segmentation cost into word rating
	void incorporate_segcost(WERD_CHOICE* word);
	// checks for script-consistent permutations
	WERD_CHOICE *permute_script_words(
	const BLOB_CHOICE_LIST_VECTOR &char_choices);

	WERD_CHOICE *top_fragments_permute_and_select(
	const BLOB_CHOICE_LIST_VECTOR &char_choices,
	float rating_limit);
	// If the choice being composed so far could be better
	// than best_choice keep exploring char_choices.
	// If we have reached the end of the word and word is better than
	// best_choice, copy word to best_choice and log a new word choice.
	void go_deeper_top_fragments_fxn(
	const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
	int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
	bool word_ending, WERD_CHOICE *word, float certainties[],
	float limit, WERD_CHOICE best_choice, void *more_args);

	// Semi-generic functions used by multiple permuters.
	bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
	float curr_rating, float curr_certainty,
	const CHAR_FRAGMENT_INFO *prev_char_frag_info,
	const char *debug, int word_ending,
	CHAR_FRAGMENT_INFO *char_frag_info);
	void permute_choices(
	const char *debug,
	const BLOB_CHOICE_LIST_VECTOR &char_choices,
	int char_choice_index,
	const CHAR_FRAGMENT_INFO *prev_char_frag_info,
	WERD_CHOICE *word,
	float certainties[],
	float *limit,
	WERD_CHOICE *best_choice,
	void *more_args);

	void append_choices(
	const char *debug,
	const BLOB_CHOICE_LIST_VECTOR &char_choices,
	const BLOB_CHOICE &blob_choice,
	int char_choice_index,
	const CHAR_FRAGMENT_INFO *prev_char_frag_info,
	WERD_CHOICE *word,
	float certainties[],
	float *limit,
	WERD_CHOICE *best_choice,
	void *more_args);
	// Pointer to go_deeper function that will be modified by various permuters.
	void (Dict::go_deeper_fxn_)(const char debug,
	const BLOB_CHOICE_LIST_VECTOR &char_choices,
	int char_choice_index,
	const CHAR_FRAGMENT_INFO *prev_char_frag_info,
	bool word_ending, WERD_CHOICE *word,
	float certainties[], float *limit,
	WERD_CHOICE best_choice, void void_more_args);
	/* stopper.cpp *************************************************************/
	int NoDangerousAmbig(WERD_CHOICE *BestChoice,
	DANGERR *fixpt,
	bool fix_replaceable,
	BLOB_CHOICE_LIST_VECTOR *Choices,
	bool *modified_blobs);
	void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
	UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
	BLOB_CHOICE_LIST_VECTOR *blob_choices,
	bool *modified_blobs);

	inline void DisableChoiceAccum() { keep_word_choices_ = FALSE; }
	inline void EnableChoiceAccum() { keep_word_choices_ = TRUE; }

	int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
	VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice,
	FLOAT32 AdjustFactor,
	const float Certainties[]);
	void PrintViableChoice(FILE File, const char Label, VIABLE_CHOICE Choice);
	int StringSameAs(const char *String,
	const char *String_lengths,
	VIABLE_CHOICE ViableChoice);
	bool StringSameAs(const WERD_CHOICE &WordChoice,
	VIABLE_CHOICE ViableChoice);
	int AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
	WERD_CHOICE *BestChoice,
	const WERD_CHOICE &RawChoice,
	DANGERR *fixpt,
	ACCEPTABLE_CHOICE_CALLER caller,
	bool *modified_blobs);
	int AcceptableResult(const WERD_CHOICE &BestChoice,
	const WERD_CHOICE &RawChoice);
	int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice);
	void LogNewChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor,
	const float Certainties[], bool raw_choice);
	void EndDangerousAmbigs();
	int CurrentBestChoiceIs(const WERD_CHOICE &WordChoice);
	FLOAT32 CurrentBestChoiceAdjustFactor();
	int CurrentWordAmbig();
	void DebugWordChoices();
	void PrintAmbigAlternatives(FILE file, const char label,
	int label_num_unichars);
	void FillViableChoice(const WERD_CHOICE &WordChoice,
	FLOAT32 AdjustFactor, const float Certainties[],
	bool SameString, VIABLE_CHOICE ViableChoice);
	int AlternativeChoicesWorseThan(FLOAT32 Threshold);
	void FilterWordChoices();
	void FindClassifierErrors(FLOAT32 MinRating,
	FLOAT32 MaxRating,
	FLOAT32 RatingMargin,
	FLOAT32 Thresholds[]);
	void InitChoiceAccum();
	void LogNewSegmentation(PIECES_STATE BlobWidth);
	void LogNewSplit(int Blob);
	void SettupStopperPass1();
	void SettupStopperPass2();
	/* choices.cpp *************************************************************/
	void print_word_string(const char* str);
	void print_word_choice(const char label, A_CHOICE choice);
	void print_choices(const char *label,
	CHOICES rating); // List of (A_CHOICE*).
	/* permngram.cpp ***********************************************************/
	A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
	float rating_limit,
	const Dawg *dawg);
	/* dawg.cpp ****************************************************************/

	// Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
	// of the current state the letter at word_index in the given word
	// is allowed according to at least one of the dawgs in dawgs_,
	// otherwise returns NO_PERM.
	//
	// The state is described by void_dawg_args, which are interpreted as
	// DawgArgs and contain two relevant input vectors: active_dawgs and
	// constraints. Each entry in the active_dawgs vector contains an index
	// into the dawgs_ vector and an EDGE_REF that indicates the last edge
	// followed in the dawg. Each entry in the constraints vector contains
	// an index into the dawgs_ vector and an EDGE_REF that indicates an edge
	// in a pattern dawg followed to match a pattern. Currently constraints
	// are used to save the state of punctuation dawgs after leading
	// punctuation was found.
	//
	// Input:
	// At word_index 0 dawg_args->active_dawgs should contain an entry for each
	// dawg whose type has a bit set in kBeginningDawgsType,
	// dawg_args->constraints should be empty. EDGE_REFs in active_dawgs and
	// constraints vectors should be initialized to NO_EDGE. If hyphen state
	// needs to be applied, initial dawg_args->active_dawgs and
	// dawg_args->constrains can be copied from the saved hyphen state
	// (maintained by Dict).
	// For word_index > 0 the corresponding state (active_dawgs and constraints)
	// can be obtained from dawg_args->updated_* passed to def_letter_is_okay
	// for word_index-1.
	// Note: the function assumes that active_dags, constraints and updated_*
	// member variables of dawg_args are not NULL.
	//
	// Output:
	// The function fills in dawg_args->updated_active_dawgs vector with the
	// entries for dawgs that contain the word up to the letter at word_index.
	// The new constraints (if any) are added to dawg_args->updated_constraints,
	// the constraints from dawg_args->constraints are also copied into it.
	//
	// Detailed description:
	// In order to determine whether the word is still valid after considering
	// all the letters up to the one at word_index the following is done for
	// each entry in dawg_args->active_dawgs:
	//
	// -- next starting node is obtained from entry.ref and edge_char_of() is
	// called to obtain the next edge
	// -- if a valid edge is found, the function returns the updated permuter
	// code true and an entry [entry.dawg_index, edge] is inserted in
	// dawg_args->updated_active_dawgs
	// otherwise:
	// -- if we are dealing with dawg of type DAWG_TYPE_PUNCTUATION,
	// edge_char_of() is called again, but now with kPatternUnicharID
	// as unichar_id; if a valid edge is found it is recorded in
	// dawg_args->updated_constraints
	// -- the function checks whether the word can end with the previous
	// letter
	// -- each successor of the dawg (e.g. dawgs with type DAWG_TYPE_WORD
	// could be successors to dawgs with type DAWG_TYPE_PUNCTUATION; the
	// successors are defined by successors_ vector) is explored and if
	// a letter is found in the successor dawg, a new entry is inserted
	// into dawg_args->updated_active_dawgs with EDGE_REF being either
	// NO_EDGE or an EDGE_REF recorded in constraints vector for the
	// corresponding dawg index

	//
	int def_letter_is_okay(void* void_dawg_args, int word_index,
	const void* word, bool word_end);

	int new_letter_is_okay(void* void_dawg_args, int word_index,
	const void* word, bool word_end);
	int (Dict::letter_is_okay_)(void void_dawg_args, int word_index,
	const void *word, bool word_end);
	// Return the number of dawgs in the dawgs_ vector.
	inline const int NumDawgs() const { return dawgs_.size(); }
	// Return i-th dawg pointer recorded in the dawgs_ vector.
	inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
	// At word ending make sure all the recorded constraints are satisfied.
	// Each constraint signifies that we found a beginning pattern in a
	// pattern dawg. Check that this pattern can end here (e.g. if some
	// leading punctuation is found this would ensure that we are not
	// expecting any particular trailing punctuation after the word).
	inline bool ConstraintsOk(const DawgInfoVector &constraints,
	int word_end, DawgType current_dawg_type) {
	if (!word_end) return true;
	if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
	for (int c = 0; c < constraints.length(); ++c) {
	const DawgInfo &cinfo = constraints[c];
	Dawg *cdawg = dawgs_[cinfo.dawg_index];
	if (!cdawg->end_of_word(cinfo.ref)) {
	if (dawg_debug_level >= 3) {
	tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n",
	cinfo.dawg_index, cinfo.ref);
	}
	return false;
	}
	}
	return true;
	}
	// Record the maximum of the two permuters in permuter.
	static inline void UpdatePermuter(PermuterType new_permuter,
	PermuterType *permuter) {
	if (dawg_debug_level >= 3) tprintf("Letter found\n");
	if (new_permuter > permuter) permuter = new_permuter;
	}

	/* conversion.cpp **********************************************************/
	// TODO(daria): remove these function when conversion.cpp is deprecated
	// and all the code is converted to work with unichar ids.
	void LogNewWordChoice(A_CHOICE *a_choice,
	FLOAT32 adjust_factor,
	const float certainties[],
	const UNICHARSET &unicharset);
	int valid_word(const char *string);

	private:
	// Private member variables.
	Image* image_ptr_;
	// Table that stores ambiguities computed during training
	// (loaded when NoDangerousAmbigs() is called for the first time).
	// Each entry i in the table stores a set of amibiguities whose
	// wrong ngram starts with unichar id i.
	UnicharAmbigs *dang_ambigs_table_;
	// Same as above, but for ambiguities with replace flag set.
	UnicharAmbigs *replace_ambigs_table_;
	// Flag used to disable accumulation of word choices
	// during compound word permutation.
	BOOL8 keep_word_choices_;
	// Additional certainty padding allowed before a word is rejected.
	FLOAT32 reject_offset_;
	// Current word segmentation.
	PIECES_STATE current_segmentation_;
	// Variables to keep track of best/raw word choices.
	VIABLE_CHOICE best_raw_choice_;
	LIST raw_choices_;
	LIST best_choices_;
	// Hyphen-related variables.
	UNICHAR_ID hyphen_unichar_id_;
	WERD_CHOICE *hyphen_word_;
	DawgInfoVector hyphen_active_dawgs_;
	DawgInfoVector hyphen_constraints_;
	bool last_word_on_line_;
	// Dawgs.
	DawgVector dawgs_;
	SuccessorListsVector successors_;
	Dawg *freq_dawg_;
	Trie *pending_words_;
	// The following pointers are only cached for convenience.
	// The dawgs will be deleted when dawgs_ vector is destroyed.
	// TODO(daria): need to support multiple languages in the future,
	// so maybe will need to maintain a list of dawgs of each kind.
	Trie *document_words_;
	};
	} // namespace tesseract

	#endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_