dict/permute.cpp - platform/external/tesseract - Git at Google

 /* -*-C-*-
  ********************************************************************************
  *
  * File:        permute.c  (Formerly permute.c)
  * Description:  Choose OCR text given character-probability maps
  *               for sequences of glyph fragments and a dictionary provided as
  *               a Dual Acyclic Word Graph.
  *               In this file, "permute" should be read "combine."
  * Author:       Mark Seaman, OCR Technology
  * Created:      Fri Sep 22 14:05:51 1989
  * Modified:     Thu Jan  3 16:38:46 1991 (Mark Seaman) marks@hpgrlt
  * Language:     C
  * Package:      N/A
  * Status:       Experimental (Do Not Distribute)
  *
  * (c) Copyright 1989, Hewlett-Packard Company.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  *********************************************************************************/
 /*----------------------------------------------------------------------
             I n c l u d e s
 ---------------------------------------------------------------------*/

 #include <assert.h>
 #include <math.h>

 #include "const.h"

 #include "permute.h"

 #include "callcpp.h"
 #include "context.h"
 #include "conversion.h"
 #include "freelist.h"
 #include "globals.h"
 #include "ndminx.h"
 #include "permdawg.h"
 #include "permngram.h"
 #include "ratngs.h"
 #include "stopper.h"
 #include "tordvars.h"
 #include "tprintf.h"
 #include "trie.h"
 #include "varable.h"
 #include "unicharset.h"
 #include "dict.h"
 #include "image.h"
 #include "ccutil.h"

 int permutation_count;           // Used in metrics.cpp.
 /*----------------------------------------------------------------------
               V a r i a b l e s
 ----------------------------------------------------------------------*/
 // TODO(tkielbus) Choose a value for the MAX_NUM_EDGES constant
 // (or make it dynamic)
 #define MAX_NUM_EDGES          2000000
 #define MAX_DOC_EDGES          250000
 #define MAX_USER_EDGES         50000
                                  /* Weights for adjustment */
 #define NON_WERD               1.25
 #define GARBAGE_STRING         1.5
 #define MAX_PERM_LENGTH         128

 // debugging flags
 INT_VAR(fragments_debug, 0, "Debug character fragments");

 BOOL_VAR(segment_debug, 0, "Debug the whole segmentation process");

 BOOL_VAR(permute_debug, 0, "Debug char permutation process");


 // control parameters
 double_VAR(bestrate_pruning_factor, 2.0,
            "Multiplying factor of current best rate to prune other hypotheses");

 BOOL_VAR(permute_script_word, 0,
          "Turn on word script consistency permuter");

 BOOL_VAR(segment_segcost_rating, 0,
          "incorporate segmentation cost in word rating?");

 double_VAR(segment_reward_script, 0.95,
            "Score multipler for script consistency within a word. "
            "Being a 'reward' factor, it should be <= 1. "
            "Smaller value implies bigger reward.");

 double_VAR(segment_penalty_dict_nonword, NON_WERD,
            "Score multiplier for glyph fragment segmentations which do not "
            "match a dictionary word (lower is better).");

 double_VAR(segment_penalty_garbage, GARBAGE_STRING,
            "Score multiplier for poorly cased strings that are not in the "
            "dictionary and generally look like garbage (lower is better).");

 BOOL_VAR(save_doc_words, 0, "Save Document Words");

 BOOL_VAR(doc_dict_enable, 1, "Enable Document Dictionary ");

 BOOL_VAR(ngram_permuter_activated, FALSE,
          "Activate character-level n-gram-based permuter");

 STRING_VAR(global_user_words_suffix, "", "A list of user-provided words.");

 // This is an ugly way to incorporate segmentation cost in word rating.
 // See comments in incorporate_segcost.
 float wordseg_rating_adjust_factor;

 int permute_only_top = 0;

 #define SIM_CERTAINTY_SCALE  -10.0   /* Similarity matcher values */
 #define SIM_CERTAINTY_OFFSET -10.0   /* Similarity matcher values */
 #define SIMILARITY_FLOOR     100.0   /* Worst E*L product to stop on */

 // TODO(daria): If hyphens are different in different languages and can be
 // inferred from training data we should load their values dynamically.
 static const char kHyphenSymbol[] = "-";

 /*----------------------------------------------------------------------
               F u n c t i o n s
 ----------------------------------------------------------------------*/

 /**********************************************************************
  * get_best_delete_other
  *
  * Returns the best of two choices and deletes the other (worse) choice.
  * A choice is better if it has a non-empty string and has a lower
  * rating than the other choice. If the ratings are the same,
  * choice2 is preferred over choice1.
  **********************************************************************/
 WERD_CHOICE *get_best_delete_other(WERD_CHOICE *choice1,
                                    WERD_CHOICE *choice2) {
   if (!choice1) return choice2;
   if (!choice2) return choice1;
   if (choice1->rating() < choice2->rating() || choice2->length() == 0) {
     delete choice2;
     return choice1;
   } else {
     delete choice1;
     return choice2;
   }
 }


 /**********************************************************************
  * good_choice
  *
  * Return TRUE if a good answer is found for the unknown blob rating.
  **********************************************************************/
 int good_choice(const WERD_CHOICE &choice) {
   register float certainty;
   if (tord_similarity_enable) {
     if ((choice.rating() + 1) * choice.certainty() > SIMILARITY_FLOOR)
       return false;
     certainty =
       SIM_CERTAINTY_OFFSET + choice.rating() * SIM_CERTAINTY_SCALE;
   } else {
     certainty = choice.certainty();
   }

   return (certainty > tord_certainty_threshold) ? true : false;
 }


 /**********************************************************************
  * add_document_word
  *
  * Add a word found on this document to the document specific
  * dictionary.
  **********************************************************************/
 namespace tesseract {
 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
   // Do not add hyphenated word parts to the document dawg.
   // hyphen_word_ will be non-NULL after the set_hyphen_word() is
   // called when the first part of the hyphenated word is
   // discovered and while the second part of the word is recognized.
   // hyphen_word_ is cleared in cc_recg() before the next word on
   // the line is recognized.
   if (hyphen_word_) return;

   char filename[CHARS_PER_LINE];
   FILE *doc_word_file;
   int stringlen = best_choice.length();

   if (!doc_dict_enable || valid_word(best_choice) ||
       CurrentWordAmbig() || stringlen < 2)
     return;

   if (!good_choice(best_choice) || stringlen == 2) {
     if (best_choice.certainty() < permuter_pending_threshold)
       return;

     if (!pending_words_->word_in_dawg(best_choice)) {
       if (stringlen > 2 ||
           (stringlen == 2 &&
            getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
            getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
         pending_words_->add_word_to_dawg(best_choice);
       }
       return;
     }
   }

   if (save_doc_words) {
     strcpy(filename, getImage()->getCCUtil()->imagefile.string());
     strcat (filename, ".doc");
     doc_word_file = open_file (filename, "a");
     fprintf (doc_word_file, "%s\n",
              best_choice.debug_string(getUnicharset()).string());
     fclose(doc_word_file);
   }
   document_words_->add_word_to_dawg(best_choice);
 }


 /**********************************************************************
  * adjust_non_word
  *
  * Assign an adjusted value to a string that is a non-word.  The value
  * that this word choice has is based on case and punctuation rules.
  * The adjustment value applied is stored in adjust_factor upon return.
  **********************************************************************/
 void Dict::adjust_non_word(WERD_CHOICE *word, float *adjust_factor) {
   float new_rating;
   if (permute_debug)
     cprintf("Non-word: %s %4.2f ",
             word->debug_string(getUnicharset()).string(), word->rating());

   new_rating = word->rating() + RATING_PAD;
   if (Context::case_ok(*word, getUnicharset()) && valid_punctuation(*word)) {
     new_rating *= segment_penalty_dict_nonword;
     *adjust_factor = segment_penalty_dict_nonword;
     if (permute_debug) tprintf(", W");
   } else {
     new_rating *= segment_penalty_garbage;
     *adjust_factor = segment_penalty_garbage;
     if (permute_debug) {
       if (!Context::case_ok(*word, getUnicharset())) tprintf(", C");
       if (!valid_punctuation(*word)) tprintf(", P");
     }
   }
   new_rating -= RATING_PAD;
   word->set_rating(new_rating);
   if (permute_debug)
     cprintf (" %4.2f --> %4.2f\n", *adjust_factor, new_rating);
 }


 /**********************************************************************
  * init_permute
  *
  * Initialize anything that needs to be set up for the permute
  * functions.
  **********************************************************************/
 void Dict::init_permute() {
   STRING name;
   STRING &lang = getImage()->getCCUtil()->lang;

   if (dawgs_.length() != 0) end_permute();

   hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
   TessdataManager &tessdata_manager =
     getImage()->getCCUtil()->tessdata_manager;

   // Load dawgs_.
   if (global_load_punc_dawg &&
       tessdata_manager.SeekToStart(TESSDATA_PUNC_DAWG)) {
     dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(),
                                DAWG_TYPE_PUNCTUATION, lang, PUNC_PERM);
   }
   if (global_load_system_dawg &&
       tessdata_manager.SeekToStart(TESSDATA_SYSTEM_DAWG)) {
     dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(),
                                DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM);
   }
   if (global_load_number_dawg &&
       tessdata_manager.SeekToStart(TESSDATA_NUMBER_DAWG)) {
     dawgs_ +=
       new SquishedDawg(tessdata_manager.GetDataFilePtr(),
                        DAWG_TYPE_NUMBER, lang, NUMBER_PERM);
   }
   if (((STRING &)global_user_words_suffix).length() > 0) {
     Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
                               MAX_USER_EDGES, getUnicharset().size());
   name = getImage()->getCCUtil()->language_data_path_prefix;
     name += global_user_words_suffix;
     if (!trie_ptr->read_word_list(name.string(), getUnicharset())) {
       tprintf("Error: failed to load %s\n", name.string());
       exit(1);
     }
     dawgs_ += trie_ptr;
   }
   document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
                              MAX_DOC_EDGES, getUnicharset().size());
   dawgs_ += document_words_;

   // This dawg is temporary and should not be searched by letter_is_ok.
   pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
                             MAX_DOC_EDGES, getUnicharset().size());

   // The frequent words dawg is only searched when a word
   // is found in any of the other dawgs.
   if (tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
     freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
                                   DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM);
   }

   // Construct a list of corresponding successors for each dawg. Each entry i
   // in the successors_ vector is a vector of integers that represent the
   // indices into the dawgs_ vector of the successors for dawg i.
   successors_.reserve(dawgs_.length());
   for (int i = 0; i < dawgs_.length(); ++i) {
     const Dawg *dawg = dawgs_[i];
     SuccessorList *lst = new SuccessorList();
     for (int j = 0; j < dawgs_.length(); ++j) {
       const Dawg *other = dawgs_[j];
       if (dawg->lang() == other->lang() &&
           kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
     }
     successors_ += lst;
   }
 }

 void Dict::end_permute() {
   if (dawgs_.length() == 0)
     return;  // Not safe to call twice.
   dawgs_.delete_data_pointers();
   successors_.delete_data_pointers();
   dawgs_.clear();
   successors_.clear();
   document_words_ = NULL;
   if (pending_words_ != NULL) delete pending_words_;
   pending_words_ = NULL;
   if (freq_dawg_ != NULL) delete freq_dawg_;
   freq_dawg_ = NULL;
 }


 /**********************************************************************
  * permute_all
  *
  * Permute all the characters together using all of the different types
  * of permuters/selectors available.  Each of the characters must have
  * a non-NULL choice list.
  *
  * Note: order of applying permuters does matter, since the latter
  * permuter will be recorded if the resulting word ratings are the same.
  **********************************************************************/
 WERD_CHOICE *Dict::permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                                float rating_limit,
                                WERD_CHOICE *raw_choice) {
   WERD_CHOICE *result1;
   WERD_CHOICE *result2 = NULL;
   BOOL8 any_alpha;
   float top_choice_rating_limit = rating_limit;

   // Initialize result1 from the result of permute_top_choice.
   result1 = permute_top_choice(char_choices, &top_choice_rating_limit,
                                raw_choice, &any_alpha);

   // Enforce script consistency within a word on some scripts
   if (permute_script_word &&
       !word_script_eq(char_choices, getUnicharset().common_sid()) &&
       !word_script_eq(char_choices, getUnicharset().latin_sid())) {
     result2 = permute_script_words(char_choices);
     // TODO(dsl): incorporate segmentation cost into word rating.
     // This should only be turned on for scripts that we have a segmentation
     // cost model for, such as CJK.
     if (segment_segcost_rating)
       incorporate_segcost(result2);
     result1 = get_best_delete_other(result1, result2);
   }

   // Permute character fragments if necessary.
   if (result1 == NULL || result1->fragment_mark()) {
     result2 = top_fragments_permute_and_select(char_choices,
                                                top_choice_rating_limit);
     result1 = get_best_delete_other(result1, result2);
   }

   // TODO(daria): update ngram permuter code.
   if (ngram_permuter_activated) {
     tprintf("Error: ngram permuter functionality is not available\n");
     exit(1);
     // A_CHOICE *ngram_choice =
     //  ngram_permute_and_select(old_char_choices, rating_limit, word_dawg_);
     // return ngram_choice;
   }

   if (result1 == NULL)
     return (NULL);
   if (permute_only_top)
     return result1;

   result2 = dawg_permute_and_select(char_choices, rating_limit);
   result1 = get_best_delete_other(result1, result2);

   result2 = permute_compound_words(char_choices, rating_limit);
   result1 = get_best_delete_other(result1, result2);

   return (result1);
     }

 // Returns the top choice char id.  A helper function to make code cleaner.
 UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list) {
   BLOB_CHOICE_IT blob_choice_it;
   blob_choice_it.set_to_list(blob_list);
   return (blob_choice_it.data()) ? blob_choice_it.data()->unichar_id()
                                  : INVALID_UNICHAR_ID;
   }

 // Return the "dominant" script ID for the word.  By "dominant", the script
 // must account for at least half the characters.  Otherwise, it returns 0.
 int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                         const UNICHARSET &unicharset) {
   int max_script = unicharset.get_script_table_size();
   int *sid = new int[max_script];
   int x;
   for (x = 0; x < max_script; x++) sid[x] = 0;
   for (x = 0; x < char_choices.length(); ++x) {
     BLOB_CHOICE_IT blob_choice_it;
     blob_choice_it.set_to_list(char_choices.get(x));
     sid[blob_choice_it.data()->script_id()]++;
   }
   // Note that high script ID overrides lower one on a tie, thus biasing
   // towards non-Common script (if sorted that way in unicharset file).
   int max_sid = 0;
   for (x = 1; x < max_script; x++)
     if (sid[x] >= sid[max_sid]) max_sid = x;
   if (sid[max_sid] < char_choices.length() / 2)
     max_sid = unicharset.null_sid();
   delete[] sid;
   return max_sid;
   }

 /**********************************************************************
  * Checks whether the dominant word script, if there is one, matches
  * the given target script ID.
  **********************************************************************/
 bool Dict::word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                           int target_sid) {
   int max_sid = get_top_word_script(char_choices, getUnicharset());
   // If "Latin" is not a loaded script, then latin_sid() would return 0.
   // max_sid could also be 0 if there is no dominant script.
   // This is faster than
   // strcmp(getUnicharset().get_script_from_script_id(max_sid), "Latin")
   return (max_sid > 0 && max_sid == target_sid);
 }

 /**********************************************************************
  * Iterate through all the character choices (for a single blob) and
  * return the first that matches the given type, which is one of 'aA0px*',
  * for lower, upper, digit, punctuation, other, and 'any', respectively.
  * If not match is found, a NULL is returned.
  **********************************************************************/
 BLOB_CHOICE* find_choice_by_type(
     BLOB_CHOICE_LIST *char_choices,
     char target_type,
     const UNICHARSET &unicharset) {
   BLOB_CHOICE_IT c_it;
   c_it.set_to_list(char_choices);
   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
     bool found = false;
     UNICHAR_ID unichar_id = c_it.data()->unichar_id();
     switch (target_type) {
       case '*': found = true;  break;
       case 'A': found = unicharset.get_isupper(unichar_id); break;
       case 'a': found = unicharset.get_islower(unichar_id); break;
       case '0': found = unicharset.get_isdigit(unichar_id); break;
       case 'p': found = unicharset.get_ispunctuation(unichar_id); break;
       case 'x': found = !unicharset.get_isupper(unichar_id) &&
                         !unicharset.get_islower(unichar_id) &&
                         !unicharset.get_isdigit(unichar_id) &&
                         !unicharset.get_ispunctuation(unichar_id);
                 break;
     }
     if (found) return c_it.data();
   }
   return NULL;
 }

 /**********************************************************************
  * Iterate through all the character choices (for a single blob) and
  * return the first that matches the target script ID.  If backup_sid
  * is not 0, then a match on either the target or backup sid is allowed.
  * Note that there is no preference between a target or backup sid.
  * To search for another sid only if no target_sid matched, use
  * secondary_sid.
  * So for example, to find first Han or Common char choice, do
  *   find_choice_by_script(cchoice, han_sid, common_sid, 0);
  * To find first Han choice, but allow Common if none is found, do
  *   find_choice_by_script(cchoice, han_sid, 0, common_sid);
  **********************************************************************/
 BLOB_CHOICE* find_choice_by_script(
     BLOB_CHOICE_LIST *char_choices,
     int target_sid,
     int backup_sid,
     int secondary_sid) {
   BLOB_CHOICE_IT c_it;
   c_it.set_to_list(char_choices);
   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
     bool found = false;
     if (c_it.data()->script_id() == 0) continue;
     if (c_it.data()->script_id() == target_sid) found = true;
     if (backup_sid > 0 && c_it.data()->script_id() == backup_sid) found = true;
     if (found) return c_it.data();
   }
   if (secondary_sid > 0) {
     c_it.set_to_list(char_choices);
     for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
       if (c_it.data()->script_id() == 0) continue;
       if (c_it.data()->script_id() == secondary_sid)
         return c_it.data();
     }
   }
   return NULL;
 }

 /**********************************************************************
  * Incorporate segmentation cost into the word rating.  This is done
  * through a mutliplier wordseg_rating_adjust_factor which is determined
  * in bestfirst.cpp during state evaluation.  This is not the cleanest
  * way to do this.  It would be better to reorganize the SEARCH_STATE
  * to keep track of associated states, or do the rating adjustment
  * outside the permuter in evalaute_state.
  **********************************************************************/
 void Dict::incorporate_segcost(WERD_CHOICE *word) {
   if (!word || wordseg_rating_adjust_factor <= 0) return;

   float old_rating = word->rating();
   float new_rating = old_rating * wordseg_rating_adjust_factor;
   word->set_rating(new_rating);
   if (permute_debug)
     tprintf("Permute segadjust %f * %f --> %f\n",
             old_rating, wordseg_rating_adjust_factor, new_rating);
 }

 /**********************************************************************
  * Try flipping characters in a word to get better script consistency.
  * Similar to how upper/lower case checking is done in top_choice_permuter,
  * this permuter tries to suggest a more script-consistent choice AND
  * modifieds the rating.  So it combines both the case_ok check and
  * adjust_non_word functionality.  However, instead of penalizing an
  * inconsistent word with a > 1 multiplier, we reward the script-consistent
  * choice with a < 1 multiplier.
  **********************************************************************/
 WERD_CHOICE* Dict::permute_script_words(
     const BLOB_CHOICE_LIST_VECTOR &char_choices) {
   if (char_choices.length() > MAX_WERD_LENGTH)
     return NULL;

   int word_sid = get_top_word_script(char_choices, getUnicharset());
   if (word_sid == getUnicharset().null_sid())
     return NULL;

   if (permute_debug) {
     tprintf("\n\nPermuteScript %s\n",
             getUnicharset().get_script_from_script_id(word_sid));
     print_char_choices_list("", char_choices, getUnicharset(),
                             permute_debug > 1);
   }

   WERD_CHOICE *current_word = new WERD_CHOICE(MAX_WERD_LENGTH);
   BLOB_CHOICE_IT blob_choice_it;
   bool replaced = false;
   bool prev_is_consistent = false;
   for (int x = 0; x < char_choices.length(); ++x) {
     blob_choice_it.set_to_list(char_choices.get(x));
     BLOB_CHOICE *first_choice = blob_choice_it.data();
     if (!first_choice) return NULL;
     UNICHAR_ID unichar_id = first_choice->unichar_id();
     bool sid_consistent = (first_choice->script_id() == word_sid);
     bool this_is_punct = getUnicharset().get_ispunctuation(unichar_id);

     if (!sid_consistent && !this_is_punct && prev_is_consistent) {
       // If the previous char is CJK, we prefer a cjk over non-cjk char
       if (permute_debug) {
         tprintf("Checking %s r%g\n", getUnicharset().id_to_unichar(unichar_id),
                                      first_choice->rating());
         print_ratings_list("\t", char_choices.get(x), getUnicharset());
       }
       // prefer a script consistent choice
       BLOB_CHOICE* c_it = find_choice_by_script(char_choices.get(x),
                                                 word_sid, 0, 0);
       // make this a separate check
       // otherwise, prefer a punctuation
       if (c_it == NULL)
         c_it = find_choice_by_type(char_choices.get(x), 'p', getUnicharset());

       if (c_it != NULL) {
         if (permute_debug)
           tprintf("Replacing %d r%g ==> %d r%g\n",
                   first_choice->unichar_id(), first_choice->rating(),
                   c_it->unichar_id(), c_it->rating());
         first_choice = c_it;
         replaced = true;
       }
     }
     current_word->append_unichar_id_space_allocated(
       first_choice->unichar_id(), 1,
       first_choice->rating(), first_choice->certainty());
     prev_is_consistent = sid_consistent;
   }
   if (replaced) {
     // When we replace a word choice (usually top choice) with
     // another for the sake of script consistency, we need to improve its
     // rating so that it will replace the best choice.  How much we modify
     // the rating determines how strong is the script consistency constraint.
     // We need a more consistent solution for all contextual constraints
     // like case, punct pattern, script, etc.  Right now, this does the same
     // thing as adjust_non_words for case and punctuation rules.
     float rating = current_word->rating();
     rating *= segment_reward_script;
     current_word->set_rating(rating);
   }
   current_word->populate_unichars(getUnicharset());
   if (permute_debug && replaced)
     current_word->print("<== permute_script_word **");
   return current_word;
 }

 /**********************************************************************
  * permute_characters
  *
  * Permute these characters together according to each of the different
  * permuters that are enabled.
  **********************************************************************/
 void Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                               float limit,
                               WERD_CHOICE *best_choice,
                               WERD_CHOICE *raw_choice) {
   float old_raw_choice_rating = raw_choice->rating();
   permutation_count++;           /* Global counter */
   if (tord_display_ratings > 1) {
     cprintf("\nchar_choices in permute_characters:\n");
     print_char_choices_list("\n==> Input CharChoices", char_choices,
                             getUnicharset(), true);
   }

   if (char_choices.length() == 1 &&
       get_top_choice_uid(char_choices.get(0)) == 0)
     return;
   WERD_CHOICE *this_choice = permute_all(char_choices, limit, raw_choice);

   if (raw_choice->rating() < old_raw_choice_rating) {
     // Populate unichars_ and unichar_lengths_ of raw_choice. This is
     // needed for various components that still work with unichars rather
     // than unichar ids (e.g. AdaptToWord).
     raw_choice->populate_unichars(getUnicharset());
   }
   if (this_choice && this_choice->rating() < best_choice->rating()) {
     *best_choice = *this_choice;
     // Populate unichars_ and unichar_lengths_ of best_choice. This is
     // needed for various components that still work with unichars rather
     // than unichar ids (dawg, *_ok functions, various hard-coded hacks).
     best_choice->populate_unichars(getUnicharset());

     if (tord_display_ratings) {
       cprintf("permute_characters: %s\n",
               best_choice->debug_string(getUnicharset()).string());
     }
   }
   delete this_choice;
 }

 /**********************************************************************
  * permute_compound_words
  *
  * Return the top choice for each character as the choice for the word.
  **********************************************************************/
 WERD_CHOICE *Dict::permute_compound_words(
     const BLOB_CHOICE_LIST_VECTOR &char_choices,
     float rating_limit) {
   BLOB_CHOICE *first_choice;
   WERD_CHOICE *best_choice = NULL;
   WERD_CHOICE current_word(MAX_WERD_LENGTH);
   int first_index = 0;
   int x;
   BLOB_CHOICE_IT blob_choice_it;

   if (char_choices.length() > MAX_WERD_LENGTH) {
     WERD_CHOICE *bad_word_choice = new WERD_CHOICE();
     bad_word_choice->make_bad();
     return bad_word_choice;
   }

   UNICHAR_ID slash = getUnicharset().unichar_to_id("/");
   UNICHAR_ID dash = getUnicharset().unichar_to_id("-");
   for (x = 0; x < char_choices.length(); ++x) {
     blob_choice_it.set_to_list(char_choices.get(x));
     first_choice = blob_choice_it.data();
     if (first_choice->unichar_id() == slash ||
         first_choice->unichar_id() == dash) {
       if (x > first_index) {
         if (segment_debug)
           cprintf ("Hyphenated word found\n");
         permute_subword(char_choices, rating_limit, first_index,
                         x - 1, &current_word);
         if (current_word.rating() > rating_limit)
           break;
       }
       // Append hyphen/slash separator to current_word.
       current_word.append_unichar_id_space_allocated(
           first_choice->unichar_id(), 1,
           first_choice->rating(), first_choice->certainty());

       first_index = x + 1;  // update first_index
     }
   }

   if (first_index > 0 && first_index < x &&
       current_word.rating() <= rating_limit) {
     permute_subword(char_choices, rating_limit, first_index,
                     x - 1, &current_word);
     best_choice = new WERD_CHOICE(current_word);
     best_choice->set_permuter(COMPOUND_PERM);
   }
   return (best_choice);
 }


 /**********************************************************************
  * permute_subword
  *
  * Permute a part of a compound word this subword is bounded by hyphens
  * and the start and end of the word.  Call the standard word permute
  * function on a set of choices covering only part of the original
  * word.  When it is done reclaim the memory that was used in the
  * excercise.
  **********************************************************************/
 void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                            float rating_limit,
                            int start,
                            int end,
                            WERD_CHOICE *current_word) {
   int x;
   BLOB_CHOICE_LIST_VECTOR subchoices;
   WERD_CHOICE *best_choice = NULL;
   WERD_CHOICE raw_choice;
   raw_choice.make_bad();

   DisableChoiceAccum();

   for (x = start; x <= end; x++) {
     if (char_choices.get(x) != NULL) {
       subchoices += char_choices.get(x);
     }
   }

   if (!subchoices.empty()) {
     bool old_segment_dawg_debug = segment_dawg_debug;
     if (segment_debug) segment_dawg_debug.set_value(true);
     best_choice = permute_all(subchoices, rating_limit, &raw_choice);

     if (segment_debug) {
       segment_dawg_debug.set_value(old_segment_dawg_debug);
     }
     if (best_choice && best_choice->length() > 0) {
       *current_word += *best_choice;
     } else {
       current_word->set_rating(MAX_FLOAT32);
     }
   } else {
     current_word->set_rating(MAX_FLOAT32);
   }

   if (best_choice)
     delete best_choice;

   if (segment_debug && current_word->rating() < MAX_FLOAT32) {
     cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
              current_word->debug_string(getUnicharset()).string(),
              current_word->rating(), current_word->certainty());
   }

   EnableChoiceAccum();
 }

 /**********************************************************************
  * permute_top_choice
  *
  * Return the top choice for each character as the choice for the word.
  * In addition a choice is created for the best lower and upper case
  * non-words.  In each character position the best lower (or upper) case
  * character is substituted for the best overall character.
  **********************************************************************/
 WERD_CHOICE *Dict::permute_top_choice(
     const BLOB_CHOICE_LIST_VECTOR &char_choices,
     float* rating_limit,
     WERD_CHOICE *raw_choice,
     BOOL8 *any_alpha) {
   BLOB_CHOICE *first_choice;
   const char *first_char;             //first choice
   const char *second_char;            //second choice
   const char *third_char;             //third choice
   char prev_char[UNICHAR_LEN + 1];    //prev in word
   const char *next_char = "";         //next in word
   const char *next_next_char = "";    //after next next in word

   WERD_CHOICE word(MAX_PERM_LENGTH);
   word.set_permuter(TOP_CHOICE_PERM);
   WERD_CHOICE capital_word(MAX_PERM_LENGTH);
   capital_word.set_permuter(UPPER_CASE_PERM);
   WERD_CHOICE lower_word(MAX_PERM_LENGTH);
   lower_word.set_permuter(LOWER_CASE_PERM);

   int x;
   BOOL8 char_alpha;
   float first_rating = 0;
   float adjust_factor;

   float certainties[MAX_PERM_LENGTH + 1];
   float lower_certainties[MAX_PERM_LENGTH + 1];
   float upper_certainties[MAX_PERM_LENGTH + 1];

   BLOB_CHOICE_IT blob_choice_it;
   UNICHAR_ID temp_id;
   UNICHAR_ID unichar_id;
   UNICHAR_ID space = getUnicharset().unichar_to_id(" ");
   register const char* ch;
   register inT8 lower_done;
   register inT8 upper_done;

   prev_char[0] = '\0';

   if (any_alpha != NULL)
     *any_alpha = FALSE;

   if (char_choices.length() > MAX_PERM_LENGTH) {
     return (NULL);
   }

   for (x = 0; x < char_choices.length(); ++x) {
     if (x + 1 < char_choices.length()) {
       unichar_id = get_top_choice_uid(char_choices.get(x+1));
       next_char = unichar_id != INVALID_UNICHAR_ID ?
         getUnicharset().id_to_unichar(unichar_id) : "";
     } else {
       next_char = "";
     }

     if (x + 2 < char_choices.length()) {
       unichar_id = get_top_choice_uid(char_choices.get(x+2));
       next_next_char = unichar_id != INVALID_UNICHAR_ID ?
         getUnicharset().id_to_unichar(unichar_id) : "";
     } else {
       next_next_char = "";
     }

     blob_choice_it.set_to_list(char_choices.get(x));
     ASSERT_HOST(!blob_choice_it.empty());
     first_choice = NULL;
     for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
          blob_choice_it.forward()) {  // find the best non-fragment char choice
       temp_id = blob_choice_it.data()->unichar_id();
       if (!(getUnicharset().get_fragment(temp_id))) {
         first_choice = blob_choice_it.data();
         break;
       } else if (char_choices.length() > 1) {
         word.set_fragment_mark(true);
         capital_word.set_fragment_mark(true);
         lower_word.set_fragment_mark(true);
       }
     }
     if (first_choice == NULL) {
       cprintf("Permuter found only fragments for"
               " character at position %d; word=%s\n",
               x, word.debug_string(getUnicharset()).string());
     }
     ASSERT_HOST(first_choice != NULL);

     unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ?
       first_choice->unichar_id() : space;
     first_char = getUnicharset().id_to_unichar(unichar_id);
     first_rating = first_choice->rating();
     word.append_unichar_id_space_allocated(
         unichar_id, 1, first_choice->rating(), first_choice->certainty());
     capital_word.append_unichar_id_space_allocated(
         unichar_id, 1, first_choice->rating(), first_choice->certainty());
     lower_word.append_unichar_id_space_allocated(
         unichar_id, 1, first_choice->rating(), first_choice->certainty());

     certainties[x] = first_choice->certainty();
     lower_certainties[x] = first_choice->certainty();
     upper_certainties[x] = first_choice->certainty();

     lower_done = FALSE;
     upper_done = FALSE;
     char_alpha = FALSE;
     second_char = "";
     third_char = "";
     for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
       unichar_id = blob_choice_it.data()->unichar_id();
       if (getUnicharset().eq(unichar_id, "l") && !blob_choice_it.at_last() &&
           blob_choice_it.data_relative(1)->rating() == first_rating) {
         temp_id = blob_choice_it.data_relative(1)->unichar_id();
         if (getUnicharset().eq(temp_id, "1") ||
             getUnicharset().eq(temp_id, "I")) {
           second_char = getUnicharset().id_to_unichar(temp_id);
           blob_choice_it.forward();
           if (!blob_choice_it.at_last() &&
               blob_choice_it.data_relative(1)->rating() == first_rating) {
             temp_id = blob_choice_it.data_relative(1)->unichar_id();
             if (getUnicharset().eq(temp_id, "1") ||
                 getUnicharset().eq(temp_id, "I")) {
               third_char = getUnicharset().id_to_unichar(temp_id);
               blob_choice_it.forward();
             }
           }
           ch = choose_il1 (first_char, second_char, third_char,
             prev_char, next_char, next_next_char);
           unichar_id = (ch != NULL && *ch != '\0') ?
             getUnicharset().unichar_to_id(ch) : INVALID_UNICHAR_ID;
           if (strcmp(ch, "l") != 0 &&
               getUnicharset().eq(word.unichar_id(x), "l")) {
             word.set_unichar_id(unichar_id, x);
             lower_word.set_unichar_id(unichar_id, x);
             capital_word.set_unichar_id(unichar_id, x);
           }
         }
       }
       if (unichar_id != INVALID_UNICHAR_ID) {
         /* Find lower case */
         if (!lower_done &&
             (getUnicharset().get_islower(unichar_id) ||
              (getUnicharset().get_isupper(unichar_id) && x == 0))) {
           lower_word.set_unichar_id(unichar_id, x);
           lower_word.set_rating(lower_word.rating() -
             first_choice->rating() + blob_choice_it.data()->rating());
           if (blob_choice_it.data()->certainty() < lower_word.certainty()) {
             lower_word.set_certainty(blob_choice_it.data()->certainty());
           }
           lower_certainties[x] = blob_choice_it.data()->certainty();
           lower_done = TRUE;
         }
         /* Find upper case */
         if (!upper_done && getUnicharset().get_isupper(unichar_id)) {
           capital_word.set_unichar_id(unichar_id, x);
           capital_word.set_rating(capital_word.rating() -
             first_choice->rating() + blob_choice_it.data()->rating());
           if (blob_choice_it.data()->certainty() < capital_word.certainty()) {
             capital_word.set_certainty(blob_choice_it.data()->certainty());
           }
           upper_certainties[x] = blob_choice_it.data()->certainty();
           upper_done = TRUE;
         }
         if (!char_alpha) {
           const CHAR_FRAGMENT *fragment =
             getUnicharset().get_fragment(unichar_id);
           temp_id = !fragment ? unichar_id :
             getUnicharset().unichar_to_id(fragment->get_unichar());
           if (getUnicharset().get_isalpha(temp_id)) {
             char_alpha = TRUE;
           }
         }
         if (lower_done && upper_done)
           break;
       }
     }
     if (char_alpha && any_alpha != NULL)
       *any_alpha = TRUE;

     if (word.rating() > bestrate_pruning_factor * *rating_limit) {
       if (permute_debug)
         tprintf("\n***** Aborting high-cost word: %g > limit %g \n",
                 word.rating(), bestrate_pruning_factor * *rating_limit);
       return (NULL);
     }

     *prev_char = '\0';
     temp_id = word.unichar_id(word.length()-1);
     if (temp_id != INVALID_UNICHAR_ID) {
       strcpy(prev_char, getUnicharset().id_to_unichar(temp_id));
     }
   }

   if (word.rating() < raw_choice->rating()) {
     *raw_choice = word;
     LogNewChoice(*raw_choice, 1.0, certainties, true);
   }

   if (ngram_permuter_activated)
     return NULL;

   float rating = word.rating();
   adjust_non_word(&word, &adjust_factor);
   LogNewChoice(word, adjust_factor, certainties, false);

   float lower_rating = lower_word.rating();
   adjust_non_word(&lower_word, &adjust_factor);
   LogNewChoice(lower_word, adjust_factor, lower_certainties, false);

   float upper_rating = capital_word.rating();
   adjust_non_word(&capital_word, &adjust_factor);
   LogNewChoice(capital_word, adjust_factor, upper_certainties, false);

   WERD_CHOICE *best_choice = &word;
   *rating_limit = rating;
   if (lower_word.rating() < best_choice->rating()) {
     best_choice = &lower_word;
     *rating_limit = lower_rating;
   }
   if (capital_word.rating() < best_choice->rating()) {
     best_choice = &capital_word;
     *rating_limit = upper_rating;
   }
   return new WERD_CHOICE(*best_choice);
 }


 /**********************************************************************
  * choose_il1
  *
  * Choose between the candidate il1 chars.
  **********************************************************************/
 const char* Dict::choose_il1(const char *first_char,        //first choice
                              const char *second_char,       //second choice
                              const char *third_char,        //third choice
                              const char *prev_char,         //prev in word
                              const char *next_char,         //next in word
                              const char *next_next_char) {  //after next next in word
   inT32 type1;                   //1/I/l type of first choice
   inT32 type2;                   //1/I/l type of second choice
   inT32 type3;                   //1/I/l type of third choice

   int first_char_length = strlen(first_char);
   int prev_char_length = strlen(prev_char);
   int next_char_length = strlen(next_char);
   int next_next_char_length = strlen(next_next_char);

   if (*first_char == 'l' && *second_char != '\0') {
     if (*second_char == 'I'
         && (((prev_char_length != 0 &&
             getUnicharset().get_isupper (prev_char, prev_char_length)) &&
             (next_char_length == 0 ||
              !getUnicharset().get_islower (next_char, next_char_length)) &&
             (next_char_length == 0 ||
              !getUnicharset().get_isdigit (next_char, next_char_length))) ||
             ((next_char_length != 0 &&
              getUnicharset().get_isupper (next_char, next_char_length)) &&
             (prev_char_length == 0 ||
              !getUnicharset().get_islower (prev_char, prev_char_length)) &&
             (prev_char_length == 0 ||
              !getUnicharset().get_isdigit (prev_char, prev_char_length)))))
       first_char = second_char;  //override
     else if (*second_char == '1' || *third_char == '1') {
       if ((next_char_length != 0 &&
            getUnicharset().get_isdigit (next_char, next_char_length)) ||
           (prev_char_length != 0 &&
            getUnicharset().get_isdigit (prev_char, prev_char_length))
           || (*next_char == 'l' &&
           (next_next_char_length != 0 &&
            getUnicharset().get_isdigit (next_next_char,
                                         next_next_char_length)))) {
         first_char = "1";
         first_char_length = 1;
       }
       else if ((prev_char_length == 0 ||
                 !getUnicharset().get_islower (prev_char, prev_char_length)) &&
                ((next_char_length == 0 ||
                  !getUnicharset().get_islower (next_char, next_char_length)) ||
                 (*next_char == 's' &&
                 *next_next_char == 't'))) {
         if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0')
             && ((*next_char != '\'' && *next_char != '`')
                 || *prev_char != '\0')) {
           first_char = "1";
           first_char_length = 1;
         }
       }
     }
     if (*first_char == 'l' && *next_char != '\0' &&
         (prev_char_length == 0 ||
          !getUnicharset().get_isalpha (prev_char, prev_char_length))) {
       type1 = 2;

       if (*second_char == '1')
         type2 = 0;
       else if (*second_char == 'I')
         type2 = 1;
       else if (*second_char == 'l')
         type2 = 2;
       else
         type2 = type1;

       if (*third_char == '1')
         type3 = 0;
       else if (*third_char == 'I')
         type3 = 1;
       else if (*third_char == 'l')
         type3 = 2;
       else
         type3 = type1;

 #if 0
       if (bigram_counts[*next_char][type2] >
       bigram_counts[*next_char][type1]) {
         first_char = second_char;
         type1 = type2;
       }
       if (bigram_counts[*next_char][type3] >
       bigram_counts[*next_char][type1]) {
         first_char = third_char;
       }
 #endif
     }
   }
   return first_char;
 }

 //
 // Check all the DAWGs to see if this word is in any of them.
 //
 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
   const WERD_CHOICE *word_ptr = &word;
   WERD_CHOICE temp_word;
   if (hyphenated()) {
     copy_hyphen_info(&temp_word);
     temp_word += word;
     word_ptr = &temp_word;
   }
   if (word_ptr->length() == 0) return NO_PERM;
   // Allocate vectors for holding current and updated
   // active_dawgs and constraints and initialize them.
   DawgInfoVector *active_dawgs = new DawgInfoVector[2];
   DawgInfoVector *constraints = new DawgInfoVector[2];
   init_active_dawgs(&(active_dawgs[0]));
   init_constraints(&(constraints[0]));
   DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
                      &(active_dawgs[1]), &(constraints[1]), 0.0);
   int last_index = word_ptr->length() - 1;
   // Call leter_is_okay for each letter in the word.
   for (int i = hyphen_base_size(); i <= last_index; ++i) {
     if (!((this->*letter_is_okay_)(&dawg_args, i, word_ptr,
                                    i == last_index))) break;
     // Swap active_dawgs, constraints with the corresponding updated vector.
     if (dawg_args.updated_active_dawgs == &(active_dawgs[1])) {
       dawg_args.updated_active_dawgs = &(active_dawgs[0]);
       dawg_args.updated_constraints = &(constraints[0]);
       ++(dawg_args.active_dawgs);
       ++(dawg_args.constraints);
     } else {
       ++(dawg_args.updated_active_dawgs);
       ++(dawg_args.updated_constraints);
       dawg_args.active_dawgs = &(active_dawgs[0]);
       dawg_args.constraints = &(constraints[0]);
     }
   }
   delete[] active_dawgs;
   delete[] constraints;
   if (dawg_args.permuter == SYSTEM_DAWG_PERM ||
       dawg_args.permuter == DOC_DAWG_PERM ||
       dawg_args.permuter == USER_DAWG_PERM ||
       (numbers_ok && dawg_args.permuter == NUMBER_PERM)){
     return dawg_args.permuter;
   } else {
     return NO_PERM;
   }
 }

 //
 // Return true if the word contains a valid punctuation pattern.
 //
 // Note: Since the domains of punctuation symbols and symblos
 // used in numbers are not disjoint, a valid number might contain
 // an invalid punctuation pattern (e.g. .99).
 //
 bool Dict::valid_punctuation(const WERD_CHOICE &word) {
   if (word.length() == 0) return NO_PERM;
   int i;
   WERD_CHOICE new_word;
   int last_index = word.length() - 1;
   int new_len = 0;
   for (i = 0; i <= last_index; ++i) {
     UNICHAR_ID unichar_id = (word.unichar_id(i));
     if (getUnicharset().get_ispunctuation(unichar_id)) {
       new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
     } else if (!getUnicharset().get_isalpha(unichar_id) &&
                !getUnicharset().get_isdigit(unichar_id)) {
       return false;  // neither punc, nor alpha, nor digit
     } else if ((new_len = new_word.length()) == 0 ||
                new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
       new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
     }
   }
   for (i = 0; i < dawgs_.size(); ++i) {
     if (dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
         dawgs_[i]->word_in_dawg(new_word)) return true;
   }
   return false;
 }

 /**********************************************************************
  * fragment_state
  *
  * Given the current char choice and information about previously seen
  * fragments, determines whether adjacent character fragments are
  * present and whether they can be concatenated.
  *
  * The given prev_char_frag_info contains:
  *  -- fragment: if not NULL contains information about immediately
  *     preceeding fragmented character choice
  *  -- num_fragments: number of fragments that have been used so far
  *     to construct a character
  *  -- certainty: certainty of the current choice or minimum
  *     certainty of all fragments concatenated so far
  *  -- rating: rating of the current choice or sum of fragment
  *     ratings concatenated so far
  *
  * The output char_frag_info is filled in as follows:
  * -- character: is set to be NULL if the choice is a non-matching
  *    or non-ending fragment piece; is set to unichar of the given choice
  *    if it represents a regular character or a matching ending fragment
  * -- fragment,num_fragments,certainty,rating are set as described above
  *
  * Returns false if a non-matching fragment is discovered, true otherwise.
  **********************************************************************/
 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
                                float curr_rating, float curr_certainty,
                                const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                                const char *debug, int word_ending,
                                CHAR_FRAGMENT_INFO *char_frag_info) {
   const CHAR_FRAGMENT *this_fragment =
     getUnicharset().get_fragment(curr_unichar_id);
   const CHAR_FRAGMENT *prev_fragment =
     prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;

   // Print debug info for fragments.
   if (debug && (prev_fragment || this_fragment)) {
     cprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
             getUnicharset().debug_str(curr_unichar_id).string(),
             word_ending);
     if (prev_fragment) {
       cprintf("prev_fragment %s\n", prev_fragment->to_string().string());
     }
     if (this_fragment) {
       cprintf("this_fragment %s\n", this_fragment->to_string().string());
     }
   }

   char_frag_info->unichar_id = curr_unichar_id;
   char_frag_info->fragment = this_fragment;
   char_frag_info->rating = curr_rating;
   char_frag_info->certainty = curr_certainty;
   char_frag_info->num_fragments = 1;
   if (prev_fragment && !this_fragment) {
     if (debug) tprintf("Skip choice with incomplete fragment\n");
     return false;
   }
   if (this_fragment) {
     // We are dealing with a fragment.
     char_frag_info->unichar_id = INVALID_UNICHAR_ID;
     if (prev_fragment) {
       if (!this_fragment->is_continuation_of(prev_fragment)) {
         if (debug) tprintf("Non-matching fragment piece\n");
         return false;
       }
       if (this_fragment->is_ending()) {
         char_frag_info->unichar_id =
           getUnicharset().unichar_to_id(this_fragment->get_unichar());
         char_frag_info->fragment = NULL;
         if (debug) {
           tprintf("Built character %s from fragments\n",
                   getUnicharset().debug_str(
                       char_frag_info->unichar_id).string());
         }
       } else {
         if (debug) tprintf("Record fragment continuation\n");
         char_frag_info->fragment = this_fragment;
       }
       // Update certainty and rating.
       char_frag_info->rating =
         prev_char_frag_info->rating + curr_rating;
       char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
       char_frag_info->certainty =
         MIN(curr_certainty, prev_char_frag_info->certainty);
     } else {
       if (this_fragment->is_beginning()) {
         if (debug) cprintf("Record fragment beginning\n");
       } else {
         if (debug) {
           tprintf("Non-starting fragment piece with no prev_fragment\n");
         }
         return false;
       }
     }
   }
   if (word_ending && char_frag_info->fragment) {
     if (debug) tprintf("Word can not end with a fragment\n");
     return false;
   }
   return true;
 }
 /**********************************************************************
  * top_fragments_permute_and_select
  *
  * Creates a copy of character choices list that contain only fragments
  * and the best non-fragmented character choice.
  * Permutes character in this shortened list, builds characters from
  * fragments if possible and returns a better choice if found.
  **********************************************************************/
 WERD_CHOICE *Dict::top_fragments_permute_and_select(
     const BLOB_CHOICE_LIST_VECTOR &char_choices,
     float rating_limit) {
   if (char_choices.length() <= 1 ||
       char_choices.length() > MAX_PERM_LENGTH) {
     return NULL;
   }
   // See it would be possible to benefit from permuting fragments.
   int x;
   float min_rating = 0.0;
   BLOB_CHOICE_IT blob_choice_it;
   for (x = 0; x < char_choices.length(); ++x) {
     blob_choice_it.set_to_list(char_choices.get(x));
     if (blob_choice_it.data()) {
       min_rating += blob_choice_it.data()->rating();
     }
     if (min_rating >= rating_limit) {
       return NULL;
     }
   }
   if (fragments_debug > 1) {
     tprintf("A choice with fragment beats top choice\n");
     tprintf("Running fragment permuter...\n");
   }

   // Construct a modified choices list that contains (for each position):
   // the best choice, all fragments and at least one choice for
   // a non-fragmented character.
   BLOB_CHOICE_LIST_VECTOR frag_char_choices(char_choices.length());
   for (x = 0; x < char_choices.length(); ++x) {
     bool need_nonfrag_char = true;
     BLOB_CHOICE_LIST *frag_choices = new BLOB_CHOICE_LIST();
     BLOB_CHOICE_IT frag_choices_it;
     frag_choices_it.set_to_list(frag_choices);
     blob_choice_it.set_to_list(char_choices.get(x));
     for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
          blob_choice_it.forward()) {
       if (getUnicharset().get_fragment(blob_choice_it.data()->unichar_id())) {
         frag_choices_it.add_after_then_move(
             new BLOB_CHOICE(*(blob_choice_it.data())));
       } else if (need_nonfrag_char) {
         frag_choices_it.add_after_then_move(
             new BLOB_CHOICE(*(blob_choice_it.data())));
         need_nonfrag_char = false;
       }
     }
     frag_char_choices += frag_choices;
   }

   WERD_CHOICE *best_choice = new WERD_CHOICE();
   best_choice->make_bad();
   WERD_CHOICE word(MAX_PERM_LENGTH);
   word.set_permuter(TOP_CHOICE_PERM);
   float certainties[MAX_PERM_LENGTH];
   this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn;
   permute_choices((fragments_debug > 1) ? "fragments_debug" : NULL,
                   frag_char_choices, 0, NULL, &word, certainties,
                   &rating_limit, best_choice, NULL);

   frag_char_choices.delete_data_pointers();
   return best_choice;
 }

 /**********************************************************************
  * permute_choices
  *
  * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
  * with the given char_choice_index in char_choices.
  **********************************************************************/
 void Dict::permute_choices(
     const char *debug,
     const BLOB_CHOICE_LIST_VECTOR &char_choices,
     int char_choice_index,
     const CHAR_FRAGMENT_INFO *prev_char_frag_info,
     WERD_CHOICE *word,
     float certainties[],
     float *limit,
     WERD_CHOICE *best_choice,
     void *more_args) {
   if (debug) {
     tprintf("%s permute_choices: char_choice_index=%d"
             " limit=%4.2f rating=%4.2f, certainty=%4.2f word=%s\n",
             debug, char_choice_index, *limit, word->rating(),
             word->certainty(), word->debug_string(getUnicharset()).string());
   }
   if (char_choice_index < char_choices.length()) {
     BLOB_CHOICE_IT blob_choice_it;
     blob_choice_it.set_to_list(char_choices.get(char_choice_index));
     for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
          blob_choice_it.forward()) {
       append_choices(debug, char_choices, *(blob_choice_it.data()),
                      char_choice_index, prev_char_frag_info, word,
                      certainties, limit, best_choice, more_args);

     }
   }
 }

 /**********************************************************************
  * append_choices
  *
  * Check to see whether or not the next choice is worth appending to
  * the word being generated. If so then keep going deeper into the word.
  *
  * This function assumes that Dict::go_deeper_fxn_ is set.
  **********************************************************************/
 void Dict::append_choices(
     const char *debug,
     const BLOB_CHOICE_LIST_VECTOR &char_choices,
     const BLOB_CHOICE &blob_choice,
     int char_choice_index,
     const CHAR_FRAGMENT_INFO *prev_char_frag_info,
     WERD_CHOICE *word,
     float certainties[],
     float *limit,
     WERD_CHOICE *best_choice,
     void *more_args) {
   int word_ending =
     (char_choice_index == char_choices.length() - 1) ? true : false;

   // Deal with fragments.
   CHAR_FRAGMENT_INFO char_frag_info;
   if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
                            blob_choice.certainty(), prev_char_frag_info, debug,
                            word_ending, &char_frag_info)) {
     return;  // blob_choice must be an invalid fragment
   }
   // Search the next letter if this character is a fragment.
   if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
     permute_choices(debug, char_choices, char_choice_index + 1,
                     &char_frag_info, word, certainties, limit,
                     best_choice, more_args);
     return;
   }

   // Add the next unichar.
   float old_rating = word->rating();
   float old_certainty = word->certainty();
   uinT8 old_permuter = word->permuter();
   certainties[word->length()] = char_frag_info.certainty;
   word->append_unichar_id_space_allocated(
       char_frag_info.unichar_id, char_frag_info.num_fragments,
       char_frag_info.rating, char_frag_info.certainty);

   // Explore the next unichar.
   (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
                           &char_frag_info, word_ending, word, certainties,
                           limit, best_choice, more_args);

   // Remove the unichar we added to explore other choices in it's place.
   word->remove_last_unichar_id();
   word->set_rating(old_rating);
   word->set_certainty(old_certainty);
   word->set_permuter(old_permuter);
 }

 /**********************************************************************
  * go_deeper_top_fragments_fxn
  *
  * If the choice being composed so far could be better
  * than best_choice keep exploring choices.
  **********************************************************************/
 void Dict::go_deeper_top_fragments_fxn(
     const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
     int char_choice_index,
     const CHAR_FRAGMENT_INFO *prev_char_frag_info,
     bool word_ending, WERD_CHOICE *word, float certainties[],
     float *limit, WERD_CHOICE *best_choice, void *more_args) {
   if (word->rating() < *limit) {
     if (word_ending) {
       if (fragments_debug > 1) {
         tprintf("fragments_debug new choice = %s\n",
                 word->debug_string(getUnicharset()).string());
       }
       *limit = word->rating();

       float adjust_factor;
       adjust_non_word(word, &adjust_factor);
       LogNewChoice(*word, adjust_factor, certainties, false);

       if (word->rating() < best_choice->rating()) {
         *best_choice = *word;
       }
     } else {  // search the next letter
       permute_choices(debug, char_choices, char_choice_index + 1,
                       prev_char_frag_info, word, certainties, limit,
                       best_choice, more_args);
     }
   } else {
     if (fragments_debug > 1) {
       tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
               word->debug_string(getUnicharset()).string(),
               word->rating(), *limit);
     }
   }
 }

 }  // namespace tesseract