dict/permdawg.cpp - platform/external/tesseract - Git at Google

 /* -*-C-*-
  ********************************************************************************
  *
  * File:        permdawg.c  (Formerly permdawg.c)
  * Description:  Scale word choices by a dictionary
  * Author:       Mark Seaman, OCR Technology
  * Created:      Fri Oct 16 14:37:00 1987
  * Modified:     Tue Jul  9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
  * Language:     C
  * Package:      N/A
  * Status:       Reusable Software Component
  *
  * (c) Copyright 1987, Hewlett-Packard Company.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  *********************************************************************************/
 /*----------------------------------------------------------------------
               I n c l u d e s
 ----------------------------------------------------------------------*/

 #include "context.h"
 #include "conversion.h"
 #include "cutil.h"
 #include "dawg.h"
 #include "freelist.h"
 #include "globals.h"
 #include "ndminx.h"
 #include "permdawg.h"
 #include "permute.h"
 #include "stopper.h"
 #include "tordvars.h"
 #include "tprintf.h"
 #include "varable.h"

 #include <ctype.h>
 #include "dict.h"
 #include "image.h"

 /*----------------------------------------------------------------------
               V a r i a b l e s
 ----------------------------------------------------------------------*/
 BOOL_VAR(segment_dawg_debug, 0, "Debug mode for word segmentation");

 double_VAR(segment_penalty_dict_case_bad, OK_WERD,
            "Default score multiplier for word matches, which may have "
            "case issues (lower is better).");

 double_VAR(segment_penalty_dict_case_ok, GOOD_WERD,
            "Score multiplier for word matches that have good case "
            "(lower is better).");

 double_VAR(segment_penalty_dict_frequent_word, FREQ_WERD,
            "Score multiplier for word matches which have good case and are "
            "frequent in the given language (lower is better).");

 /*----------------------------------------------------------------------
               F u n c t i o n s
 ----------------------------------------------------------------------*/
 namespace tesseract {

 static const float kPermDawgRatingPad = 5.0;

 /**********************************************************************
  * adjust_word
  *
  * Assign an adjusted value to a string that is a word.	The value
  * that this word choice has is based on case and punctuation rules.
  **********************************************************************/
 void Dict::adjust_word(WERD_CHOICE *word,
                        float *certainty_array) {
   float adjust_factor;
   float new_rating = word->rating();

   if (segment_dawg_debug) {
     tprintf ("Word: %s %4.2f ",
             word->debug_string(getUnicharset()).string(), word->rating());
   }

   new_rating += RATING_PAD;
   if (Context::case_ok(*word, getUnicharset())) {
     if (freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
       word->set_permuter(FREQ_DAWG_PERM);
       new_rating *= segment_penalty_dict_frequent_word;
       adjust_factor = segment_penalty_dict_frequent_word;
       if (segment_dawg_debug)
         tprintf(", F, %4.2f ", (double)segment_penalty_dict_frequent_word);
     } else {
       new_rating *= segment_penalty_dict_case_ok;
       adjust_factor = segment_penalty_dict_case_ok;
       if (segment_dawg_debug)
         tprintf(", %4.2f ", (double)segment_penalty_dict_case_ok);
     }
   } else {
     new_rating *= segment_penalty_dict_case_bad;
     adjust_factor = segment_penalty_dict_case_bad;
     if (segment_dawg_debug) {
       tprintf(", C %4.2f ", (double)segment_penalty_dict_case_bad);
     }
   }
   new_rating -= RATING_PAD;
   word->set_rating(new_rating);

   LogNewChoice(*word, adjust_factor, certainty_array, false);

   if (segment_dawg_debug)
     tprintf(" --> %4.2f\n", new_rating);
 }

 /**********************************************************************
  * go_deeper_dawg_fxn
  *
  * If the choice being composed so far could be a dictionary word
  * keep exploring choices.
  **********************************************************************/
 void Dict::go_deeper_dawg_fxn(
     const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
                               int char_choice_index,
                               const CHAR_FRAGMENT_INFO *prev_char_frag_info,
     bool word_ending, WERD_CHOICE *word, float certainties[],
     float *limit, WERD_CHOICE *best_choice, void *void_more_args) {
   DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
   int word_index = word->length() - 1;

   // There are two modes for deciding whether to go deeper: regular dawg
   // permuter mode and the special ambigs mode. If *limit is <= 0.0 the
   // function switches to the ambigs mode (this is the case when
   // dawg_permute_and_select() function is called from NoDangerousAmbigs()) and
   // only searches for the first choice that has a rating better than *limit
   // (in this case ratings are fake, since the real ratings can not be < 0).
   // Modification of the hyphen state is turned off in the ambigs mode.
   // When in the regular dawg permuter mode, the function explores all the
   // possible words and chooses the one with the best rating. The letters with
   // ratings that are far worse than the ones seen so far are pruned out.
   bool ambigs_mode = (*limit <= 0.0);
   if (ambigs_mode) {
     if (best_choice->rating() < *limit) return;
   } else {
     // Prune bad subwords
     if (more_args->rating_array[word_index] == NO_RATING) {
       more_args->rating_array[word_index] = word->rating();
   } else {
       float permdawg_limit = more_args->rating_array[word_index] *
         more_args->rating_margin + kPermDawgRatingPad;
       if (permdawg_limit < word->rating()) {
       if (segment_dawg_debug) {
           tprintf("early pruned word rating=%4.2f,"
                   " permdawg_limit=%4.2f, word=%s\n", word->rating(),
                   permdawg_limit, word->debug_string(getUnicharset()).string());
       }
       return;
     }
   }
   }
   // Deal with hyphens
   if (word_ending && has_hyphen_end(*word) && !ambigs_mode) {
     if (segment_dawg_debug)
       tprintf("new hyphen choice = %s\n",
               word->debug_string(getUnicharset()).string());
     word->set_permuter(more_args->permuter);
     adjust_word(word, certainties);
     set_hyphen_word(*word, *(more_args->active_dawgs),
                     *(more_args->constraints));
     update_best_choice(*word, best_choice);
   } else {  // Look up char in DAWG
     // TODO(daria): update the rest of the code that specifies alternative
     // letter_is_okay_ functions (e.g. TessCharNgram class) to work with
     // multi-byte unichars and/or unichar ids.

     // If the current unichar is an ngram first try calling
     // letter_is_okay() for each unigram it contains separately.
     UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
     bool checked_unigrams = false;
     if (getUnicharset().get_isngram(orig_uch_id)) {
       if (segment_dawg_debug) {
         tprintf("checking unigrams in an ngram %s\n",
                 getUnicharset().debug_str(orig_uch_id).string());
       }
       int orig_num_fragments = word->fragment_length(word_index);
       int num_unigrams = 0;
       word->remove_last_unichar_id();
       const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
       const char *ngram_str_end = ngram_str + strlen(ngram_str);
       const char *ngram_ptr = ngram_str;
       bool unigrams_ok = true;
       // Construct DawgArgs that reflect the current state.
       DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs);
       DawgInfoVector unigram_constraints = *(more_args->constraints);
       DawgInfoVector unigram_updated_active_dawgs;
       DawgInfoVector unigram_updated_constraints;
       DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_constraints,
                                  &unigram_updated_active_dawgs,
                                  &unigram_updated_constraints, 0.0);
       unigram_dawg_args.permuter = more_args->permuter;
       // Check unigrams in the ngram with letter_is_okay().
       while (unigrams_ok && ngram_ptr < ngram_str_end) {
         int step = getUnicharset().step(ngram_ptr);
         UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID :
             getUnicharset().unichar_to_id(ngram_ptr, step);
         ngram_ptr += step;
         ++num_unigrams;
         word->append_unichar_id(uch_id, 1, 0.0, 0.0);
         unigrams_ok = unigrams_ok && (this->*letter_is_okay_)(
             &unigram_dawg_args, word_index+num_unigrams-1, word,
             word_ending && (ngram_ptr == ngram_str_end));
         (*unigram_dawg_args.active_dawgs) =
           *(unigram_dawg_args.updated_active_dawgs);
         (*unigram_dawg_args.constraints) =
           *(unigram_dawg_args.updated_constraints);
         if (segment_dawg_debug) {
           tprintf("unigram %s is %s\n",
                   getUnicharset().debug_str(uch_id).string(),
                   unigrams_ok ? "OK" : "not OK");
         }
       }
       // Restore the word and copy the updated dawg state if needed.
       while (num_unigrams-- > 0) word->remove_last_unichar_id();
       word->append_unichar_id_space_allocated(
           orig_uch_id, orig_num_fragments, 0.0, 0.0);
       if (unigrams_ok) {
         checked_unigrams = true;
         more_args->permuter = unigram_dawg_args.permuter;
         *(more_args->updated_active_dawgs) =
           *(unigram_dawg_args.updated_active_dawgs);
         *(more_args->updated_constraints) =
           *(unigram_dawg_args.updated_constraints);
       }
     }

     // Check which dawgs from dawgs_ vector contain the word
     // up to and including the current unichar.
     if (checked_unigrams ||
         (this->*letter_is_okay_)(more_args, word_index, word, word_ending)) {
       // Add a new word choice
       if (word_ending) {
         if (segment_dawg_debug) {
           tprintf("found word = %s\n",
                   word->debug_string(getUnicharset()).string());
         }
         WERD_CHOICE *adjusted_word = word;
         WERD_CHOICE hyphen_tail_word;
         if (!ambigs_mode && hyphen_base_size() > 0) {
           hyphen_tail_word = *word;
           remove_hyphen_head(&hyphen_tail_word);
           adjusted_word = &hyphen_tail_word;
         }
         adjusted_word->set_permuter(more_args->permuter);
         if (!ambigs_mode) {
           adjust_word(adjusted_word, &certainties[hyphen_base_size()]);
         }
         update_best_choice(*adjusted_word, best_choice);
       } else {  // search the next letter
         // Make updated_* point to the next entries in the DawgInfoVector
         // arrays (that were originally created in dawg_permute_and_select)
         ++(more_args->updated_active_dawgs);
         ++(more_args->updated_constraints);
         // Make active_dawgs and constraints point to the updated ones.
         ++(more_args->active_dawgs);
         ++(more_args->constraints);
         permute_choices(debug, char_choices, char_choice_index + 1,
                         prev_char_frag_info, word, certainties, limit,
                         best_choice, more_args);
         // Restore previous state to explore another letter in this position.
         --(more_args->updated_active_dawgs);
         --(more_args->updated_constraints);
         --(more_args->active_dawgs);
         --(more_args->constraints);
       }
     } else {
       if (segment_dawg_debug) {
         tprintf("last unichar not OK at index %d in %s\n",
                 word_index, word->debug_string(getUnicharset()).string());
       }
     }
   }
 }

 /**********************************************************************
  * dawg_permute_and_select
  *
  * Recursively explore all the possible character combinations in
  * the given char_choices. Use go_deeper_dawg_fxn() to search all the
  * dawgs in the dawgs_ vector in parallel and discard invalid words.
  *
  * Allocate and return a WERD_CHOICE with the best valid word found.
  * **********************************************************************/
 WERD_CHOICE *Dict::dawg_permute_and_select(
     const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
   WERD_CHOICE *best_choice = new WERD_CHOICE();
   best_choice->make_bad();
   best_choice->set_rating(rating_limit);
   if (char_choices.length() == 0) return best_choice;
   DawgInfoVector *active_dawgs = new DawgInfoVector[char_choices.length() + 1];
   DawgInfoVector *constraints =  new DawgInfoVector[char_choices.length() + 1];
   init_active_dawgs(&(active_dawgs[0]));
   init_constraints(&(constraints[0]));
   DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
                      &(active_dawgs[1]), &(constraints[1]),
                      (segment_penalty_dict_case_bad /
                       segment_penalty_dict_case_ok));
   WERD_CHOICE word(MAX_WERD_LENGTH);
   copy_hyphen_info(&word);
   // Discard rating and certainty of the hyphen base (if any).
   word.set_rating(0.0);
   word.set_certainty(0.0);
   if (word.length() + char_choices.length() > MAX_WERD_LENGTH) {
     return best_choice;  // the word is too long to permute
   }
   float certainties[MAX_WERD_LENGTH];
   this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
   permute_choices(segment_dawg_debug ? "segment_dawg_debug" : NULL,
                   char_choices, 0, NULL, &word, certainties,
                   &rating_limit, best_choice, &dawg_args);
   delete[] active_dawgs;
   delete[] constraints;
   return best_choice;
 }

 // Fill the given active_dawgs vector with dawgs that could contain the
 // beginning of the word. If hyphenated() returns true, copy the entries
 // from hyphen_active_dawgs_ instead.
 void Dict::init_active_dawgs(DawgInfoVector *active_dawgs) {
   int i;
   if (hyphenated()) {
     *active_dawgs = hyphen_active_dawgs_;
     if (dawg_debug_level >= 3) {
       for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
         tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
                 hyphen_active_dawgs_[i].dawg_index,
                 hyphen_active_dawgs_[i].ref);
   }
   }
   } else {
     for (i = 0; i < dawgs_.length(); ++i) {
       if (kBeginningDawgsType[(dawgs_[i])->type()]) {
         *active_dawgs += DawgInfo(i, NO_EDGE);
         if (dawg_debug_level >= 3) {
           tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
     }
   }
 }
   }
   }

 // If hyphenated() returns true, copy the entries from hyphen_constraints_
 // into the given constraints vector.
 void Dict::init_constraints(DawgInfoVector *constraints) {
   if (hyphenated()) {
     *constraints = hyphen_constraints_;
     if (dawg_debug_level >= 3) {
       for (int i = 0; i < hyphen_constraints_.size(); ++i) {
         tprintf("Adding hyphen constraint [%d, " REFFORMAT "]\n",
                 hyphen_constraints_[i].dawg_index,
                 hyphen_constraints_[i].ref);
   }
 }
 }
 }

 }  // namespace tesseract
	/* --C--
	********************************************************************************
	*
	* File: permdawg.c (Formerly permdawg.c)
	* Description: Scale word choices by a dictionary
	* Author: Mark Seaman, OCR Technology
	* Created: Fri Oct 16 14:37:00 1987
	* Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
	* Language: C
	* Package: N/A
	* Status: Reusable Software Component
	*
	* (c) Copyright 1987, Hewlett-Packard Company.
	** Licensed under the Apache License, Version 2.0 (the "License");
	** you may not use this file except in compliance with the License.
	** You may obtain a copy of the License at
	** http://www.apache.org/licenses/LICENSE-2.0
	** Unless required by applicable law or agreed to in writing, software
	** distributed under the License is distributed on an "AS IS" BASIS,
	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	** See the License for the specific language governing permissions and
	** limitations under the License.
	*
	*********************************************************************************/
	/*----------------------------------------------------------------------
	I n c l u d e s
	----------------------------------------------------------------------*/

	#include "context.h"
	#include "conversion.h"
	#include "cutil.h"
	#include "dawg.h"
	#include "freelist.h"
	#include "globals.h"
	#include "ndminx.h"
	#include "permdawg.h"
	#include "permute.h"
	#include "stopper.h"
	#include "tordvars.h"
	#include "tprintf.h"
	#include "varable.h"

	#include <ctype.h>
	#include "dict.h"
	#include "image.h"

	/*----------------------------------------------------------------------
	V a r i a b l e s
	----------------------------------------------------------------------*/
	BOOL_VAR(segment_dawg_debug, 0, "Debug mode for word segmentation");

	double_VAR(segment_penalty_dict_case_bad, OK_WERD,
	"Default score multiplier for word matches, which may have "
	"case issues (lower is better).");

	double_VAR(segment_penalty_dict_case_ok, GOOD_WERD,
	"Score multiplier for word matches that have good case "
	"(lower is better).");

	double_VAR(segment_penalty_dict_frequent_word, FREQ_WERD,
	"Score multiplier for word matches which have good case and are "
	"frequent in the given language (lower is better).");

	/*----------------------------------------------------------------------
	F u n c t i o n s
	----------------------------------------------------------------------*/
	namespace tesseract {

	static const float kPermDawgRatingPad = 5.0;

	/**********************************************************************
	* adjust_word
	*
	* Assign an adjusted value to a string that is a word. The value
	* that this word choice has is based on case and punctuation rules.
	**********************************************************************/
	void Dict::adjust_word(WERD_CHOICE *word,
	float *certainty_array) {
	float adjust_factor;
	float new_rating = word->rating();

	if (segment_dawg_debug) {
	tprintf ("Word: %s %4.2f ",
	word->debug_string(getUnicharset()).string(), word->rating());
	}

	new_rating += RATING_PAD;
	if (Context::case_ok(*word, getUnicharset())) {
	if (freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
	word->set_permuter(FREQ_DAWG_PERM);
	new_rating *= segment_penalty_dict_frequent_word;
	adjust_factor = segment_penalty_dict_frequent_word;
	if (segment_dawg_debug)
	tprintf(", F, %4.2f ", (double)segment_penalty_dict_frequent_word);
	} else {
	new_rating *= segment_penalty_dict_case_ok;
	adjust_factor = segment_penalty_dict_case_ok;
	if (segment_dawg_debug)
	tprintf(", %4.2f ", (double)segment_penalty_dict_case_ok);
	}
	} else {
	new_rating *= segment_penalty_dict_case_bad;
	adjust_factor = segment_penalty_dict_case_bad;
	if (segment_dawg_debug) {
	tprintf(", C %4.2f ", (double)segment_penalty_dict_case_bad);
	}
	}
	new_rating -= RATING_PAD;
	word->set_rating(new_rating);

	LogNewChoice(*word, adjust_factor, certainty_array, false);

	if (segment_dawg_debug)
	tprintf(" --> %4.2f\n", new_rating);
	}

	/**********************************************************************
	* go_deeper_dawg_fxn
	*
	* If the choice being composed so far could be a dictionary word
	* keep exploring choices.
	**********************************************************************/
	void Dict::go_deeper_dawg_fxn(
	const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
	int char_choice_index,
	const CHAR_FRAGMENT_INFO *prev_char_frag_info,
	bool word_ending, WERD_CHOICE *word, float certainties[],
	float limit, WERD_CHOICE best_choice, void *void_more_args) {
	DawgArgs more_args = reinterpret_cast<DawgArgs>(void_more_args);
	int word_index = word->length() - 1;

	// There are two modes for deciding whether to go deeper: regular dawg
	// permuter mode and the special ambigs mode. If *limit is <= 0.0 the
	// function switches to the ambigs mode (this is the case when
	// dawg_permute_and_select() function is called from NoDangerousAmbigs()) and
	// only searches for the first choice that has a rating better than *limit
	// (in this case ratings are fake, since the real ratings can not be < 0).
	// Modification of the hyphen state is turned off in the ambigs mode.
	// When in the regular dawg permuter mode, the function explores all the
	// possible words and chooses the one with the best rating. The letters with
	// ratings that are far worse than the ones seen so far are pruned out.
	bool ambigs_mode = (*limit <= 0.0);
	if (ambigs_mode) {
	if (best_choice->rating() < *limit) return;
	} else {
	// Prune bad subwords
	if (more_args->rating_array[word_index] == NO_RATING) {
	more_args->rating_array[word_index] = word->rating();
	} else {
	float permdawg_limit = more_args->rating_array[word_index] *
	more_args->rating_margin + kPermDawgRatingPad;
	if (permdawg_limit < word->rating()) {
	if (segment_dawg_debug) {
	tprintf("early pruned word rating=%4.2f,"
	" permdawg_limit=%4.2f, word=%s\n", word->rating(),
	permdawg_limit, word->debug_string(getUnicharset()).string());
	}
	return;
	}
	}
	}
	// Deal with hyphens
	if (word_ending && has_hyphen_end(*word) && !ambigs_mode) {
	if (segment_dawg_debug)
	tprintf("new hyphen choice = %s\n",
	word->debug_string(getUnicharset()).string());
	word->set_permuter(more_args->permuter);
	adjust_word(word, certainties);
	set_hyphen_word(word, (more_args->active_dawgs),
	*(more_args->constraints));
	update_best_choice(*word, best_choice);
	} else { // Look up char in DAWG
	// TODO(daria): update the rest of the code that specifies alternative
	// letter_is_okay_ functions (e.g. TessCharNgram class) to work with
	// multi-byte unichars and/or unichar ids.

	// If the current unichar is an ngram first try calling
	// letter_is_okay() for each unigram it contains separately.
	UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
	bool checked_unigrams = false;
	if (getUnicharset().get_isngram(orig_uch_id)) {
	if (segment_dawg_debug) {
	tprintf("checking unigrams in an ngram %s\n",
	getUnicharset().debug_str(orig_uch_id).string());
	}
	int orig_num_fragments = word->fragment_length(word_index);
	int num_unigrams = 0;
	word->remove_last_unichar_id();
	const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
	const char *ngram_str_end = ngram_str + strlen(ngram_str);
	const char *ngram_ptr = ngram_str;
	bool unigrams_ok = true;
	// Construct DawgArgs that reflect the current state.
	DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs);
	DawgInfoVector unigram_constraints = *(more_args->constraints);
	DawgInfoVector unigram_updated_active_dawgs;
	DawgInfoVector unigram_updated_constraints;
	DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_constraints,
	&unigram_updated_active_dawgs,
	&unigram_updated_constraints, 0.0);
	unigram_dawg_args.permuter = more_args->permuter;
	// Check unigrams in the ngram with letter_is_okay().
	while (unigrams_ok && ngram_ptr < ngram_str_end) {
	int step = getUnicharset().step(ngram_ptr);
	UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID :
	getUnicharset().unichar_to_id(ngram_ptr, step);
	ngram_ptr += step;
	++num_unigrams;
	word->append_unichar_id(uch_id, 1, 0.0, 0.0);
	unigrams_ok = unigrams_ok && (this->*letter_is_okay_)(
	&unigram_dawg_args, word_index+num_unigrams-1, word,
	word_ending && (ngram_ptr == ngram_str_end));
	(*unigram_dawg_args.active_dawgs) =
	*(unigram_dawg_args.updated_active_dawgs);
	(*unigram_dawg_args.constraints) =
	*(unigram_dawg_args.updated_constraints);
	if (segment_dawg_debug) {
	tprintf("unigram %s is %s\n",
	getUnicharset().debug_str(uch_id).string(),
	unigrams_ok ? "OK" : "not OK");
	}
	}
	// Restore the word and copy the updated dawg state if needed.
	while (num_unigrams-- > 0) word->remove_last_unichar_id();
	word->append_unichar_id_space_allocated(
	orig_uch_id, orig_num_fragments, 0.0, 0.0);
	if (unigrams_ok) {
	checked_unigrams = true;
	more_args->permuter = unigram_dawg_args.permuter;
	*(more_args->updated_active_dawgs) =
	*(unigram_dawg_args.updated_active_dawgs);
	*(more_args->updated_constraints) =
	*(unigram_dawg_args.updated_constraints);
	}
	}

	// Check which dawgs from dawgs_ vector contain the word
	// up to and including the current unichar.
	if (checked_unigrams \|\|
	(this->*letter_is_okay_)(more_args, word_index, word, word_ending)) {
	// Add a new word choice
	if (word_ending) {
	if (segment_dawg_debug) {
	tprintf("found word = %s\n",
	word->debug_string(getUnicharset()).string());
	}
	WERD_CHOICE *adjusted_word = word;
	WERD_CHOICE hyphen_tail_word;
	if (!ambigs_mode && hyphen_base_size() > 0) {
	hyphen_tail_word = *word;
	remove_hyphen_head(&hyphen_tail_word);
	adjusted_word = &hyphen_tail_word;
	}
	adjusted_word->set_permuter(more_args->permuter);
	if (!ambigs_mode) {
	adjust_word(adjusted_word, &certainties[hyphen_base_size()]);
	}
	update_best_choice(*adjusted_word, best_choice);
	} else { // search the next letter
	// Make updated_* point to the next entries in the DawgInfoVector
	// arrays (that were originally created in dawg_permute_and_select)
	++(more_args->updated_active_dawgs);
	++(more_args->updated_constraints);
	// Make active_dawgs and constraints point to the updated ones.
	++(more_args->active_dawgs);
	++(more_args->constraints);
	permute_choices(debug, char_choices, char_choice_index + 1,
	prev_char_frag_info, word, certainties, limit,
	best_choice, more_args);
	// Restore previous state to explore another letter in this position.
	--(more_args->updated_active_dawgs);
	--(more_args->updated_constraints);
	--(more_args->active_dawgs);
	--(more_args->constraints);
	}
	} else {
	if (segment_dawg_debug) {
	tprintf("last unichar not OK at index %d in %s\n",
	word_index, word->debug_string(getUnicharset()).string());
	}
	}
	}
	}

	/**********************************************************************
	* dawg_permute_and_select
	*
	* Recursively explore all the possible character combinations in
	* the given char_choices. Use go_deeper_dawg_fxn() to search all the
	* dawgs in the dawgs_ vector in parallel and discard invalid words.
	*
	* Allocate and return a WERD_CHOICE with the best valid word found.
	* **********************************************************************/
	WERD_CHOICE *Dict::dawg_permute_and_select(
	const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
	WERD_CHOICE *best_choice = new WERD_CHOICE();
	best_choice->make_bad();
	best_choice->set_rating(rating_limit);
	if (char_choices.length() == 0) return best_choice;
	DawgInfoVector *active_dawgs = new DawgInfoVector[char_choices.length() + 1];
	DawgInfoVector *constraints = new DawgInfoVector[char_choices.length() + 1];
	init_active_dawgs(&(active_dawgs[0]));
	init_constraints(&(constraints[0]));
	DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
	&(active_dawgs[1]), &(constraints[1]),
	(segment_penalty_dict_case_bad /
	segment_penalty_dict_case_ok));
	WERD_CHOICE word(MAX_WERD_LENGTH);
	copy_hyphen_info(&word);
	// Discard rating and certainty of the hyphen base (if any).
	word.set_rating(0.0);
	word.set_certainty(0.0);
	if (word.length() + char_choices.length() > MAX_WERD_LENGTH) {
	return best_choice; // the word is too long to permute
	}
	float certainties[MAX_WERD_LENGTH];
	this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
	permute_choices(segment_dawg_debug ? "segment_dawg_debug" : NULL,
	char_choices, 0, NULL, &word, certainties,
	&rating_limit, best_choice, &dawg_args);
	delete[] active_dawgs;
	delete[] constraints;
	return best_choice;
	}

	// Fill the given active_dawgs vector with dawgs that could contain the
	// beginning of the word. If hyphenated() returns true, copy the entries
	// from hyphen_active_dawgs_ instead.
	void Dict::init_active_dawgs(DawgInfoVector *active_dawgs) {
	int i;
	if (hyphenated()) {
	*active_dawgs = hyphen_active_dawgs_;
	if (dawg_debug_level >= 3) {
	for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
	tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
	hyphen_active_dawgs_[i].dawg_index,
	hyphen_active_dawgs_[i].ref);
	}
	}
	} else {
	for (i = 0; i < dawgs_.length(); ++i) {
	if (kBeginningDawgsType[(dawgs_[i])->type()]) {
	*active_dawgs += DawgInfo(i, NO_EDGE);
	if (dawg_debug_level >= 3) {
	tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
	}
	}
	}
	}
	}

	// If hyphenated() returns true, copy the entries from hyphen_constraints_
	// into the given constraints vector.
	void Dict::init_constraints(DawgInfoVector *constraints) {
	if (hyphenated()) {
	*constraints = hyphen_constraints_;
	if (dawg_debug_level >= 3) {
	for (int i = 0; i < hyphen_constraints_.size(); ++i) {
	tprintf("Adding hyphen constraint [%d, " REFFORMAT "]\n",
	hyphen_constraints_[i].dawg_index,
	hyphen_constraints_[i].ref);
	}
	}
	}
	}

	} // namespace tesseract