dict/stopper.cpp - platform/external/tesseract - Git at Google

 /******************************************************************************
  **	Filename:    stopper.c
  **	Purpose:     Stopping criteria for word classifier.
  **	Author:      Dan Johnson
  **	History:     Mon Apr 29 14:56:49 1991, DSJ, Created.
  **
  **	(c) Copyright Hewlett-Packard Company, 1988.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  ******************************************************************************/
 /**----------------------------------------------------------------------------
           Include Files and Type Defines
 ----------------------------------------------------------------------------**/
 #include "stopper.h"
 #include "emalloc.h"
 #include "matchdefs.h"
 #include "debug.h"
 #include "callcpp.h"
 #include "permute.h"
 #include "context.h"
 #include "permnum.h"
 #include "danerror.h"
 #include "const.h"
 #include "freelist.h"
 #include "efio.h"
 #include "globals.h"
 #include "scanutils.h"
 #include "unichar.h"
 #include "varable.h"
 #include "dict.h"
 #include "image.h"
 #include "ccutil.h"
 #include "ratngs.h"

 #include <stdio.h>
 #include <string.h>
 #include <ctype.h>
 #include <math.h>
 #ifdef __UNIX__
 #include <assert.h>
 #endif

 /* these are kludges - add appropriate .h file later */
 /* from adaptmatch.cpp */
 /* Used by FindClassifierErrors() to convert a chunk certainty into a
    match rating (rating = certainty / -certainty_scale). */
 double_VAR(certainty_scale, 20.0, "Certainty scaling factor");

 /* Sizes the scratch word buffer in NoDangerousAmbig() (in unichars). */
 #define MAX_WERD_SIZE   100
 /* Sizes the string buffers inside AMBIG_SPEC (in unichars per side). */
 #define MAX_AMBIG_SIZE    3
 /* Default value for the DangerousAmbigs file-name variable. */
 #define DANGEROUS_AMBIGS  "DangAmbigs"

 /* One LIST of ambiguity specs; the AmbigFor table is an array of these
    indexed by unichar id (see NoDangerousAmbig / EndDangerousAmbigs). */
 typedef LIST AMBIG_TABLE;

 /* A viable choice together with per-chunk views of its data.
    FindClassifierErrors() indexes ChunkCertainty/ChunkClass by chunk
    number; the arrays are filled in by ExpandChoice() (defined outside
    this view). */
 typedef struct
 {
   VIABLE_CHOICE Choice;                  /* the choice being expanded */
   float ChunkCertainty[MAX_NUM_CHUNKS];  /* certainty per chunk */
   UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS]; /* class (unichar id) per chunk */
 } EXPANDED_CHOICE;

 /* One entry of the dangerous-ambiguity table.
    As consumed by AmbigsFound(): `ambig` holds a test string, a single
    space, then the replacement string; `lengths` holds the byte length of
    each unichar in `ambig` (the separating space included).
    NOTE(review): the exact producer of this layout (FillAmbigTable) is not
    visible here - confirm against it. */
 typedef struct
 {
   char ambig[2 * (UNICHAR_LEN * MAX_AMBIG_SIZE) + 2];
   char lengths[2 * (MAX_AMBIG_SIZE) + 2];
 } AMBIG_SPEC;

 /**----------------------------------------------------------------------------
           Macros
 ----------------------------------------------------------------------------**/
 /* Accessors for the first (best) entry of a list of VIABLE_CHOICEs.
    Each one dereferences first_node(Choices), so the list is presumably
    required to be non-empty - callers in this file check for NIL first. */
 #define BestCertainty(Choices)  (((VIABLE_CHOICE) first_node (Choices))->Certainty)
 #define BestRating(Choices) (((VIABLE_CHOICE) first_node (Choices))->Rating)
 #define BestFactor(Choices) (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)

 /* Certainty margin used by LogNewWordChoice(): the difference between two
    adjust factors (F2 - F1) scaled by stopper_ambiguity_threshold_gain,
    minus stopper_ambiguity_threshold_offset. */
 #define AmbigThreshold(F1,F2)	(((F2) - (F1)) * stopper_ambiguity_threshold_gain - \
 				stopper_ambiguity_threshold_offset)

 /*---------------------------------------------------------------------------
           Private Function Prototypes
 ----------------------------------------------------------------------------*/
 /* Adds one chunk to the character of Choice that contains chunk Blob. */
 void AddNewChunk(VIABLE_CHOICE Choice, int Blob);

 /* List comparator over VIABLE_CHOICEs (passed to s_adjoin); takes void*
    because of the generic LIST callback signature. */
 int CmpChoiceRatings(void *arg1,   //VIABLE_CHOICE         Choice1,
                      void *arg2);  //VIABLE_CHOICE         Choice2);

 /* Fills ExpandedChoice with the per-chunk data for Choice. */
 void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice);

 /* List predicate passed to delete_d() by FilterWordChoices(); item2 is
    the expanded best choice the candidate item1 is compared against. */
 int FreeBadChoice(void *item1,   //VIABLE_CHOICE                 Choice,
                   void *item2);  //EXPANDED_CHOICE                       *BestChoice);

 /* Last acceptance check used by Dict::AcceptableChoice(). */
 int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
                        const WERD_CHOICE &BestChoice);

 /**----------------------------------------------------------------------------
         Global Data Definitions and Declarations
 ----------------------------------------------------------------------------**/
 /* Name of the file containing potentially dangerous ambiguities;
    registered as a settable variable in InitStopperVars(). */
 static const char *DangerousAmbigs = DANGEROUS_AMBIGS;

 /* Word (and its per-unichar byte lengths) for which stopper debug
    information should be printed by DebugWordChoices(). */
 static char *WordToDebug = NULL;
 static char *WordToDebug_lengths = NULL;

 /* flag used to disable accumulation of word choices during compound word
   permutation; LogNewRawChoice/LogNewWordChoice do nothing when FALSE */
 BOOL8 KeepWordChoices = TRUE;

 /* additional certainty padding allowed before a word is rejected;
    set by SettupStopperPass1()/SettupStopperPass2() */
 static FLOAT32 RejectOffset = 0.0;

 /* structures to keep track of viable word choices */
 static VIABLE_CHOICE BestRawChoice = NULL;    /* best raw choice so far */
 static LIST BestChoices = NIL;                /* list of VIABLE_CHOICEs, best first */
 static PIECES_STATE CurrentSegmentation;      /* zero-terminated blob widths */

 /* Base certainty threshold used by AcceptableChoice/AcceptableResult. */
 double_VAR(stopper_nondict_certainty_base, -2.50,
            "Certainty threshold for non-dict words");

 /* Value copied into RejectOffset for the second pass. */
 double_VAR(stopper_phase2_certainty_rejection_offset, 1.0,
            "Reject certainty offset");

 /* Alpha-run length subtracted before the per-char adjustment applies. */
 INT_VAR(stopper_smallword_size, 2,
         "Size of dict word to be treated as non-dict word");

 /* Added to the threshold once per dict char beyond the small-word size. */
 double_VAR(stopper_certainty_per_char, -0.50,
            "Certainty to add for each dict char above small word size.");

 /* NOTE(review): not referenced in the visible part of this file;
    presumably consumed by UniformCertainties() - confirm. */
 double_VAR(stopper_allowable_character_badness, 3.0,
            "Max certaintly variation allowed in a word (in sigma)");

 /* >= 1 enables stopper/rejecter traces, >= 2 also logs new word choices. */
 INT_VAR(stopper_debug_level, 0,
         "Stopper debug level");

 /* Gain and offset used by the AmbigThreshold() macro. */
 double_VAR(stopper_ambiguity_threshold_gain, 8.0,
            "Gain factor for ambiguity threshold");

 double_VAR(stopper_ambiguity_threshold_offset, 1.5,
            "Certainty offset for ambiguity threshold");

 /* NOTE(review): declared here but not referenced in the visible part of
    this file. */
 extern int first_pass;
 /* Length limit applied to BestChoices by LogNewWordChoice(). */
 INT_VAR (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");

 /**----------------------------------------------------------------------------
               Public Code
 ----------------------------------------------------------------------------**/
 /*---------------------------------------------------------------------------*/
 namespace tesseract {
 int Dict::AcceptableChoice(const BLOB_CHOICE_LIST_VECTOR &Choices,
                            const WERD_CHOICE &BestChoice,
                            const WERD_CHOICE &RawChoice,
                            DANGERR *fixpt,
                            ACCEPTABLE_CHOICE_CALLER caller) {
 /*
  **	Parameters:
  **		Choices		choices for current segmentation
  **		BestChoice	best choice for current segmentation
  **		RawChoice	best raw choice for current segmentation
  **	Globals:
  **		stopper_nondict_certainty_base	certainty for a non-dict word
  **		stopper_smallword_size		size of word to be treated as non-word
  **		stopper_certainty_per_char	certainty to add for each dict char
  **	Operation: Return TRUE if the results from this segmentation are
  **		good enough to stop.  Otherwise return FALSE.
  **	Return: TRUE or FALSE.
  **	Exceptions: none
  **	History: Mon Apr 29 14:57:32 1991, DSJ, Created.
  */
   float CertaintyThreshold = stopper_nondict_certainty_base;
   int WordSize;

   if (fixpt != NULL)
     fixpt->index = -1;
   if (BestChoice.length() == 0)
     return (FALSE);
   if (caller == CHOPPER_CALLER && BestChoice.fragment_mark()) {
     if (stopper_debug_level >= 1) {
       cprintf("AcceptableChoice(): a choice with fragments beats BestChoice");
     }
     return false;
   }

   // TODO(daria): remove this conversion once dawg (valid_word)
   // is switched to use unichar ids, valid_number is deprecated
   // and DanAmbigs are fixed to work with unichar ids.
   STRING word_str = BestChoice.unichar_string();
   STRING word_lengths_str = BestChoice.unichar_lengths();

   if (stopper_debug_level >= 1)
     cprintf ("\nStopper:  %s (word=%c, case=%c, punct=%c)\n",
       word_str.string(),
       (valid_word(word_str.string()) ? 'y' : 'n'),
       (case_ok(word_str.string(), word_lengths_str.string()) ? 'y' : 'n'),
       ((punctuation_ok(word_str.string(),
                        word_lengths_str.string()) != -1) ? 'y' : 'n'));

   if (valid_word(word_str.string()) &&
       case_ok(word_str.string(), word_lengths_str.string()) &&
       punctuation_ok(word_str.string(), word_lengths_str.string()) != -1) {
     WordSize = LengthOfShortestAlphaRun(BestChoice);
     WordSize -= stopper_smallword_size;
     if (WordSize < 0)
       WordSize = 0;
     CertaintyThreshold += WordSize * stopper_certainty_per_char;
   } else if (stopper_numbers_on &&
              valid_number(word_str.string(), word_lengths_str.string())) {
     CertaintyThreshold += stopper_numbers_on * stopper_certainty_per_char;
   }

   if (stopper_debug_level >= 1)
     cprintf ("Stopper:  Certainty = %4.1f, Threshold = %4.1f\n",
              BestChoice.certainty(), CertaintyThreshold);

   if (NoDangerousAmbig(word_str.string(), word_lengths_str.string(), fixpt) &&
       BestChoice.certainty() > CertaintyThreshold &&
       UniformCertainties(Choices, BestChoice)) {
     return (TRUE);
   } else {
     return (FALSE);
   }

 }                                /* AcceptableChoice */


 /*---------------------------------------------------------------------------*/
 int Dict::AcceptableResult(const WERD_CHOICE &BestChoice,
                            const WERD_CHOICE &RawChoice) {
 /*
  **	Parameters:
  **		BestChoice	best choice for current word
  **		RawChoice	best raw choice for current word (unused here)
  **	Globals:
  **		stopper_nondict_certainty_base	starting certainty threshold
  **		stopper_smallword_size		alpha-run length that earns no bonus
  **		stopper_certainty_per_char	threshold change per extra dict char
  **		BestChoices			good choices found for this word
  **		RejectOffset			slack subtracted from the threshold
  **	Operation: Decide whether the best choice for the current word can
  **		be trusted.  Empty, ambiguous (more than one entry in
  **		BestChoices) or fragment-marked choices are refused; any
  **		other choice is accepted when its certainty is above the
  **		(possibly relaxed) threshold.
  **	Return: TRUE when accepted, FALSE when the word is questionable.
  **	Exceptions: none
  **	History: Thu May  9 14:05:05 1991, DSJ, Created.
  */
   float threshold = stopper_nondict_certainty_base - RejectOffset;

   // TODO(daria): remove this conversion once dawg (valid_word)
   // is switched to use unichar ids, valid_number is deprecated
   // and DanAmbigs are fixed to work with unichar ids.
   STRING word_str = BestChoice.unichar_string();
   STRING word_lengths_str = BestChoice.unichar_lengths();
   const bool tracing = (stopper_debug_level >= 1);

   if (tracing) {
     cprintf ("\nRejecter: %s (word=%c, case=%c, punct=%c, unambig=%c)\n",
       word_str.string(),
       (valid_word(word_str.string()) ? 'y' : 'n'),
       (case_ok(word_str.string(), word_lengths_str.string()) ? 'y' : 'n'),
       ((punctuation_ok(word_str.string(),
                        word_lengths_str.string()) != -1) ? 'y' : 'n'),
       ((rest (BestChoices) != NIL) ? 'n' : 'y'));
   }

   /* nothing recognized, or several competing answers: not acceptable */
   if (BestChoice.length() == 0 || CurrentWordAmbig()) {
     return (FALSE);
   }

   if (BestChoice.fragment_mark()) {
     if (tracing) {
       cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n");
     }
     return (FALSE);
   }

   /* a well-formed dictionary word relaxes the threshold per extra char */
   if (valid_word(word_str.string()) &&
       case_ok(word_str.string(), word_lengths_str.string()) &&
       (punctuation_ok(word_str.string(), word_lengths_str.string())) != -1) {
     int extra_chars = LengthOfShortestAlphaRun(BestChoice);
     extra_chars -= stopper_smallword_size;
     if (extra_chars < 0) {
       extra_chars = 0;
     }
     threshold += extra_chars * stopper_certainty_per_char;
   }

   if (tracing) {
     cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ",
       BestChoice.certainty(), threshold);
   }

   if (BestChoice.certainty() > threshold) {
     if (tracing) {
       cprintf("ACCEPTED\n");
     }
     return (TRUE);
   }

   if (tracing) {
     cprintf("REJECTED\n");
   }
   return (FALSE);
 }                                /* AcceptableResult */
 }  // namespace tesseract


 /*---------------------------------------------------------------------------*/
 int AlternativeChoicesWorseThan(FLOAT32 Threshold) {
 /*
  **	Parameters:
  **		Threshold	adjust factor every alternative must exceed
  **	Globals:
  **		BestChoices	choices for the current word, best first
  **	Operation: Walk every choice after the first one in BestChoices.
  **		As soon as one has an adjust factor that is not above
  **		Threshold, report FALSE.  An empty alternative list
  **		therefore yields TRUE.
  **	Return: TRUE or FALSE.
  **	Exceptions: none
  **	History: Mon Jun  3 09:36:31 1991, DSJ, Created.
  */
   LIST Remaining = rest (BestChoices);

   iterate(Remaining) {
     VIABLE_CHOICE Alternative = (VIABLE_CHOICE) first_node (Remaining);
     if (Threshold >= Alternative->AdjustFactor) {
       return (FALSE);
     }
   }
   return (TRUE);

 }                                /* AlternativeChoicesWorseThan */


 namespace tesseract {
 /*---------------------------------------------------------------------------*/
 int Dict::CurrentBestChoiceIs(const WERD_CHOICE &WordChoice) {
 /*
  **	Parameters:
  **             Word            word that will be compared to the best choice
  **	Globals:
  **		BestChoices	set of best choices for current word
  **	Operation: Returns TRUE if Word is the same as the current best
  **		choice, FALSE otherwise.
  **	Return: TRUE or FALSE
  **	Exceptions: none
  **	History: Thu May 30 14:44:22 1991, DSJ, Created.
  */
   return (BestChoices != NIL &&
           StringSameAs(WordChoice, (VIABLE_CHOICE)first_node(BestChoices)));
 }                                /* CurrentBestChoiceIs */


 /*---------------------------------------------------------------------------*/
 FLOAT32 Dict::CurrentBestChoiceAdjustFactor() {
 /*
  **	Parameters: none
  **	Globals:
  **		BestChoices	set of best choices for current word
  **	Operation: Look up the adjust factor stored with the head of
  **		BestChoices.
  **	Return: That adjust factor, or MAX_FLOAT32 when no choice has
  **		been logged yet.
  **	Exceptions: none
  **	History: Thu May 30 14:48:24 1991, DSJ, Created.
  */
   if (BestChoices != NIL) {
     return (BestFactor (BestChoices));
   }
   return (MAX_FLOAT32);

 }                                /* CurrentBestChoiceAdjustFactor */


 /*---------------------------------------------------------------------------*/
 int Dict::CurrentWordAmbig() {
 /*
  **	Parameters: none
  **	Globals:
  **		BestChoices	set of best choices for current word
  **	Operation: This routine returns TRUE if there are multiple good
  **		choices for the current word and FALSE otherwise.
  **	Return: TRUE or FALSE
  **	Exceptions: none
  **	History: Wed May 22 15:38:38 1991, DSJ, Created.
  */
   return (rest (BestChoices) != NIL);

 }                                /* CurrentWordAmbig */


 /*---------------------------------------------------------------------------*/
 void Dict::DebugWordChoices() {
 /*
  **	Parameters: none
  **	Globals:
  **		BestRawChoice
  **		BestChoices
  **	Operation: Print the current choices for this word to stdout.
  **	Return: none
  **	Exceptions: none
  **	History: Wed May 15 13:52:08 1991, DSJ, Created.
  */
   LIST Choices;
   int i;
   char LabelString[80];
   VIABLE_CHOICE VChoice = (VIABLE_CHOICE)first_node(BestChoices);
   bool force_debug =
     fragments_debug && VChoice != NULL && VChoice->ComposedFromCharFragments;

   if (stopper_debug_level >= 1 || force_debug ||
       (WordToDebug && BestChoices &&
        StringSameAs(WordToDebug, WordToDebug_lengths,
                     (VIABLE_CHOICE)first_node(BestChoices)))) {
     if (BestRawChoice)
       PrintViableChoice(stderr, "\nBest Raw Choice:   ", BestRawChoice);

     i = 1;
     Choices = BestChoices;
     if (Choices)
       cprintf("\nBest Cooked Choices:\n");
     iterate(Choices) {
       sprintf(LabelString, "Cooked Choice #%d:  ", i);
       PrintViableChoice(stderr, LabelString,
                         (VIABLE_CHOICE)first_node(Choices));
       i++;
     }
   }
 }                                /* DebugWordChoices */
 }  // namespace tesseract


 /*---------------------------------------------------------------------------*/
 void FilterWordChoices() {
 /*
  **	Parameters: none
  **	Globals:
  **		BestChoices	set of choices for current word
  **	Operation: This routine removes from BestChoices all choices which
  **		are not within a reasonable range of the best choice.
  **	Return: none
  **	Exceptions: none
  **	History: Wed May 15 13:08:24 1991, DSJ, Created.
  */
   EXPANDED_CHOICE BestChoice;

   if (BestChoices == NIL || second_node (BestChoices) == NIL)
     return;

   /* compute certainties and class for each chunk in best choice */
   ExpandChoice ((VIABLE_CHOICE_STRUCT *) first_node (BestChoices), &BestChoice);

   set_rest (BestChoices, delete_d (rest (BestChoices),
     &BestChoice, FreeBadChoice));

 }                                /* FilterWordChoices */


 /*---------------------------------------------------------------------------*/
 void FindClassifierErrors(FLOAT32 MinRating,
                           FLOAT32 MaxRating,
                           FLOAT32 RatingMargin,
                           FLOAT32 Thresholds[]) {
 /*
  **	Parameters:
  **		MinRating		lower clamp for every threshold
  **		MaxRating		upper clamp; also the value used for
  **					characters without any mismatch
  **		RatingMargin		fraction removed from the computed rating
  **		Thresholds[]		output, one entry per character of the
  **					best choice
  **	Globals:
  **		BestChoices, BestRawChoice (both must be set)
  **		certainty_scale
  **	Operation: Walk the characters of the best choice chunk by chunk
  **		and compare each character's class with the class the
  **		best raw choice assigned to the same chunk.  For a
  **		character with at least one disagreeing chunk, average
  **		the raw certainties of those chunks, convert the average
  **		to a rating (divide by -certainty_scale), shrink it by
  **		RatingMargin and store it.  Otherwise store MaxRating.
  **		Every stored value is clamped to [MinRating, MaxRating],
  **		the upper bound being applied first.
  **	Return: none (results are placed in Thresholds)
  **	Exceptions: none
  **	History: Fri May 31 16:02:57 1991, DSJ, Created.
  */
   assert (BestChoices != NIL);
   assert (BestRawChoice != NULL);

   EXPANDED_CHOICE RawExpanded;
   ExpandChoice(BestRawChoice, &RawExpanded);
   VIABLE_CHOICE Best = (VIABLE_CHOICE) first_node (BestChoices);

   int ChunkIndex = 0;  /* runs across all characters */
   for (int CharIndex = 0; CharIndex < Best->Length; CharIndex++) {
     FLOAT32 CertaintySum = 0.0;
     int Mismatches = 0;

     for (int k = 0; k < Best->Blob[CharIndex].NumChunks; k++, ChunkIndex++) {
       if (Best->Blob[CharIndex].Class != RawExpanded.ChunkClass[ChunkIndex]) {
         CertaintySum += RawExpanded.ChunkCertainty[ChunkIndex];
         Mismatches++;
       }
     }

     FLOAT32 Limit;
     if (Mismatches > 0) {
       CertaintySum /= Mismatches;
       Limit = (CertaintySum / -certainty_scale) * (1.0 - RatingMargin);
     } else {
       Limit = MaxRating;
     }

     if (Limit > MaxRating) {
       Limit = MaxRating;
     }
     if (Limit < MinRating) {
       Limit = MinRating;
     }
     Thresholds[CharIndex] = Limit;
   }
 }                                /* FindClassifierErrors */


 /*---------------------------------------------------------------------------*/
 void InitStopperVars() {
 /*
  **	Parameters: none
  **	Globals:
  **		DangerousAmbigs, WordToDebug, WordToDebug_lengths
  **	Operation: Registers the stopper's string control variables under
  **		their external names, with DANGEROUS_AMBIGS and the empty
  **		string as the respective defaults.
  **	Return: none
  **	Exceptions: none
  **	History: Thu May  9 10:06:04 1991, DSJ, Created.
  */
   /* NOTE(review): `dummy` is never named directly below; presumably the
      string_variable macro expands to code that uses it - confirm before
      removing. */
   VALUE dummy;

   string_variable (DangerousAmbigs, "DangerousAmbigs", DANGEROUS_AMBIGS);
   string_variable (WordToDebug, "WordToDebug", "");
   string_variable (WordToDebug_lengths, "WordToDebug_lengths", "");

 }                                /* InitStopperVars */


 /*---------------------------------------------------------------------------*/
 void InitChoiceAccum() {
 /*
  **	Parameters: none
  **	Globals:
  **		BestRawChoice, BestChoices, CurrentSegmentation
  **	Operation: Reset the bookkeeping for a new word: release the
  **		previous best raw choice and every logged choice, turn
  **		choice accumulation back on, and set the first
  **		MAX_NUM_CHUNKS entries of CurrentSegmentation to 1.
  **	Return: none
  **	Exceptions: none
  **	History: Fri May 17 07:59:00 1991, DSJ, Created.
  */
   if (BestRawChoice) {
     memfree(BestRawChoice);
   }
   BestRawChoice = NULL;

   if (BestChoices) {
     destroy_nodes(BestChoices, memfree);
   }
   BestChoices = NIL;

   EnableChoiceAccum();

   /* one chunk per blob until a segmentation is logged */
   for (int Slot = 0; Slot < MAX_NUM_CHUNKS; ++Slot) {
     CurrentSegmentation[Slot] = 1;
   }

 }                                /* InitChoiceAccum */


 /*---------------------------------------------------------------------------*/
 namespace tesseract {
 void Dict::LogNewRawChoice(const WERD_CHOICE &WordChoice,
                            FLOAT32 AdjustFactor,
                            const float Certainties[]) {
 /*
  **	Parameters:
  **		WordChoice	new raw choice for current word
  **		AdjustFactor	adjustment factor which was applied to choice
  **		Certainties	certainties for each char in new choice
  **	Globals:
  **		BestRawChoice	best raw choice so far for current word
  **		KeepWordChoices	when FALSE the call is a no-op
  **	Operation: Remember WordChoice as the best raw choice if none is
  **		stored yet or if its rating is lower than the stored one.
  **		When the stored choice has the same string it is refilled
  **		in place; otherwise it is freed and a new one allocated.
  **	Return: none
  **	Exceptions: none
  **	History: Wed May 15 09:57:19 1991, DSJ, Created.
  */
   if (!KeepWordChoices) {
     return;
   }

   /* first raw choice seen for this word */
   if (!BestRawChoice) {
     BestRawChoice = NewViableChoice (WordChoice, AdjustFactor, Certainties);
     return;
   }

   /* keep what we have unless the newcomer rates strictly better */
   if (!(WordChoice.rating() < BestRawChoice->Rating)) {
     return;
   }

   if (ChoiceSameAs(WordChoice, BestRawChoice)) {
     FillViableChoice(WordChoice, AdjustFactor, Certainties, true,
                      BestRawChoice);
     return;
   }

   memfree(BestRawChoice);
   BestRawChoice = NewViableChoice(WordChoice, AdjustFactor, Certainties);
 }                                /* LogNewRawChoice */

 }  // namespace tesseract

 /*---------------------------------------------------------------------------*/
 void LogNewSegmentation(PIECES_STATE BlobWidth) {
 /*
  **	Parameters:
  **		BlobWidth[]	zero-terminated chunk counts, one per blob
  **	Globals:
  **		CurrentSegmentation	receives the copy
  **	Operation: Copy the entries of BlobWidth into CurrentSegmentation
  **		up to (not including) the first zero entry, then write
  **		a terminating zero.  No bound other than that zero entry
  **		is checked.
  **	Return: none
  **	Exceptions: none
  **	History: Mon May 20 11:52:26 1991, DSJ, Created.
  */
   BLOB_WIDTH *Dest = CurrentSegmentation;

   while (*BlobWidth != 0) {
     *Dest = *BlobWidth;
     ++Dest;
     ++BlobWidth;
   }
   *Dest = 0;

 }                                /* LogNewSegmentation */


 /*---------------------------------------------------------------------------*/
 void LogNewSplit(int Blob) {
 /*
  **	Parameters:
  **		Blob	index of blob that was split
  **	Globals:
  **		BestRawChoice	current best raw choice
  **		BestChoices	list of best choices found so far
  **	Operation: This routine adds 1 chunk to the specified blob for each
  **		choice in BestChoices and for the BestRawChoice.
  **	Return: none
  **	Exceptions: none
  **	History: Mon May 20 11:38:56 1991, DSJ, Created.
  */
   LIST Choices;

   if (BestRawChoice) {
     AddNewChunk(BestRawChoice, Blob);
   }

   Choices = BestChoices;
   iterate(Choices) {
     AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
   }

 }                                /* LogNewSplit */


 /*---------------------------------------------------------------------------*/
 namespace tesseract {
 void Dict::LogNewWordChoice(const WERD_CHOICE &WordChoice,
                             FLOAT32 AdjustFactor,
                             const float Certainties[]) {
 /*
  **	Parameters:
  **		WordChoice	new choice for current word
  **		AdjustFactor	adjustment factor which was applied to choice
  **		Certainties	certainties for each char in new choice
  **	Globals:
  **		BestChoices	best choices so far for current word
  **		KeepWordChoices	when FALSE the call is a no-op
  **		tessedit_truncate_wordchoice_log	list length limit
  **	Operation: This routine adds WordChoice to BestChoices if the
  **		adjusted certainty for it is within a reasonable range
  **		of the best choice in BestChoices.  The BestChoices
  **		list is kept in sorted order by rating. Duplicates are
  **		removed.
  **	Return: none
  **	Exceptions: none
  **	History: Wed May 15 09:57:19 1991, DSJ, Created.
  */
   VIABLE_CHOICE NewChoice;
   LIST Choices;
   FLOAT32 Threshold;

   if (!KeepWordChoices)
     return;

   /* throw out obviously bad choices to save some work */
   if (BestChoices != NIL) {
     /* margin from the adjust-factor difference, capped at -offset */
     Threshold = AmbigThreshold (BestFactor (BestChoices), AdjustFactor);
     if (Threshold > -stopper_ambiguity_threshold_offset)
       Threshold = -stopper_ambiguity_threshold_offset;
     /* drop the candidate if it trails the best certainty by more than
        that margin */
     if (WordChoice.certainty() - BestCertainty (BestChoices) < Threshold)
       return;
   }

   /* see if a choice with the same text string has already been found */
   NewChoice = NULL;
   Choices = BestChoices;

   iterate(Choices) {
     if (ChoiceSameAs (WordChoice, (VIABLE_CHOICE) first_node (Choices))) {
       /* BestRating(Choices) here is the rating of the entry currently
          being visited, since Choices is the iteration cursor */
       if (WordChoice.rating() < BestRating (Choices)) {
         NewChoice = (VIABLE_CHOICE) first_node (Choices);
       } else {
         /* an equal-or-better copy is already logged */
         return;
       }
     }
   }

   if (NewChoice) {
     /* reuse the existing entry: refill it, then unlink it so that it can
        be re-inserted at its new position below */
     FillViableChoice(WordChoice, AdjustFactor, Certainties, true, NewChoice);
     BestChoices = delete_d(BestChoices, NewChoice, is_same_node);
   }
   else {
     NewChoice = NewViableChoice (WordChoice, AdjustFactor, Certainties);
   }

   /* insert using CmpChoiceRatings as the ordering callback */
   BestChoices = s_adjoin (BestChoices, NewChoice, CmpChoiceRatings);
   if (stopper_debug_level >= 2)
     PrintViableChoice (stderr, "New Word Choice:  ", NewChoice);
   /* cut the list off after the cell returned by nth_cell().
      NOTE(review): exactly how many entries survive depends on whether
      nth_cell() is 0- or 1-based - confirm against oldlist. */
   if (count (BestChoices) > tessedit_truncate_wordchoice_log) {
     Choices =
       (LIST) nth_cell (BestChoices, tessedit_truncate_wordchoice_log);
     /* NOTE(review): nodes are released with Efree here but with memfree
        in InitChoiceAccum - confirm the two are interchangeable. */
     destroy_nodes (rest (Choices), Efree);
     set_rest(Choices, NIL);
   }

 }                                /* LogNewWordChoice */


 /*---------------------------------------------------------------------------*/
 /* Table of dangerous ambiguities, indexed by unichar id.  Built on first
    use by NoDangerousAmbig() and released by EndDangerousAmbigs(). */
 static AMBIG_TABLE *AmbigFor = NULL;

 int Dict::NoDangerousAmbig(const char *Word,
                            const char *Word_lengths,
                            DANGERR *fixpt) {
 /*
  **	Parameters:
  **		Word		word to check for dangerous ambiguities
  **		Word_lengths	byte length of each unichar in Word
  **		fixpt		optional (may be NULL); on failure its index
  **				is set to the position of the offending unichar
  **	Globals:
  **		AmbigFor	ambiguity table, created on first call
  **	Operation: Step through Word one unichar at a time.  At each
  **		position the ambiguities listed for that unichar are
  **		tried via AmbigsFound(), which substitutes them into a
  **		scratch copy of the word and looks the result up in the
  **		dictionary.  A hit ends the search with FALSE; otherwise
  **		the unichar is appended to the scratch copy and the scan
  **		moves on.
  **	Return: TRUE if Word contains no dangerous ambiguities.
  **	Exceptions: none
  **	History: Mon May  6 16:28:56 1991, DSJ, Created.
  */

   char NewWord[MAX_WERD_SIZE * UNICHAR_LEN + 1];
   char *WritePos = NewWord;  /* end of the prefix copied so far */

   if (!AmbigFor)
     AmbigFor = FillAmbigTable ();

   for (int CharIndex = 0; *Word; ++CharIndex) {
     if (AmbigsFound (NewWord, WritePos,
                      Word + *Word_lengths, Word_lengths + 1,
                      AmbigFor[getUnicharset().unichar_to_id(
                          Word, *Word_lengths)],
                      fixpt)) {
       if (fixpt != NULL)
         fixpt->index = CharIndex;
       return (FALSE);
     }

     /* no ambiguity here: keep this unichar and advance */
     strncpy(WritePos, Word, *Word_lengths);
     WritePos += *Word_lengths;
     Word += *Word_lengths;
     Word_lengths++;
   }
   return (TRUE);
 }                                /* NoDangerousAmbig */

 /* Releases the ambiguity table: every per-class list in AmbigFor
    (indices 0..MAX_CLASS_ID) is destroyed with Efree, then the table
    itself is freed and the pointer cleared.  Does nothing if the table
    was never built. */
 void Dict::EndDangerousAmbigs() {
   if (AmbigFor == NULL)
     return;

   int ClassId = 0;
   while (ClassId <= MAX_CLASS_ID) {
     destroy_nodes(AmbigFor[ClassId], Efree);
     ++ClassId;
   }
   Efree(AmbigFor);
   AmbigFor = NULL;
 }
 }  // namespace tesseract

 /*---------------------------------------------------------------------------*/
 void SettupStopperPass1() {
 /*
  **	Parameters: none
  **	Globals:
  **		RejectOffset	slack applied to the rejection threshold
  **	Operation: Prepare the stopper for the first pass by clearing
  **		RejectOffset, so AcceptableResult() uses the plain
  **		threshold.
  **	Return: none
  **	Exceptions: none
  **	History: Mon Jun  3 12:32:00 1991, DSJ, Created.
  */
   RejectOffset = 0.0;
 }                                /* SettupStopperPass1 */


 /*---------------------------------------------------------------------------*/
 void SettupStopperPass2() {
 /*
  **	Parameters: none
  **	Globals:
  **		RejectOffset	slack applied to the rejection threshold
  **		stopper_phase2_certainty_rejection_offset	value to install
  **	Operation: Prepare the stopper for the second pass by loading
  **		RejectOffset from the phase-2 rejection offset variable.
  **	Return: none
  **	Exceptions: none
  **	History: Mon Jun  3 12:32:00 1991, DSJ, Created.
  */
   RejectOffset = stopper_phase2_certainty_rejection_offset;
 }                                /* SettupStopperPass2 */


 /**----------------------------------------------------------------------------
               Private Code
 ----------------------------------------------------------------------------**/
 /*---------------------------------------------------------------------------*/
 void AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
 /*
  **	Parameters:
  **		Choice	choice to add a new chunk to
  **		Blob	index of blob being split
  **	Globals: none
  **	Operation: Accumulate the chunk counts of the characters in Choice
  **		until the running total exceeds Blob; that character's
  **		chunk count is incremented and the routine returns.  If
  **		no character covers Blob, a diagnostic is printed and the
  **		routine asserts.
  **	Return: none
  **	Exceptions: none
  **	History: Mon May 20 11:43:27 1991, DSJ, Created.
  */
   int ChunksSoFar = 0;

   for (int CharIndex = 0; CharIndex < Choice->Length; CharIndex++) {
     ChunksSoFar += Choice->Blob[CharIndex].NumChunks;
     if (Blob >= ChunksSoFar)
       continue;

     /* this character owns the chunk that was split */
     (Choice->Blob[CharIndex].NumChunks)++;
     return;
   }

   mem_tidy (1);
   cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
            Choice->Length, ChunksSoFar, Blob);
   assert(FALSE);  /* this should never get executed */

 }                                /* AddNewChunk */


 /*---------------------------------------------------------------------------*/
 namespace tesseract {
 int Dict::AmbigsFound(char *Word,
                       char *CurrentChar,
                       const char *Tail,
                       const char *Tail_lengths,
                       LIST Ambigs,
                       DANGERR *fixpt) {
 /*
  **	Parameters:
  **		Word		buffer holding the word prefix; the candidate
  **				word is assembled in it
  **		CurrentChar	position inside Word where a replacement
  **				string is written
  **		Tail		rest of the original word after the current
  **				unichar
  **		Tail_lengths	byte lengths of the unichars in Tail
  **		Ambigs		list of AMBIG_SPECs to try at this position
  **		fixpt		optional (may be NULL); good_length and
  **				bad_length are filled in on a hit
  **	Globals: none
  **	Operation: Each spec holds a test string, a space and a
  **		replacement string.  The test string is compared unichar
  **		by unichar against the start of Tail.  On a full match
  **		the replacement is copied to CurrentChar, the unmatched
  **		remainder of Tail is appended, and the assembled word is
  **		looked up with valid_word().  The first spec that yields
  **		a dictionary word ends the search.
  **	Return: TRUE if the Word is ambiguous at the specified position
  **	Exceptions: none
  **	History: Thu May  9 10:10:28 1991, DSJ, Created.
  */
   iterate(Ambigs) {
     AMBIG_SPEC *Spec = (AMBIG_SPEC *) first_node (Ambigs);
     char *SpecText = Spec->ambig;
     char *SpecLengths = Spec->lengths;
     const char *RestOfTail = Tail;
     const char *RestOfTail_lengths = Tail_lengths;
     int BadLength = 1;
     bool Matched = true;

     /* consume the test string (everything up to the space) */
     while (*SpecText != ' ') {
       if (*RestOfTail_lengths != *SpecLengths ||
           strncmp(SpecText, RestOfTail, *SpecLengths) != 0) {
         Matched = false;
         break;
       }
       SpecText += *(SpecLengths++);
       RestOfTail += *(RestOfTail_lengths++);
       BadLength++;
     }

     if (Matched) {
       SpecText += *(SpecLengths++); /* skip over the space */
       strcpy(CurrentChar, SpecText);  /* insert replacement string */
       strcat(Word, RestOfTail);       /* add tail */
       if (valid_word (Word)) {
         if (stopper_debug_level >= 1)
           cprintf ("Stopper:  Possible ambiguous word = %s\n", Word);
         if (fixpt != NULL) {
           fixpt->good_length = strlen (SpecLengths);
           fixpt->bad_length = BadLength;
         }
         return (TRUE);
       }
     }
   }
   return (FALSE);

 }                                /* AmbigsFound */


 /*---------------------------------------------------------------------------*/
 int Dict::ChoiceSameAs(const WERD_CHOICE &WordChoice,
                        VIABLE_CHOICE ViableChoice) {
 /*
  **	Parameters:
  **		WordChoice	word choice to compare to ViableChoice
  **		ViableChoice	viable choice to compare to WordChoice
  **	Globals: none
  **	Operation: Delegates to StringSameAs(WordChoice, ViableChoice)
  **		and hands its result back as an int.
  **	Return: non-zero if StringSameAs reports a match, 0 otherwise.
  **	Exceptions: none
  **	History: Fri May 17 08:48:04 1991, DSJ, Created.
  */
   const bool Identical = StringSameAs(WordChoice, ViableChoice);
   return Identical;

 }                                /* ChoiceSameAs */
 }  // namespace tesseract


 /*---------------------------------------------------------------------------*/
 int CmpChoiceRatings(void *arg1,    //VIABLE_CHOICE                 Choice1,
                      void *arg2) {  //VIABLE_CHOICE                 Choice2)
 /*
  **	Parameters:
  **		Choice1, Choice2	choices to compare ratings for
  **	Globals: none
  **	Operation: Return -1 if the rating for Choice1 is less than the
  **		rating for Choice2, otherwise return (1).
  **	Return: -1 or 1
  **	Exceptions: none
  **	History: Wed May 15 13:02:37 1991, DSJ, Created.
  */
   float R1, R2;
   VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
   VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;

   R1 = Choice1->Rating;
   R2 = Choice2->Rating;

   if (R1 < R2)
     return (-1);
   else
     return (1);

 }                                /* CmpChoiceRatings */


 /*---------------------------------------------------------------------------*/
 void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice) {
 /*
  **	Parameters:
  **		Choice		choice to be expanded
  **		ExpandedChoice	receives the expanded form of Choice
  **	Globals: none
  **	Operation: Records Choice in ExpandedChoice and flattens its
  **		per-blob data into per-chunk arrays: every chunk covered
  **		by a blob gets that blob's Certainty in ChunkCertainty[]
  **		and that blob's Class in ChunkClass[].
  **	Return: none (results are placed in ExpandedChoice)
  **	Exceptions: none
  **	History: Fri May 31 15:21:57 1991, DSJ, Created.
  */
   int NextChunk = 0;

   ExpandedChoice->Choice = Choice;
   for (int BlobIndex = 0; BlobIndex < Choice->Length; BlobIndex++) {
     const CHAR_CHOICE *Source = &Choice->Blob[BlobIndex];
     /* one output slot per chunk spanned by this blob */
     for (int Done = 0; Done < Source->NumChunks; Done++) {
       ExpandedChoice->ChunkCertainty[NextChunk] = Source->Certainty;
       ExpandedChoice->ChunkClass[NextChunk] = Source->Class;
       NextChunk++;
     }
   }
 }                                /* ExpandChoice */


 /*---------------------------------------------------------------------------*/
 namespace tesseract {
 AMBIG_TABLE *Dict::FillAmbigTable() {
 /*
  **	Parameters: none
  **	Globals:
  **		DangerousAmbigs		filename of dangerous ambig info
  **	Operation: This routine allocates a new ambiguity table and fills
  **		it in from the file specified by DangerousAmbigs.  An
  **		ambiguity table is an array of lists.  The array is indexed
  **		by a class id.  Therefore, each entry in the table provides
  **		a list of potential ambiguities which can start with the
  **		corresponding character.  Each potential ambiguity is
  **		described by a string which contains the remainder of the
  **		test string followed by a space followed by the replacement
  **		string.  For example the ambiguity "rn -> m", would be
  **		located in the table at index 'r'.  The string corresponding
  **		to this ambiguity would be "n m".
  **
  **		Each record in the file is
  **		    <n> <n test unichars> <m> <m replacement unichars>
  **		Both counts must lie in 1..MAX_AMBIG_SIZE; this is checked
  **		before any unichar is read so the scratch buffers cannot be
  **		overrun.  A record containing a token that is not in the
  **		unicharset is skipped.  A record cut short by end of file
  **		is discarded and reading stops.
  **	Return: Pointer to new ambiguity table.
  **	Exceptions: "Illegal ambiguity specification!" is raised through
  **		DoError when a count is out of range.
  **	History: Thu May  9 09:20:57 1991, DSJ, Created.
  */
   FILE *AmbigFile;
   AMBIG_TABLE *NewTable;
   int i;
   int AmbigPartSize;
   char buffer[256 * UNICHAR_LEN];
   char TokenFormat[16];
   char TestString[256 * UNICHAR_LEN];
   char TestString_lengths[256];
   char ReplacementString[256 * UNICHAR_LEN];
   char ReplacementString_lengths[256];
   char *PartStrings[2] = { TestString, ReplacementString };
   char *PartLengths[2] = { TestString_lengths, ReplacementString_lengths };
   STRING name;
   char lengths[2];
   AMBIG_SPEC *AmbigSpec;
   UNICHAR_ID unichar_id;

   lengths[1] = 0;

   /* "%<N>s" with N = sizeof(buffer) - 1, so one token cannot overrun buffer */
   sprintf(TokenFormat, "%%%ds", (int) (sizeof (buffer) - 1));

   name = getImage()->getCCUtil()->language_data_path_prefix;
   name += DangerousAmbigs;
   AmbigFile = Efopen (name.string(), "r");
   NewTable = (AMBIG_TABLE *) Emalloc (sizeof (LIST) * (MAX_CLASS_ID + 1));

   for (i = 0; i <= MAX_CLASS_ID; i++)
     NewTable[i] = NIL;

   while (fscanf (AmbigFile, "%d", &AmbigPartSize) == 1) {
     bool illegal_char = false;
     bool bad_size = false;
     bool truncated = false;

     TestString[0] = '\0';
     TestString_lengths[0] = 0;
     ReplacementString[0] = '\0';
     ReplacementString_lengths[0] = 0;

     /* part 0 is the test string, part 1 the replacement string */
     for (int part = 0; part < 2; ++part) {
       /* the count for part 0 was already read by the while condition */
       if (part > 0 && fscanf (AmbigFile, "%d", &AmbigPartSize) != 1) {
         truncated = true;
         break;
       }
       if (AmbigPartSize < 1 || AmbigPartSize > MAX_AMBIG_SIZE) {
         bad_size = true;
         break;
       }
       for (i = 0; i < AmbigPartSize; ++i) {
         if (fscanf (AmbigFile, TokenFormat, buffer) != 1) {
           truncated = true;
           break;
         }
         size_t token_length = strlen(buffer);
         /* The scratch buffers above are sized for UNICHAR_LEN bytes per
            unichar, so a longer token is treated as illegal.  Illegal
            tokens are not appended: the whole record is dropped below. */
         if (token_length > (size_t) UNICHAR_LEN ||
             !getUnicharset().contains_unichar(buffer)) {
           illegal_char = true;
           continue;
         }
         strcat(PartStrings[part], buffer);
         lengths[0] = (char) token_length;
         strcat(PartLengths[part], lengths);
       }
       if (truncated)
         break;
     }

     if (bad_size)
       DoError (0, "Illegal ambiguity specification!");
     /* after a bad count or a short read the stream position is unknown */
     if (bad_size || truncated)
       break;
     if (illegal_char) {
       continue;
     }

     AmbigSpec = (AMBIG_SPEC *) Emalloc (sizeof (AMBIG_SPEC));

     /* the first test unichar is the table index; store only what follows */
     strcpy(AmbigSpec->ambig, TestString + TestString_lengths[0]);
     strcat(AmbigSpec->ambig, " ");
     strcat(AmbigSpec->ambig, ReplacementString);

     strcpy(AmbigSpec->lengths, TestString_lengths + 1);
     lengths[0] = 1;              /* length entry for the separating space */
     strcat(AmbigSpec->lengths, lengths);
     strcat(AmbigSpec->lengths, ReplacementString_lengths);
     unichar_id = getUnicharset().unichar_to_id(TestString,
                                                TestString_lengths[0]);
     NewTable[unichar_id] = push_last (NewTable[unichar_id], AmbigSpec);
   }

   fclose(AmbigFile);
   return (NewTable);

 }                                /* FillAmbigTable */
 }  // namespace tesseract

 /*---------------------------------------------------------------------------*/
 int FreeBadChoice(void *item1,    //VIABLE_CHOICE                 Choice,
                   void *item2) {  //EXPANDED_CHOICE                       *BestChoice)
 /*
  **	Parameters:
  **		Choice			choice to be tested
  **		BestChoice		best choice found
  **	Globals:
  **		stopper_ambiguity_threshold_gain
  **		stopper_ambiguity_threshold_offset
  **	Operation: If the certainty of any chunk in Choice is not ambiguous
  **		with the corresponding chunk in the best choice, free
  **		Choice and return TRUE.  Otherwise, return FALSE.
  **	Return: TRUE or FALSE.
  **	Exceptions: none
  **	History: Wed May 15 13:20:26 1991, DSJ, Created.
  */
   int i, j, Chunk;
   FLOAT32 Threshold;
   VIABLE_CHOICE Choice;
   EXPANDED_CHOICE *BestChoice;

   Choice = (VIABLE_CHOICE) item1;
   BestChoice = (EXPANDED_CHOICE *) item2;

   Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor,
     Choice->AdjustFactor);

   for (i = 0, Chunk = 0; i < Choice->Length; i++)
     for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
       if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
     Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
       Threshold) {
         memfree(Choice);
     return (TRUE);
   }

   return (FALSE);

 }                                /* FreeBadChoice */


 /*---------------------------------------------------------------------------*/
 namespace tesseract {
 int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
 /*
  **	Parameters:
  **		WordChoice      word to be tested
  **	Globals: none
  **	Operation: Scans the word once, counting consecutive unichars for
  **		which get_isalpha() is true.  Each maximal run is measured
  **		exactly once, when it is closed by a non-alpha unichar or
  **		by the end of the word.  (Re-measuring from every position
  **		inside a run would report the run's suffixes as shorter
  **		runs, e.g. 1 instead of 3 for "abc1".)
  **	Return: Length of the shortest maximal alpha run in the word, or
  **		0 if the word contains no alpha unichars.
  **	Exceptions: none
  **	History: Tue May 14 07:50:45 1991, DSJ, Created.
  */
   int Shortest = MAXINT;
   int RunLength = 0;

   for (int x = 0; x < WordChoice.length(); ++x) {
     if (getUnicharset().get_isalpha(WordChoice.unichar_id(x))) {
       ++RunLength;
     } else if (RunLength > 0) {
       /* a non-alpha unichar closes the current run */
       if (RunLength < Shortest)
         Shortest = RunLength;
       RunLength = 0;
     }
   }
   /* a run that reaches the end of the word is closed here */
   if (RunLength > 0 && RunLength < Shortest)
     Shortest = RunLength;

   if (Shortest == MAXINT)
     Shortest = 0;

   return (Shortest);

 }                                /* LengthOfShortestAlphaRun */


 /*---------------------------------------------------------------------------*/
 VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice,
                                     FLOAT32 AdjustFactor,
                                     const float Certainties[]) {
 /*
  **	Parameters:
  **		WordChoice	choice to be converted to a viable choice
  **		AdjustFactor	factor used to adjust ratings for WordChoice
  **		Certainties	certainty for each character in WordChoice
  **	Globals:
  **		CurrentSegmentation	read by FillViableChoice
  **	Operation: Asserts that the word length is in 1..MAX_NUM_CHUNKS,
  **		allocates a VIABLE_CHOICE_STRUCT followed by room for
  **		Length - 1 additional CHAR_CHOICE entries, and populates
  **		it through FillViableChoice with SameString == false.
  **	Return: Ptr to new viable choice.
  **	Exceptions: none
  **	History: Thu May 16 15:28:29 1991, DSJ, Created.
  */
   int Length = WordChoice.length();
   assert (Length <= MAX_NUM_CHUNKS && Length > 0);

   /* the struct's size already accounts for one CHAR_CHOICE */
   const size_t NumBytes =
       sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof (CHAR_CHOICE);
   VIABLE_CHOICE Result = (VIABLE_CHOICE) Emalloc (NumBytes);

   FillViableChoice(WordChoice, AdjustFactor, Certainties, false, Result);
   return (Result);
 }                                /* NewViableChoice */


 /*---------------------------------------------------------------------------*/
 void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
 /*
  **	Parameters:
  **		File	open text file to print Choice to
  **		Label	text label printed in front of Choice
  **		Choice	choice to be printed
  **	Globals: none
  **	Operation: Writes a text dump of Choice to File:
  **		- Label, rating, certainty, adjust factor, fragment flag
  **		  and the word itself on the first line;
  **		- the unichars spread out to line up with their chunks;
  **		- one value per chunk of blob certainty times -10;
  **		- one value per chunk of the owning blob's chunk count.
  **	Return: none
  **	Exceptions: none
  **	History: Mon May 20 11:16:44 1991, DSJ, Created.
  */
   const int NumBlobs = Choice->Length;
   int Blob, Chunk;

   fprintf (File, "%s", Label);

   fprintf(File, "(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d)  ",
           Choice->Rating, Choice->Certainty,
           Choice->AdjustFactor, Choice->ComposedFromCharFragments);

   /* the word as plain text */
   for (Blob = 0; Blob < NumBlobs; Blob++) {
     const char *Unichar = getUnicharset().id_to_unichar(Choice->Blob[Blob].Class);
     fprintf(File, "%s", Unichar);
   }
   fprintf(File, "\n");

   /* the word again, padded for each chunk after a blob's first */
   for (Blob = 0; Blob < NumBlobs; Blob++) {
     const char *Unichar = getUnicharset().id_to_unichar(Choice->Blob[Blob].Class);
     fprintf(File, "  %s", Unichar);
     for (Chunk = 1; Chunk < Choice->Blob[Blob].NumChunks; Chunk++)
       fprintf(File, "    ");
   }
   fprintf(File, "\n");

   /* per-chunk certainty, scaled by -10 and truncated to int */
   for (Blob = 0; Blob < NumBlobs; Blob++) {
     for (Chunk = 0; Chunk < Choice->Blob[Blob].NumChunks; Chunk++)
       fprintf(File, "%3d ", (int) (Choice->Blob[Blob].Certainty * -10.0));
   }
   fprintf(File, "\n");

   /* per-chunk count of chunks in the owning blob */
   for (Blob = 0; Blob < NumBlobs; Blob++) {
     for (Chunk = 0; Chunk < Choice->Blob[Blob].NumChunks; Chunk++)
       fprintf(File, "%3d ", Choice->Blob[Blob].NumChunks);
   }
   fprintf(File, "\n");
 }                                /* PrintViableChoice */


 /*---------------------------------------------------------------------------*/
 void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
                             FLOAT32 AdjustFactor, const float Certainties[],
                             bool SameString, VIABLE_CHOICE ViableChoice) {
 /*
  **	Parameters:
  **		WordChoice 	a choice with info that will be copied
  **		AdjustFactor	factor used to adjust ratings for AChoice
  **		Certainties	certainty for each character in AChoice
  **             SameString      if true the string in the viable choice
  **                             will not be changed
  **		ViableChoice	existing viable choice to fill in
  **	Globals:
  **		CurrentSegmentation	segmentation for NewChoice
  **	Operation:
  **             Fill ViableChoice with information from AChoice,
  **             AdjustFactor, and Certainties.
  **	Return: none
  **	Exceptions: none
  **	History: Fri May 17 13:35:58 1991, DSJ, Created.
  */
   CHAR_CHOICE *NewChar;
   BLOB_WIDTH *BlobWidth;
   int x;

   ViableChoice->Rating = WordChoice.rating();
   ViableChoice->Certainty = WordChoice.certainty();
   ViableChoice->AdjustFactor = AdjustFactor;
   ViableChoice->ComposedFromCharFragments = false;
   if (!SameString) {
     ViableChoice->Length = WordChoice.length();
   }
   for (x = 0,
        NewChar = &(ViableChoice->Blob[0]),
        BlobWidth = CurrentSegmentation;
        x < WordChoice.length();
        x++, NewChar++, Certainties++, BlobWidth++) {
     if (!SameString) {
       NewChar->Class = WordChoice.unichar_id(x);
     }
     NewChar->NumChunks = *BlobWidth;
     NewChar->Certainty = *Certainties;
     for (int i = 1; i < WordChoice.fragment_length(x); ++i) {
       BlobWidth++;
       assert(*BlobWidth > 0);
       NewChar->NumChunks += *BlobWidth;
       ViableChoice->ComposedFromCharFragments = true;
     }
   }
 }                                /* FillViableChoice */


 // Returns true when WordChoice and ViableChoice have the same length and
 // hold the same unichar id at every position, false otherwise.
 bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,
                         VIABLE_CHOICE ViableChoice) {
   const bool SameLength = (WordChoice.length() == ViableChoice->Length);
   if (!SameLength) {
     return false;
   }
   for (int pos = 0; pos < ViableChoice->Length; ++pos) {
     if (ViableChoice->Blob[pos].Class != WordChoice.unichar_id(pos)) {
       return false;
     }
   }
   return true;
 }

 /*---------------------------------------------------------------------------*/
 int Dict::StringSameAs(const char *String,
                        const char *String_lengths,
                        VIABLE_CHOICE ViableChoice) {
 /*
  **	Parameters:
  **		String		string to compare to ViableChoice
  **		String_lengths	lengths of unichars in String
  **		ViableChoice	viable choice to compare to String
  **	Globals: none
  **	Operation: This routine compares String to ViableChoice and
  **		returns TRUE if they are the same, FALSE otherwise.
  **	Return: TRUE or FALSE.
  **	Exceptions: none
  **	History: Fri May 17 08:48:04 1991, DSJ, Created.
  */
   CHAR_CHOICE *Char;
   int i;
   int current_unichar_length;

   for (Char = &(ViableChoice->Blob[0]), i = 0;
     i < ViableChoice->Length;
        String += *(String_lengths++), Char++, i++) {
     current_unichar_length = strlen(getUnicharset().id_to_unichar(Char->Class));
   if (current_unichar_length != *String_lengths ||
       strncmp(String, getUnicharset().id_to_unichar(Char->Class),
               current_unichar_length) != 0)
     return (FALSE);
   }

   if (*String == 0)
     return (TRUE);
   else
     return (FALSE);

 }                                /* StringSameAs */
 }  // namespace tesseract

 /*---------------------------------------------------------------------------*/
 int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
                        const WERD_CHOICE &BestChoice) {
 /*
  **	Parameters:
  **		Choices		choices for current segmentation
  **		BestChoice	best choice for current segmentation
  **	Globals:
  **		stopper_allowable_character_badness	max allowed certainty variation
  **	Operation: This routine returns TRUE if the certainty of the
  **		BestChoice word is within a reasonable range of the average
  **		certainties for the best choices for each character in
  **		the segmentation.  This test is used to catch words in which
  **		one character is much worse than the other characters in
  **		the word (i.e. FALSE will be returned in that case).
  **		The algorithm computes the mean and std deviation of the
  **		certainties in the word with the worst certainty thrown out.
  **	Return: TRUE or FALSE.
  **	Exceptions: none
  **	History: Tue May 14 08:23:21 1991, DSJ, Created.
  */
   float Certainty;
   float WorstCertainty = MAX_FLOAT32;
   float CertaintyThreshold;
   FLOAT64 TotalCertainty;
   FLOAT64 TotalCertaintySquared;
   FLOAT64 Variance;
   FLOAT32 Mean, StdDev;
   int WordLength;

   WordLength = Choices.length();
   if (WordLength < 3)
     return (TRUE);

   TotalCertainty = TotalCertaintySquared = 0.0;
   BLOB_CHOICE_IT BlobChoiceIt;
   for (int i = 0; i < Choices.length(); ++i) {
     BlobChoiceIt.set_to_list(Choices.get(i));
     Certainty = BlobChoiceIt.data()->certainty();
     TotalCertainty += Certainty;
     TotalCertaintySquared += Certainty * Certainty;
     if (Certainty < WorstCertainty)
       WorstCertainty = Certainty;
   }

   /* subtract off worst certainty from statistics */
   WordLength--;
   TotalCertainty -= WorstCertainty;
   TotalCertaintySquared -= WorstCertainty * WorstCertainty;

   Mean = TotalCertainty / WordLength;
   Variance = ((WordLength * TotalCertaintySquared -
     TotalCertainty * TotalCertainty) /
     (WordLength * (WordLength - 1)));
   if (Variance < 0.0)
     Variance = 0.0;
   StdDev = sqrt (Variance);

   CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
   if (CertaintyThreshold > stopper_nondict_certainty_base)
     CertaintyThreshold = stopper_nondict_certainty_base;

   if (BestChoice.certainty() < CertaintyThreshold) {
     if (stopper_debug_level >= 1)
       cprintf("Stopper: Non-uniform certainty = %4.1f"
               " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
               BestChoice.certainty(), Mean, StdDev, CertaintyThreshold);
     return (FALSE);
   } else {
     return (TRUE);
   }
 }                                /* UniformCertainties */