dict/permnum.cpp - platform/external/tesseract - Git at Google

 /* -*-C-*-
  ********************************************************************************
  *
  * File:        permnum.c  (Formerly permnum.c)
  * Description:
  * Author:       Mark Seaman, OCR Technology
  * Created:      Fri Oct 16 14:37:00 1987
  * Modified:     Tue Jul  2 14:12:43 1991 (Mark Seaman) marks@hpgrlt
  * Language:     C
  * Package:      N/A
  * Status:       Reusable Software Component
  *
  * (c) Copyright 1987, Hewlett-Packard Company.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  *********************************************************************************/
 /*----------------------------------------------------------------------
               I n c l u d e s
 ----------------------------------------------------------------------*/
 #include "const.h"
 #include "permnum.h"
 #include "debug.h"
 #include "permute.h"
 #include "dawg.h"
 #include "tordvars.h"
 #include "stopper.h"
 #include "globals.h"
 #include "ndminx.h"
 #include "dict.h"
 #include "image.h"
 #include "ccutil.h"
 #include "conversion.h"

 #include <math.h>
 #include <ctype.h>

 /*----------------------------------------------------------------------
               V a r i a b l e s
 ----------------------------------------------------------------------*/
 #if 0
 static const char *allowed_alpha_strs[] = {
   "jan", "feb", "mar", "apr", "may", "jun",
   "jul", "aug", "sep", "oct", "nov", "dec", NULL
 };
 #endif

 #if 0
 static const char *allowed_char_strs[] = {
   "adfjmnos", "aceopu", "bcglnrptvy"
 };
 #endif

 const int kNumStates = 7;

 static int number_state_table[kNumStates][8] = { {
                                  /* 0. Beginning of string        */
   /*	l	d	o	a	t	1	2	3                    */
     0, 1, 1, -99, -99, 4, -99, -99
   },
   {                              /* 1. After a digit or operator  */
     -99, 1, 1, 3, 2, 4, 3, 3
   },
   {                              /* 2. After trailing punctuation */
     -99, -99, 1, -99, 2, -99, -99, -99
   },
   {                              /* 3. After a alpha character    */
     -99, -99, 3, 3, 2, 3, 3, 3
   },
   {                              /* 4. After 1st  char */
     -99, -1, -1, -99, -2, -99, 5, -99
   },
   {                              /* 5. After 2nd  char */
     -99, -1, -1, -99, -2, -99, -99, 6
   },
   {                              /* 6. After 3rd  char */
     -99, -1, -1, -99, -2, -99, -99, -99
   }
 };

 // The state is coded with its true state shifted left by kStateShift.
 // A repeat count (starting with 0) is stored in the lower bits
 // No state is allowed to occur more than kMaxRepeats times.
 const int kStateShift = 4;
 const int kRepeatMask = (1 << kStateShift) - 1;

 const int kMaxRepeats[kNumStates] = {
   3, 10, 3, 3, 3, 3, 3
 };

 double_VAR(segment_penalty_number_good, GOOD_NUMBER,
            "Score multiplier for good-looking numbers "
            "(lower is better).");

 double_VAR(segment_penalty_number_ok, OK_NUMBER,
            "Score multiplier for ok-looking numbers "
            "(lower is better).");

 BOOL_VAR(number_debug, 0, "Segmentation number debug mode");

 INT_VAR(segment_digits_max, 3,
         "Maximum length of a number we will try to segment.");

 /*----------------------------------------------------------------------
               M a c r o s
 ----------------------------------------------------------------------*/
 /**********************************************************************
  * isleading
  *
  * Return non-zero if this is a leading type punctuation mark for the
  * numeric grammar.
  **********************************************************************/

 #define isleading(ch)     \
 ((ch == '{'  ) ||       \
 	(ch == '['  ) ||       \
 	(ch == '('  ) ||       \
 	(ch == '#'  ) ||       \
 	(ch == '@'  ) ||       \
 	(ch == '$'  ))

 /**********************************************************************
  * istrailing
  *
  * Return non-zero if this is a leading type punctuation mark for the
  * numeric grammar.
  **********************************************************************/

 #define istrailing(ch)    \
 ((ch == '}'  ) ||       \
 	(ch == ']'  ) ||       \
 	(ch == ')'  ) ||       \
 	(ch == ';'  ) ||       \
 	(ch == ':'  ) ||       \
 	(ch == ','  ) ||       \
 	(ch == '.'  ) ||       \
 	(ch == '%'  ))

 /**********************************************************************
  * isoperator
  *
  * Return non-zero if this is a leading type punctuation mark for the
  * numeric grammar.
  **********************************************************************/

 #define isoperator(ch)    \
 ((ch == '*'  ) ||       \
 	(ch == '+'  ) ||       \
 	(ch == '-'  ) ||       \
 	(ch == '/'  ) ||       \
 	(ch == '.'  ) ||       \
 	(ch == ':'  ) ||       \
 	(ch == ','  ))

 /*----------------------------------------------------------------------
               F u n c t i o n s
 ----------------------------------------------------------------------*/
 /**********************************************************************
  * adjust_number
  *
  * Assign an adjusted value to a string that is a word.  The value
  * that this word choice has is based on case and punctuation rules.
  **********************************************************************/
 namespace tesseract {
 void Dict::adjust_number(A_CHOICE *best_choice, float *certainty_array) {
   float adjust_factor;

   if (segment_adjust_debug)
     cprintf ("Number: %s %4.2f ",
       class_string (best_choice), class_rating (best_choice));

   class_rating (best_choice) += RATING_PAD;
   if (pure_number (class_string (best_choice), class_lengths (best_choice))) {
     class_rating (best_choice) *= segment_penalty_number_good;
     adjust_factor = segment_penalty_number_good;
     if (segment_adjust_debug)
       cprintf (", %4.2f ", (double)segment_penalty_number_good);
   }
   else {
     class_rating (best_choice) *= segment_penalty_number_ok;
     adjust_factor = segment_penalty_number_ok;
     if (segment_adjust_debug)
       cprintf (", N, %4.2f ", (double)segment_penalty_number_ok);
   }

   class_rating (best_choice) -= RATING_PAD;
   LogNewWordChoice(best_choice, adjust_factor,
                    certainty_array, getUnicharset());
   if (segment_adjust_debug)
     cprintf (" --> %4.2f\n", class_rating (best_choice));
 }


 /**********************************************************************
  * append_number_choices
  *
  * Check to see whether or not the next choice is worth appending to
  * the string being generated.  If so then keep going deeper into the
  * word.
  **********************************************************************/
 void Dict::append_number_choices(int state,
                                  char *word,
                                  char unichar_lengths[],
                                  int unichar_offsets[],
                                  CHOICES_LIST choices,
                                  int char_choice_index,
                                  int word_index,
                                  A_CHOICE *this_choice,
                                  float *limit,
                                  float rating,
                                  float certainty,
                                  float *certainty_array,
                                  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                                  char fragment_lengths[],
                                  CHOICES *result) {
   int x;
   int offset;
   CHAR_FRAGMENT_INFO char_frag_info;
   int word_ending =
     (char_choice_index == (array_count(choices) - 1)) ? TRUE : FALSE;
   const char *ch = NULL;

   /* Deal lwith fragments */
   if (!fragment_state_okay(
       getUnicharset().unichar_to_id(class_string(this_choice)),
       class_rating(this_choice), class_certainty(this_choice),
       prev_char_frag_info,
       (number_debug && (fragments_debug > 1)) ? "number_debug" : NULL,
       word_ending, &char_frag_info)) {
     return;  // this_choice must be an invalid fragment
   }
   if (char_frag_info.unichar_id != INVALID_UNICHAR_ID) {
     ch = getUnicharset().id_to_unichar(char_frag_info.unichar_id);
   }
   if (ch == NULL) {   // this character is a fragment
     JOIN_ON(*result,  // so search the next letter
             number_permute(state, choices, char_choice_index + 1,
                            word_index, limit, word, unichar_lengths,
                            unichar_offsets, rating, certainty, certainty_array,
                            &char_frag_info, fragment_lengths));
     return;
   }

   /* Add new character */
   strcpy(word + unichar_offsets[word_index], ch);

   unichar_lengths[word_index] = strlen(ch);
   unichar_lengths[word_index + 1] = 0;
   fragment_lengths[word_index] = char_frag_info.num_fragments;
   fragment_lengths[word_index + 1] = 0;
   unichar_offsets[word_index + 1] = unichar_offsets[word_index] +
       unichar_lengths[word_index];

   if (word[unichar_offsets[word_index]] == '\0') {
     word[unichar_offsets[word_index]] = ' ';
     word[unichar_offsets[word_index] + 1] = '\0';
     unichar_lengths[word_index] = 1;
     unichar_lengths[word_index + 1] = 0;
     fragment_lengths[word_index] = 1;
     fragment_lengths[word_index + 1] = 0;
     unichar_offsets[word_index + 1] = unichar_offsets[word_index] +
         unichar_lengths[word_index];
   }
   certainty_array[word_index] = char_frag_info.certainty;
   rating += char_frag_info.rating;
   certainty = MIN (char_frag_info.certainty, certainty);

   if (rating < *limit) {
     state = number_state_change (state, word + unichar_offsets[word_index],
                                  unichar_lengths + word_index);
     if (number_debug) {
       cprintf ("%s rating=%4.2f  state=%d\n", word, rating, state);
     }

     if (state != -1) {

       if ((state >> kStateShift) == 3 &&
           word_index + 3 < array_count (choices)) {
         return;
       }

       if (word_ending) {
         for (x = 0, offset = 0; x <= word_index; offset += unichar_lengths[x++]) {
           if (getUnicharset().get_isdigit (
               word + offset, unichar_lengths[x])) {
             if (number_debug) cprintf ("new choice = %s\n", word);
             push_on (*result, new_choice (word, unichar_lengths, rating,
                                           certainty, -1, NULL, NUMBER_PERM,
                                           false, fragment_lengths));

             adjust_number ((A_CHOICE *) first_node (*result), certainty_array);
             if (best_rating (*result) > *limit) {
               free_choice (first_node (*result));
               pop_off(*result);
             }
             else {
               *limit = best_rating (*result);
               break;
             }
           }
         }
       }
       else {
         JOIN_ON (*result,
           number_permute (state, choices, char_choice_index + 1,
                           word_index + 1, limit, word, unichar_lengths,
                           unichar_offsets, rating, certainty,
                           certainty_array, &char_frag_info, fragment_lengths));
       }
     }
   }
   else {
     if (number_debug)
       cprintf ("pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
         word, rating, *limit);
   }
 }


 /**********************************************************************
  * number_character_type
  *
  * Decide which type of a character (with regard to the numeric state
  * table) we are looking at.
  **********************************************************************/
 int Dict::number_character_type(  //current state
                                   const char* ch,
                                   int length,
                                   int state) {
   if (getUnicharset().get_isalpha (ch, length)) {
 #if 0
     if (state < 4
         && strchr (allowed_char_strs[0], lower_char) != NULL)
       return 5;
     else if (state == 4
         && strchr (allowed_char_strs[1], lower_char) != NULL)
       return 6;
     else if (state == 5
         && strchr (allowed_char_strs[2], lower_char) != NULL)
       return 7;
 #endif
     return 3;
   }
   else if (getUnicharset().get_isdigit (ch, length))
     return (1);
   else if (length == 1 && isoperator (*ch))
     return (2);
   else if (length == 1 && istrailing (*ch))
     return (4);
   else if (length == 1 && isleading (*ch))
     return (0);
   else
     return (-1);
 }


 /**********************************************************************
  * number_state_change
  *
  * Execute a state transition according to the state table and
  * additional rules.
  **********************************************************************/
 int Dict::number_state_change(int state,             //current state
                               const char *word,      //current char
                               const char *lengths) { //length of current char
   int char_type;                 //type of char
   int new_state;                 //state to return
   int old_state = state >> kStateShift;
   int repeats = state & kRepeatMask;
 #if 0
   int index;
   char copy_word[4];             //tolowered chars
 #endif

   char_type = number_character_type (word, *lengths, old_state);
   if (char_type == -1)
     return -1;
   new_state = number_state_table[old_state][char_type];
   if (new_state == old_state) {
     ++repeats;
     if (repeats >= kMaxRepeats[old_state])
       return -1;
   } else {
     repeats = 0;
   }
   if (new_state >= 0)
     return (new_state << kStateShift) | repeats;
   if (new_state == -99)
     return -1;

   //now check to see if the last state-3 chars in the word
   //make an allowable word. For now only 3 letter words
   //are allowed
   if (old_state != 6)
     return -1;                   //only 3 letters now
 #if 0
   copy_word[0] = tolower (word[-3]);
   copy_word[1] = tolower (word[-2]);
   copy_word[2] = tolower (word[-1]);
   copy_word[3] = '\0';
   for (index = 0; allowed_alpha_strs[index] != NULL; index++) {
     if (strcmp (copy_word, allowed_alpha_strs[index]) == 0)
       return (-new_state) << kStateShift;
   }
 #endif
   return -1;                     //not a good word
 }


 /**********************************************************************
  * number_permute
  *
  * Permute all the valid string that match the 'grammar' of numbers.
  * The valid syntax for numbers is encoded in a state table.  The
  * permuter uses this state table to enumerate all the string that
  * can be produced using the input choices.
  **********************************************************************/
 CHOICES Dict::number_permute(int state,
                              CHOICES_LIST choices,
                              int char_choice_index,
                              int word_index,
                              float *limit,
                              char *word,
                              char unichar_lengths[],
                              int unichar_offsets[],
                              float rating,
                              float certainty,
                              float *certainty_array,
                              const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                              char fragment_lengths[]) {
   CHOICES result = NIL;
   CHOICES c;
   int depth = 0;

   if (number_debug) {
     cprintf ("number_permute (state=%d, char_choice_index=%d,"
              " word_index=%d, limit=%4.2f, ", state, char_choice_index,
              word_index, *limit);
     cprintf ("word=%s, rating=%4.2f, certainty=%4.2f)\n",
       word, rating, certainty);
   }
   if (char_choice_index < array_count (choices)) {
     iterate_list (c, (CHOICES) array_index (choices, char_choice_index)) {
       if (depth++ < segment_digits_max)
         append_number_choices (state, word, unichar_lengths, unichar_offsets,
                                choices, char_choice_index, word_index,
                                (A_CHOICE *) first_node (c), limit, rating,
                                certainty, certainty_array, prev_char_frag_info,
                                fragment_lengths, &result);
     }
   }
   if (result && number_debug == 1)
     print_choices ("number_permute:", result);
   return (result);
 }


 /**********************************************************************
  * number_permute_and_select
  *
  * Permute all the possible valid numbers and adjust their ratings.
  * Save the best rating.
  **********************************************************************/
 A_CHOICE *Dict::number_permute_and_select(CHOICES_LIST char_choices,
                                           float rating_limit) {
   CHOICES result = NIL;
   char word[UNICHAR_LEN * MAX_WERD_LENGTH + 1];
   char unichar_lengths[MAX_WERD_LENGTH + 1];
   int unichar_offsets[MAX_WERD_LENGTH + 1];
   float certainty_array[MAX_WERD_LENGTH + 1];
   char fragment_lengths[MAX_WERD_LENGTH + 1];  // num fragments from which
                                                // each char was constructed
   float rating = rating_limit;
   A_CHOICE *best_choice;

   best_choice = new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);

   if (array_count (char_choices) <= MAX_WERD_LENGTH) {
     word[0] = '\0';
     unichar_lengths[0] = 0;
     fragment_lengths[0] = 0;
     unichar_offsets[0] = 0;
     result = number_permute (0, char_choices, 0, 0, &rating, word,
                              unichar_lengths, unichar_offsets,
                              0.0, 0.0, certainty_array, NULL, fragment_lengths);
     if (display_ratings && result)
       print_choices ("number_permuter", result);

     while (result != NIL) {
       if (best_rating (result) < class_rating (best_choice)) {
         clone_choice (best_choice, (A_CHOICE *) first_node (result));
       }
       free_choice (first_node (result));
       pop_off(result);
     }
   }
   return (best_choice);
 }
 }  // namespace tesseract


 /**********************************************************************
  * pure_number
  *
  * Check to see if this string is a pure number (one that does not end
  * with alphabetic characters).
  **********************************************************************/
 namespace tesseract {
 int Dict::pure_number(const char *string, const char *lengths) {
   int x;
   int offset;

   x = strlen (lengths) - 1;
   offset = strlen (string) - lengths[x];
   for (;x >= 0; offset -= lengths[--x]) {
     if (getUnicharset().get_isdigit (string + offset, lengths[x])) {
       return (TRUE);
     }
     else if (getUnicharset().get_isalpha (string + offset, lengths[x]))
       return (FALSE);
   }
   return (FALSE);
 }


 /**********************************************************************
  * valid_number
  *
  * Check this string to see if it is a valid number.  Return TRUE if
  * it is.
  **********************************************************************/
 int Dict::valid_number(const char *string, const char *lengths) {
   int state = 0;
   int char_index;
   int offset;
   int num_chars = strlen (lengths);
   int num_digits = 0;

   for (char_index = 0, offset = 0; char_index < num_chars;
        offset += lengths[char_index++]) {

     state = number_state_change (state, string + offset, lengths + char_index);
     if (state == -1)
       return (FALSE);
     if (getUnicharset().get_isdigit (string + offset, lengths[char_index]))
       num_digits++;
   }
   return num_digits > num_chars - num_digits;
 }
 }  // namespace tesseract
	/* --C--
	********************************************************************************
	*
	* File: permnum.c (Formerly permnum.c)
	* Description:
	* Author: Mark Seaman, OCR Technology
	* Created: Fri Oct 16 14:37:00 1987
	* Modified: Tue Jul 2 14:12:43 1991 (Mark Seaman) marks@hpgrlt
	* Language: C
	* Package: N/A
	* Status: Reusable Software Component
	*
	* (c) Copyright 1987, Hewlett-Packard Company.
	** Licensed under the Apache License, Version 2.0 (the "License");
	** you may not use this file except in compliance with the License.
	** You may obtain a copy of the License at
	** http://www.apache.org/licenses/LICENSE-2.0
	** Unless required by applicable law or agreed to in writing, software
	** distributed under the License is distributed on an "AS IS" BASIS,
	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	** See the License for the specific language governing permissions and
	** limitations under the License.
	*
	*********************************************************************************/
	/*----------------------------------------------------------------------
	I n c l u d e s
	----------------------------------------------------------------------*/
	#include "const.h"
	#include "permnum.h"
	#include "debug.h"
	#include "permute.h"
	#include "dawg.h"
	#include "tordvars.h"
	#include "stopper.h"
	#include "globals.h"
	#include "ndminx.h"
	#include "dict.h"
	#include "image.h"
	#include "ccutil.h"
	#include "conversion.h"

	#include <math.h>
	#include <ctype.h>

	/*----------------------------------------------------------------------
	V a r i a b l e s
	----------------------------------------------------------------------*/
	#if 0
	static const char *allowed_alpha_strs[] = {
	"jan", "feb", "mar", "apr", "may", "jun",
	"jul", "aug", "sep", "oct", "nov", "dec", NULL
	};
	#endif

	#if 0
	static const char *allowed_char_strs[] = {
	"adfjmnos", "aceopu", "bcglnrptvy"
	};
	#endif

	const int kNumStates = 7;

	static int number_state_table[kNumStates][8] = { {
	/* 0. Beginning of string */
	/* l d o a t 1 2 3 */
	0, 1, 1, -99, -99, 4, -99, -99
	},
	{ /* 1. After a digit or operator */
	-99, 1, 1, 3, 2, 4, 3, 3
	},
	{ /* 2. After trailing punctuation */
	-99, -99, 1, -99, 2, -99, -99, -99
	},
	{ /* 3. After a alpha character */
	-99, -99, 3, 3, 2, 3, 3, 3
	},
	{ /* 4. After 1st char */
	-99, -1, -1, -99, -2, -99, 5, -99
	},
	{ /* 5. After 2nd char */
	-99, -1, -1, -99, -2, -99, -99, 6
	},
	{ /* 6. After 3rd char */
	-99, -1, -1, -99, -2, -99, -99, -99
	}
	};

	// The state is coded with its true state shifted left by kStateShift.
	// A repeat count (starting with 0) is stored in the lower bits
	// No state is allowed to occur more than kMaxRepeats times.
	const int kStateShift = 4;
	const int kRepeatMask = (1 << kStateShift) - 1;

	const int kMaxRepeats[kNumStates] = {
	3, 10, 3, 3, 3, 3, 3
	};

	double_VAR(segment_penalty_number_good, GOOD_NUMBER,
	"Score multiplier for good-looking numbers "
	"(lower is better).");

	double_VAR(segment_penalty_number_ok, OK_NUMBER,
	"Score multiplier for ok-looking numbers "
	"(lower is better).");

	BOOL_VAR(number_debug, 0, "Segmentation number debug mode");

	INT_VAR(segment_digits_max, 3,
	"Maximum length of a number we will try to segment.");

	/*----------------------------------------------------------------------
	M a c r o s
	----------------------------------------------------------------------*/
	/**********************************************************************
	* isleading
	*
	* Return non-zero if this is a leading type punctuation mark for the
	* numeric grammar.
	**********************************************************************/

	#define isleading(ch) \
	((ch == '{' ) \|\| \
	(ch == '[' ) \|\| \
	(ch == '(' ) \|\| \
	(ch == '#' ) \|\| \
	(ch == '@' ) \|\| \
	(ch == '$' ))

	/**********************************************************************
	* istrailing
	*
	* Return non-zero if this is a leading type punctuation mark for the
	* numeric grammar.
	**********************************************************************/

	#define istrailing(ch) \
	((ch == '}' ) \|\| \
	(ch == ']' ) \|\| \
	(ch == ')' ) \|\| \
	(ch == ';' ) \|\| \
	(ch == ':' ) \|\| \
	(ch == ',' ) \|\| \
	(ch == '.' ) \|\| \
	(ch == '%' ))

	/**********************************************************************
	* isoperator
	*
	* Return non-zero if this is a leading type punctuation mark for the
	* numeric grammar.
	**********************************************************************/

	#define isoperator(ch) \
	((ch == '*' ) \|\| \
	(ch == '+' ) \|\| \
	(ch == '-' ) \|\| \
	(ch == '/' ) \|\| \
	(ch == '.' ) \|\| \
	(ch == ':' ) \|\| \
	(ch == ',' ))

	/*----------------------------------------------------------------------
	F u n c t i o n s
	----------------------------------------------------------------------*/
	/**********************************************************************
	* adjust_number
	*
	* Assign an adjusted value to a string that is a word. The value
	* that this word choice has is based on case and punctuation rules.
	**********************************************************************/
	namespace tesseract {
	void Dict::adjust_number(A_CHOICE best_choice, float certainty_array) {
	float adjust_factor;

	if (segment_adjust_debug)
	cprintf ("Number: %s %4.2f ",
	class_string (best_choice), class_rating (best_choice));

	class_rating (best_choice) += RATING_PAD;
	if (pure_number (class_string (best_choice), class_lengths (best_choice))) {
	class_rating (best_choice) *= segment_penalty_number_good;
	adjust_factor = segment_penalty_number_good;
	if (segment_adjust_debug)
	cprintf (", %4.2f ", (double)segment_penalty_number_good);
	}
	else {
	class_rating (best_choice) *= segment_penalty_number_ok;
	adjust_factor = segment_penalty_number_ok;
	if (segment_adjust_debug)
	cprintf (", N, %4.2f ", (double)segment_penalty_number_ok);
	}

	class_rating (best_choice) -= RATING_PAD;
	LogNewWordChoice(best_choice, adjust_factor,
	certainty_array, getUnicharset());
	if (segment_adjust_debug)
	cprintf (" --> %4.2f\n", class_rating (best_choice));
	}


	/**********************************************************************
	* append_number_choices
	*
	* Check to see whether or not the next choice is worth appending to
	* the string being generated. If so then keep going deeper into the
	* word.
	**********************************************************************/
	void Dict::append_number_choices(int state,
	char *word,
	char unichar_lengths[],
	int unichar_offsets[],
	CHOICES_LIST choices,
	int char_choice_index,
	int word_index,
	A_CHOICE *this_choice,
	float *limit,
	float rating,
	float certainty,
	float *certainty_array,
	const CHAR_FRAGMENT_INFO *prev_char_frag_info,
	char fragment_lengths[],
	CHOICES *result) {
	int x;
	int offset;
	CHAR_FRAGMENT_INFO char_frag_info;
	int word_ending =
	(char_choice_index == (array_count(choices) - 1)) ? TRUE : FALSE;
	const char *ch = NULL;

	/* Deal lwith fragments */
	if (!fragment_state_okay(
	getUnicharset().unichar_to_id(class_string(this_choice)),
	class_rating(this_choice), class_certainty(this_choice),
	prev_char_frag_info,
	(number_debug && (fragments_debug > 1)) ? "number_debug" : NULL,
	word_ending, &char_frag_info)) {
	return; // this_choice must be an invalid fragment
	}
	if (char_frag_info.unichar_id != INVALID_UNICHAR_ID) {
	ch = getUnicharset().id_to_unichar(char_frag_info.unichar_id);
	}
	if (ch == NULL) { // this character is a fragment
	JOIN_ON(*result, // so search the next letter
	number_permute(state, choices, char_choice_index + 1,
	word_index, limit, word, unichar_lengths,
	unichar_offsets, rating, certainty, certainty_array,
	&char_frag_info, fragment_lengths));
	return;
	}

	/* Add new character */
	strcpy(word + unichar_offsets[word_index], ch);

	unichar_lengths[word_index] = strlen(ch);
	unichar_lengths[word_index + 1] = 0;
	fragment_lengths[word_index] = char_frag_info.num_fragments;
	fragment_lengths[word_index + 1] = 0;
	unichar_offsets[word_index + 1] = unichar_offsets[word_index] +
	unichar_lengths[word_index];

	if (word[unichar_offsets[word_index]] == '\0') {
	word[unichar_offsets[word_index]] = ' ';
	word[unichar_offsets[word_index] + 1] = '\0';
	unichar_lengths[word_index] = 1;
	unichar_lengths[word_index + 1] = 0;
	fragment_lengths[word_index] = 1;
	fragment_lengths[word_index + 1] = 0;
	unichar_offsets[word_index + 1] = unichar_offsets[word_index] +
	unichar_lengths[word_index];
	}
	certainty_array[word_index] = char_frag_info.certainty;
	rating += char_frag_info.rating;
	certainty = MIN (char_frag_info.certainty, certainty);

	if (rating < *limit) {
	state = number_state_change (state, word + unichar_offsets[word_index],
	unichar_lengths + word_index);
	if (number_debug) {
	cprintf ("%s rating=%4.2f state=%d\n", word, rating, state);
	}

	if (state != -1) {

	if ((state >> kStateShift) == 3 &&
	word_index + 3 < array_count (choices)) {
	return;
	}

	if (word_ending) {
	for (x = 0, offset = 0; x <= word_index; offset += unichar_lengths[x++]) {
	if (getUnicharset().get_isdigit (
	word + offset, unichar_lengths[x])) {
	if (number_debug) cprintf ("new choice = %s\n", word);
	push_on (*result, new_choice (word, unichar_lengths, rating,
	certainty, -1, NULL, NUMBER_PERM,
	false, fragment_lengths));

	adjust_number ((A_CHOICE ) first_node (result), certainty_array);
	if (best_rating (result) > limit) {
	free_choice (first_node (*result));
	pop_off(*result);
	}
	else {
	limit = best_rating (result);
	break;
	}
	}
	}
	}
	else {
	JOIN_ON (*result,
	number_permute (state, choices, char_choice_index + 1,
	word_index + 1, limit, word, unichar_lengths,
	unichar_offsets, rating, certainty,
	certainty_array, &char_frag_info, fragment_lengths));
	}
	}
	}
	else {
	if (number_debug)
	cprintf ("pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
	word, rating, *limit);
	}
	}



	/**********************************************************************
	* number_character_type
	*
	* Decide which type of a character (with regard to the numeric state
	* table) we are looking at.
	**********************************************************************/
	int Dict::number_character_type( //current state
	const char* ch,
	int length,
	int state) {
	if (getUnicharset().get_isalpha (ch, length)) {
	#if 0
	if (state < 4
	&& strchr (allowed_char_strs[0], lower_char) != NULL)
	return 5;
	else if (state == 4
	&& strchr (allowed_char_strs[1], lower_char) != NULL)
	return 6;
	else if (state == 5
	&& strchr (allowed_char_strs[2], lower_char) != NULL)
	return 7;
	#endif
	return 3;
	}
	else if (getUnicharset().get_isdigit (ch, length))
	return (1);
	else if (length == 1 && isoperator (*ch))
	return (2);
	else if (length == 1 && istrailing (*ch))
	return (4);
	else if (length == 1 && isleading (*ch))
	return (0);
	else
	return (-1);
	}


	/**********************************************************************
	* number_state_change
	*
	* Execute a state transition according to the state table and
	* additional rules.
	**********************************************************************/
	int Dict::number_state_change(int state, //current state
	const char *word, //current char
	const char *lengths) { //length of current char
	int char_type; //type of char
	int new_state; //state to return
	int old_state = state >> kStateShift;
	int repeats = state & kRepeatMask;
	#if 0
	int index;
	char copy_word[4]; //tolowered chars
	#endif

	char_type = number_character_type (word, *lengths, old_state);
	if (char_type == -1)
	return -1;
	new_state = number_state_table[old_state][char_type];
	if (new_state == old_state) {
	++repeats;
	if (repeats >= kMaxRepeats[old_state])
	return -1;
	} else {
	repeats = 0;
	}
	if (new_state >= 0)
	return (new_state << kStateShift) \| repeats;
	if (new_state == -99)
	return -1;

	//now check to see if the last state-3 chars in the word
	//make an allowable word. For now only 3 letter words
	//are allowed
	if (old_state != 6)
	return -1; //only 3 letters now
	#if 0
	copy_word[0] = tolower (word[-3]);
	copy_word[1] = tolower (word[-2]);
	copy_word[2] = tolower (word[-1]);
	copy_word[3] = '\0';
	for (index = 0; allowed_alpha_strs[index] != NULL; index++) {
	if (strcmp (copy_word, allowed_alpha_strs[index]) == 0)
	return (-new_state) << kStateShift;
	}
	#endif
	return -1; //not a good word
	}


	/**********************************************************************
	* number_permute
	*
	* Permute all the valid string that match the 'grammar' of numbers.
	* The valid syntax for numbers is encoded in a state table. The
	* permuter uses this state table to enumerate all the string that
	* can be produced using the input choices.
	**********************************************************************/
	CHOICES Dict::number_permute(int state,
	CHOICES_LIST choices,
	int char_choice_index,
	int word_index,
	float *limit,
	char *word,
	char unichar_lengths[],
	int unichar_offsets[],
	float rating,
	float certainty,
	float *certainty_array,
	const CHAR_FRAGMENT_INFO *prev_char_frag_info,
	char fragment_lengths[]) {
	CHOICES result = NIL;
	CHOICES c;
	int depth = 0;

	if (number_debug) {
	cprintf ("number_permute (state=%d, char_choice_index=%d,"
	" word_index=%d, limit=%4.2f, ", state, char_choice_index,
	word_index, *limit);
	cprintf ("word=%s, rating=%4.2f, certainty=%4.2f)\n",
	word, rating, certainty);
	}
	if (char_choice_index < array_count (choices)) {
	iterate_list (c, (CHOICES) array_index (choices, char_choice_index)) {
	if (depth++ < segment_digits_max)
	append_number_choices (state, word, unichar_lengths, unichar_offsets,
	choices, char_choice_index, word_index,
	(A_CHOICE *) first_node (c), limit, rating,
	certainty, certainty_array, prev_char_frag_info,
	fragment_lengths, &result);
	}
	}
	if (result && number_debug == 1)
	print_choices ("number_permute:", result);
	return (result);
	}


	/**********************************************************************
	* number_permute_and_select
	*
	* Permute all the possible valid numbers and adjust their ratings.
	* Save the best rating.
	**********************************************************************/
	A_CHOICE *Dict::number_permute_and_select(CHOICES_LIST char_choices,
	float rating_limit) {
	CHOICES result = NIL;
	char word[UNICHAR_LEN * MAX_WERD_LENGTH + 1];
	char unichar_lengths[MAX_WERD_LENGTH + 1];
	int unichar_offsets[MAX_WERD_LENGTH + 1];
	float certainty_array[MAX_WERD_LENGTH + 1];
	char fragment_lengths[MAX_WERD_LENGTH + 1]; // num fragments from which
	// each char was constructed
	float rating = rating_limit;
	A_CHOICE *best_choice;

	best_choice = new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);

	if (array_count (char_choices) <= MAX_WERD_LENGTH) {
	word[0] = '\0';
	unichar_lengths[0] = 0;
	fragment_lengths[0] = 0;
	unichar_offsets[0] = 0;
	result = number_permute (0, char_choices, 0, 0, &rating, word,
	unichar_lengths, unichar_offsets,
	0.0, 0.0, certainty_array, NULL, fragment_lengths);
	if (display_ratings && result)
	print_choices ("number_permuter", result);

	while (result != NIL) {
	if (best_rating (result) < class_rating (best_choice)) {
	clone_choice (best_choice, (A_CHOICE *) first_node (result));
	}
	free_choice (first_node (result));
	pop_off(result);
	}
	}
	return (best_choice);
	}
	} // namespace tesseract


	/**********************************************************************
	* pure_number
	*
	* Check to see if this string is a pure number (one that does not end
	* with alphabetic characters).
	**********************************************************************/
	namespace tesseract {
	int Dict::pure_number(const char string, const char lengths) {
	int x;
	int offset;

	x = strlen (lengths) - 1;
	offset = strlen (string) - lengths[x];
	for (;x >= 0; offset -= lengths[--x]) {
	if (getUnicharset().get_isdigit (string + offset, lengths[x])) {
	return (TRUE);
	}
	else if (getUnicharset().get_isalpha (string + offset, lengths[x]))
	return (FALSE);
	}
	return (FALSE);
	}


	/**********************************************************************
	* valid_number
	*
	* Check this string to see if it is a valid number. Return TRUE if
	* it is.
	**********************************************************************/
	int Dict::valid_number(const char string, const char lengths) {
	int state = 0;
	int char_index;
	int offset;
	int num_chars = strlen (lengths);
	int num_digits = 0;

	for (char_index = 0, offset = 0; char_index < num_chars;
	offset += lengths[char_index++]) {

	state = number_state_change (state, string + offset, lengths + char_index);
	if (state == -1)
	return (FALSE);
	if (getUnicharset().get_isdigit (string + offset, lengths[char_index]))
	num_digits++;
	}
	return num_digits > num_chars - num_digits;
	}
	} // namespace tesseract