blob: 577620f84788a94c0e4196eff3afab20a498b111 [file] [log] [blame]
/* -*-C-*-
 ********************************************************************************
 *
 * File:        permnum.c  (Formerly permnum.c)
 * Description:
 * Author:      Mark Seaman, OCR Technology
 * Created:     Fri Oct 16 14:37:00 1987
 * Modified:    Tue Jul  2 14:12:43 1991 (Mark Seaman) marks@hpgrlt
 * Language:    C
 * Package:     N/A
 * Status:      Reusable Software Component
 *
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 ********************************************************************************/
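/*----------------------------------------------------------------------
              I n c l u d e s
----------------------------------------------------------------------*/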
#include "const.h"
#include "permnum.h"
#include "debug.h"
#include "permute.h"
#include "dawg.h"
#include "tordvars.h"
#include "stopper.h"
#include "globals.h"
#include "ndminx.h"
#include "dict.h"
#include "image.h"
#include "ccutil.h"
#include "conversion.h"
#include <math.h>
#include <ctype.h>
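/*----------------------------------------------------------------------
              V a r i a b l e s
----------------------------------------------------------------------*/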
// Lower-case three-letter month abbreviations, NULL-terminated.  Only the
// compiled-out matching code in number_state_change refers to this list, so
// the list itself is compiled out as well.
#if 0
static const char *allowed_alpha_strs[] = {
  "jan", "feb", "mar", "apr", "may", "jun",
  "jul", "aug", "sep", "oct", "nov", "dec", NULL
};
#endif

// Letters permitted as the 1st, 2nd and 3rd character of one of the
// abbreviations above (each string is the set of letters found at that
// position).  Only the compiled-out code in number_character_type uses it.
#if 0
static const char *allowed_char_strs[] = {
  "adfjmnos", "aceopu", "bcglnrptvy"
};
#endif
// Number of rows (states) in the numeric grammar.
const int kNumStates = 7;

// Transition table of the numeric grammar: number_state_table[state][type]
// is the state reached from `state` on a character of class `type`.
// Column order (the value returned by number_character_type):
//   l = 0 leading punctuation      d = 1 digit        o = 2 operator
//   a = 3 alphabetic               t = 4 trailing punctuation
//   1, 2, 3 = 5, 6, 7: first/second/third letter of an allowed alpha string
//             (only produced by code that is currently compiled out).
// An entry of -99 rejects the string.  Other negative entries are resolved
// by number_state_change, which presently rejects them too.
static int number_state_table[kNumStates][8] = {
  {                              /* 0. Beginning of string */
    /* l    d    o    a    t    1    2    3 */
    0, 1, 1, -99, -99, 4, -99, -99
  },
  {                              /* 1. After a digit or operator */
    -99, 1, 1, 3, 2, 4, 3, 3
  },
  {                              /* 2. After trailing punctuation */
    -99, -99, 1, -99, 2, -99, -99, -99
  },
  {                              /* 3. After an alpha character */
    -99, -99, 3, 3, 2, 3, 3, 3
  },
  {                              /* 4. After 1st char */
    -99, -1, -1, -99, -2, -99, 5, -99
  },
  {                              /* 5. After 2nd char */
    -99, -1, -1, -99, -2, -99, -99, 6
  },
  {                              /* 6. After 3rd char */
    -99, -1, -1, -99, -2, -99, -99, -99
  }
};

// The state is coded with its true state shifted left by kStateShift.
// A repeat count (starting with 0) is stored in the lower bits.
// No state is allowed to occur more than kMaxRepeats times.
const int kStateShift = 4;
const int kRepeatMask = (1 << kStateShift) - 1;
// Per-state repeat limit, indexed by the true (unshifted) state.
const int kMaxRepeats[kNumStates] = {
  3, 10, 3, 3, 3, 3, 3
};
// Rating multiplier that adjust_number applies when pure_number() accepts
// the string.
double_VAR(segment_penalty_number_good, GOOD_NUMBER,
"Score multiplier for good-looking numbers "
"(lower is better).");
// Rating multiplier that adjust_number applies to every other number string.
double_VAR(segment_penalty_number_ok, OK_NUMBER,
"Score multiplier for ok-looking numbers "
"(lower is better).");
// Gates the cprintf tracing in append_number_choices and number_permute.
BOOL_VAR(number_debug, 0, "Segmentation number debug mode");
// NOTE(review): number_permute compares this against the count of
// alternatives it has visited at one character position, not against the
// string length the description mentions -- confirm which meaning is intended.
INT_VAR(segment_digits_max, 3,
"Maximum length of a number we will try to segment.");
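/*----------------------------------------------------------------------
              M a c r o s
----------------------------------------------------------------------*/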
/**********************************************************************
 * isleading
 *
 * Return non-zero if this is a leading type punctuation mark for the
 * numeric grammar.
 *
 * The argument is parenthesised so that an expression such as
 * isleading(c & 0x7F) compares the whole expression.  It is evaluated
 * several times, so do not pass an expression with side effects.
 **********************************************************************/
#define isleading(ch)     \
  (((ch) == '{') ||       \
   ((ch) == '[') ||       \
   ((ch) == '(') ||       \
   ((ch) == '#') ||       \
   ((ch) == '@') ||       \
   ((ch) == '$'))
/**********************************************************************
 * istrailing
 *
 * Return non-zero if this is a trailing type punctuation mark for the
 * numeric grammar.
 *
 * The argument is parenthesised so that an expression such as
 * istrailing(c & 0x7F) compares the whole expression.  It is evaluated
 * several times, so do not pass an expression with side effects.
 **********************************************************************/
#define istrailing(ch)    \
  (((ch) == '}') ||       \
   ((ch) == ']') ||       \
   ((ch) == ')') ||       \
   ((ch) == ';') ||       \
   ((ch) == ':') ||       \
   ((ch) == ',') ||       \
   ((ch) == '.') ||       \
   ((ch) == '%'))
/**********************************************************************
 * isoperator
 *
 * Return non-zero if this is an operator type punctuation mark for the
 * numeric grammar.
 *
 * The argument is parenthesised so that an expression such as
 * isoperator(c & 0x7F) compares the whole expression.  It is evaluated
 * several times, so do not pass an expression with side effects.
 **********************************************************************/
#define isoperator(ch)    \
  (((ch) == '*') ||       \
   ((ch) == '+') ||       \
   ((ch) == '-') ||       \
   ((ch) == '/') ||       \
   ((ch) == '.') ||       \
   ((ch) == ':') ||       \
   ((ch) == ','))
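/*----------------------------------------------------------------------
              F u n c t i o n s
----------------------------------------------------------------------*/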
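/**********************************************************************
 * adjust_number
 *
 * Assign an adjusted value to a string that is a number.  The rating
 * is scaled by segment_penalty_number_good when the string is a pure
 * number and by segment_penalty_number_ok otherwise.
 **********************************************************************/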
namespace tesseract {
// Rescales the rating of best_choice in place and logs the result.
//   best_choice     - choice whose rating is modified.
//   certainty_array - forwarded unchanged to LogNewWordChoice.
// RATING_PAD is added before the multiplication and removed afterwards, so
// the multiplier acts on the padded rating.
void Dict::adjust_number(A_CHOICE *best_choice, float *certainty_array) {
  float adjust_factor;

  if (segment_adjust_debug)
    cprintf ("Number: %s %4.2f ",
             class_string (best_choice), class_rating (best_choice));

  class_rating (best_choice) += RATING_PAD;
  if (pure_number (class_string (best_choice), class_lengths (best_choice))) {
    class_rating (best_choice) *= segment_penalty_number_good;
    adjust_factor = segment_penalty_number_good;
    if (segment_adjust_debug)
      cprintf (", %4.2f ", (double)segment_penalty_number_good);
  }
  else {
    class_rating (best_choice) *= segment_penalty_number_ok;
    adjust_factor = segment_penalty_number_ok;
    if (segment_adjust_debug)
      cprintf (", N, %4.2f ", (double)segment_penalty_number_ok);
  }
  class_rating (best_choice) -= RATING_PAD;

  LogNewWordChoice(best_choice, adjust_factor,
                   certainty_array, getUnicharset());
  if (segment_adjust_debug)
    cprintf (" --> %4.2f\n", class_rating (best_choice));
}
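/**********************************************************************
 * append_number_choices
 *
 * Check to see whether or not the next choice is worth appending to
 * the string being generated.  If so then keep going deeper into the
 * word.
 **********************************************************************/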
// Extends the partially built number string `word` with the character
// proposed by this_choice.  If the running rating is still under *limit and
// the numeric state machine accepts the character, it either records a
// finished word in *result or recurses into number_permute for the next
// position.
// NOTE(review): several lines of this function appear to be missing from
// this copy.  `choices` is used below but is not in the parameter list (the
// call in number_permute passes it between unichar_offsets and
// char_choice_index), two assignments end on a dangling `+`, and the braces
// do not balance.  Restore the function from a clean copy before relying
// on it.
void Dict::append_number_choices(int state,
char *word,
char unichar_lengths[],
int unichar_offsets[],
int char_choice_index,
int word_index,
A_CHOICE *this_choice,
float *limit,
float rating,
float certainty,
float *certainty_array,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
char fragment_lengths[],
CHOICES *result) {
int x;
int offset;
CHAR_FRAGMENT_INFO char_frag_info;
// TRUE when char_choice_index is the last entry of choices.
int word_ending =
(char_choice_index == (array_count(choices) - 1)) ? TRUE : FALSE;
const char *ch = NULL;
/* Deal with fragments */
if (!fragment_state_okay(
class_rating(this_choice), class_certainty(this_choice),
(number_debug && (fragments_debug > 1)) ? "number_debug" : NULL,
word_ending, &char_frag_info)) {
return; // this_choice must be an invalid fragment
// ch stays NULL unless char_frag_info carries a valid unichar id.
if (char_frag_info.unichar_id != INVALID_UNICHAR_ID) {
ch = getUnicharset().id_to_unichar(char_frag_info.unichar_id);
if (ch == NULL) { // this character is a fragment
// Recurse with the same word_index: nothing was appended to word yet.
JOIN_ON(*result, // so search the next letter
number_permute(state, choices, char_choice_index + 1,
word_index, limit, word, unichar_lengths,
unichar_offsets, rating, certainty, certainty_array,
&char_frag_info, fragment_lengths));
/* Add new character */
strcpy(word + unichar_offsets[word_index], ch);
unichar_lengths[word_index] = strlen(ch);
unichar_lengths[word_index + 1] = 0;
fragment_lengths[word_index] = char_frag_info.num_fragments;
fragment_lengths[word_index + 1] = 0;
// NOTE(review): the right-hand operand of this sum is missing in this copy.
unichar_offsets[word_index + 1] = unichar_offsets[word_index] +
// An empty character string is replaced by a single space.
if (word[unichar_offsets[word_index]] == '\0') {
word[unichar_offsets[word_index]] = ' ';
word[unichar_offsets[word_index] + 1] = '\0';
unichar_lengths[word_index] = 1;
unichar_lengths[word_index + 1] = 0;
fragment_lengths[word_index] = 1;
fragment_lengths[word_index + 1] = 0;
// NOTE(review): the right-hand operand of this sum is missing in this copy.
unichar_offsets[word_index + 1] = unichar_offsets[word_index] +
// Accumulate the rating and keep the lowest certainty seen so far.
certainty_array[word_index] = char_frag_info.certainty;
rating += char_frag_info.rating;
certainty = MIN (char_frag_info.certainty, certainty);
// Only continue while the accumulated rating is below the current limit.
if (rating < *limit) {
state = number_state_change (state, word + unichar_offsets[word_index],
unichar_lengths + word_index);
if (number_debug) {
cprintf ("%s rating=%4.2f state=%d\n", word, rating, state);
// -1 is the rejection value returned by number_state_change.
if (state != -1) {
// NOTE(review): the body of this condition is not visible in this copy.
if ((state >> kStateShift) == 3 &&
word_index + 3 < array_count (choices)) {
if (word_ending) {
// Scan the characters collected so far for a digit.
for (x = 0, offset = 0; x <= word_index; offset += unichar_lengths[x++]) {
if (getUnicharset().get_isdigit (
word + offset, unichar_lengths[x])) {
if (number_debug) cprintf ("new choice = %s\n", word);
push_on (*result, new_choice (word, unichar_lengths, rating,
certainty, -1, NULL, NUMBER_PERM,
false, fragment_lengths));
adjust_number ((A_CHOICE *) first_node (*result), certainty_array);
// The new choice is freed if its adjusted rating exceeds *limit; otherwise
// *limit is lowered to that rating.
if (best_rating (*result) > *limit) {
free_choice (first_node (*result));
else {
*limit = best_rating (*result);
else {
// Not at the end of the word: move on to the next character position.
JOIN_ON (*result,
number_permute (state, choices, char_choice_index + 1,
word_index + 1, limit, word, unichar_lengths,
unichar_offsets, rating, certainty,
certainty_array, &char_frag_info, fragment_lengths));
else {
if (number_debug)
cprintf ("pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
word, rating, *limit);
* number_character_type
* Decide which type of a character (with regard to the numeric state
* table) we are looking at.
int Dict::number_character_type( //current state
const char* ch,
int length,
int state) {
if (getUnicharset().get_isalpha (ch, length)) {
#if 0
if (state < 4
&& strchr (allowed_char_strs[0], lower_char) != NULL)
return 5;
else if (state == 4
&& strchr (allowed_char_strs[1], lower_char) != NULL)
return 6;
else if (state == 5
&& strchr (allowed_char_strs[2], lower_char) != NULL)
return 7;
return 3;
else if (getUnicharset().get_isdigit (ch, length))
return (1);
else if (length == 1 && isoperator (*ch))
return (2);
else if (length == 1 && istrailing (*ch))
return (4);
else if (length == 1 && isleading (*ch))
return (0);
return (-1);
* number_state_change
* Execute a state transition according to the state table and
* additional rules.
int Dict::number_state_change(int state, //current state
const char *word, //current char
const char *lengths) { //length of current char
int char_type; //type of char
int new_state; //state to return
int old_state = state >> kStateShift;
int repeats = state & kRepeatMask;
#if 0
int index;
char copy_word[4]; //tolowered chars
char_type = number_character_type (word, *lengths, old_state);
if (char_type == -1)
return -1;
new_state = number_state_table[old_state][char_type];
if (new_state == old_state) {
if (repeats >= kMaxRepeats[old_state])
return -1;
} else {
repeats = 0;
if (new_state >= 0)
return (new_state << kStateShift) | repeats;
if (new_state == -99)
return -1;
//now check to see if the last state-3 chars in the word
//make an allowable word. For now only 3 letter words
//are allowed
if (old_state != 6)
return -1; //only 3 letters now
#if 0
copy_word[0] = tolower (word[-3]);
copy_word[1] = tolower (word[-2]);
copy_word[2] = tolower (word[-1]);
copy_word[3] = '\0';
for (index = 0; allowed_alpha_strs[index] != NULL; index++) {
if (strcmp (copy_word, allowed_alpha_strs[index]) == 0)
return (-new_state) << kStateShift;
return -1; //not a good word
* number_permute
* Permute all the valid string that match the 'grammar' of numbers.
* The valid syntax for numbers is encoded in a state table. The
* permuter uses this state table to enumerate all the string that
* can be produced using the input choices.
CHOICES Dict::number_permute(int state,
int char_choice_index,
int word_index,
float *limit,
char *word,
char unichar_lengths[],
int unichar_offsets[],
float rating,
float certainty,
float *certainty_array,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
char fragment_lengths[]) {
CHOICES result = NIL;
int depth = 0;
if (number_debug) {
cprintf ("number_permute (state=%d, char_choice_index=%d,"
" word_index=%d, limit=%4.2f, ", state, char_choice_index,
word_index, *limit);
cprintf ("word=%s, rating=%4.2f, certainty=%4.2f)\n",
word, rating, certainty);
if (char_choice_index < array_count (choices)) {
iterate_list (c, (CHOICES) array_index (choices, char_choice_index)) {
if (depth++ < segment_digits_max)
append_number_choices (state, word, unichar_lengths, unichar_offsets,
choices, char_choice_index, word_index,
(A_CHOICE *) first_node (c), limit, rating,
certainty, certainty_array, prev_char_frag_info,
fragment_lengths, &result);
if (result && number_debug == 1)
print_choices ("number_permute:", result);
return (result);
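/**********************************************************************
 * number_permute_and_select
 *
 * Permute all the possible valid numbers and adjust their ratings.
 * Save the best rating.
 **********************************************************************/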
// Runs number_permute over char_choices starting from state 0 with
// rating_limit as the initial limit, then returns a choice holding a copy of
// the best-rated result.
// NOTE(review): this copy appears to be missing lines.  `word` is written
// and passed below but never declared, the drain loop at the end contains no
// visible statement that advances `result`, and the braces do not balance.
// Restore the function from a clean copy before relying on it.
A_CHOICE *Dict::number_permute_and_select(CHOICES_LIST char_choices,
float rating_limit) {
CHOICES result = NIL;
char unichar_lengths[MAX_WERD_LENGTH + 1];
int unichar_offsets[MAX_WERD_LENGTH + 1];
float certainty_array[MAX_WERD_LENGTH + 1];
char fragment_lengths[MAX_WERD_LENGTH + 1]; // num fragments from which
// each char was constructed
// Passed by address as the limit argument of number_permute.
float rating = rating_limit;
A_CHOICE *best_choice;
// Placeholder whose rating (MAXFLOAT) loses to any real result.
best_choice = new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
// Permute only when the choice list fits the fixed-size buffers above.
if (array_count (char_choices) <= MAX_WERD_LENGTH) {
word[0] = '\0';
unichar_lengths[0] = 0;
fragment_lengths[0] = 0;
unichar_offsets[0] = 0;
result = number_permute (0, char_choices, 0, 0, &rating, word,
unichar_lengths, unichar_offsets,
0.0, 0.0, certainty_array, NULL, fragment_lengths);
if (display_ratings && result)
print_choices ("number_permuter", result);
// Copy a result into best_choice when it rates lower, then free it.
while (result != NIL) {
if (best_rating (result) < class_rating (best_choice)) {
clone_choice (best_choice, (A_CHOICE *) first_node (result));
free_choice (first_node (result));
return (best_choice);
} // namespace tesseract
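/**********************************************************************
 * pure_number
 *
 * Check to see if this string is a pure number (one that does not end
 * with alphabetic characters).
 **********************************************************************/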
namespace tesseract {
// Scans the characters of `string` from last to first and decides on the
// first one that is a digit or a letter.
//   string  - concatenated characters of the word.
//   lengths - NUL-terminated array holding each character's byte length.
// Returns TRUE if a digit is met first, FALSE if a letter is met first or if
// the string contains neither (including the empty string).
int Dict::pure_number(const char *string, const char *lengths) {
  const int num_chars = strlen (lengths);
  // offset always points at the start of character x.  It is stepped back
  // inside the loop so that lengths[] is never read at index -1.
  int offset = strlen (string);

  for (int x = num_chars - 1; x >= 0; --x) {
    offset -= lengths[x];
    if (getUnicharset().get_isdigit (string + offset, lengths[x]))
      return (TRUE);
    if (getUnicharset().get_isalpha (string + offset, lengths[x]))
      return (FALSE);
  }
  return (FALSE);
}
* valid_number
* Check this string to see if it is a valid number. Return TRUE if
* it is.
int Dict::valid_number(const char *string, const char *lengths) {
int state = 0;
int char_index;
int offset;
int num_chars = strlen (lengths);
int num_digits = 0;
for (char_index = 0, offset = 0; char_index < num_chars;
offset += lengths[char_index++]) {
state = number_state_change (state, string + offset, lengths + char_index);
if (state == -1)
return (FALSE);
if (getUnicharset().get_isdigit (string + offset, lengths[char_index]))
return num_digits > num_chars - num_digits;
} // namespace tesseract