blob: d5ace119d150da1a665b2eca421505f58b2de4a2 [file] [log] [blame]
/******************************************************************************
** Filename: stopper.c
** Purpose: Stopping criteria for word classifier.
** Author: Dan Johnson
** History: Mon Apr 29 14:56:49 1991, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "stopper.h"
#include "emalloc.h"
#include "matchdefs.h"
#include "debug.h"
#include "callcpp.h"
#include "permute.h"
#include "context.h"
#include "permnum.h"
#include "danerror.h"
#include "const.h"
#include "freelist.h"
#include "efio.h"
#include "globals.h"
#include "scanutils.h"
#include "unichar.h"
#include "varable.h"
#include "dict.h"
#include "image.h"
#include "ccutil.h"
#include "ratngs.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#ifdef __UNIX__
#include <assert.h>
#endif
/* these are kludges - add appropriate .h file later */
/* from adaptmatch.cpp */
double_VAR(certainty_scale, 20.0, "Certainty scaling factor");
#define MAX_WERD_SIZE 100
#define MAX_AMBIG_SIZE 3
#define DANGEROUS_AMBIGS "DangAmbigs"
typedef LIST AMBIG_TABLE;
typedef struct
{
VIABLE_CHOICE Choice;
float ChunkCertainty[MAX_NUM_CHUNKS];
UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
} EXPANDED_CHOICE;
typedef struct
{
char ambig[2 * (UNICHAR_LEN * MAX_AMBIG_SIZE) + 2];
char lengths[2 * (MAX_AMBIG_SIZE) + 2];
} AMBIG_SPEC;
/**----------------------------------------------------------------------------
Macros
----------------------------------------------------------------------------**/
#define BestCertainty(Choices) (((VIABLE_CHOICE) first_node (Choices))->Certainty)
#define BestRating(Choices) (((VIABLE_CHOICE) first_node (Choices))->Rating)
#define BestFactor(Choices) (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)
#define AmbigThreshold(F1,F2) (((F2) - (F1)) * stopper_ambiguity_threshold_gain - \
stopper_ambiguity_threshold_offset)
/*---------------------------------------------------------------------------
Private Function Prototoypes
----------------------------------------------------------------------------*/
void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1,
void *arg2); //VIABLE_CHOICE Choice2);
void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice);
int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice,
void *item2); //EXPANDED_CHOICE *BestChoice);
int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
const WERD_CHOICE &BestChoice);
/**----------------------------------------------------------------------------
Global Data Definitions and Declarations
----------------------------------------------------------------------------**/
/* Name of file containing potentially dangerous ambiguities */
static const char *DangerousAmbigs = DANGEROUS_AMBIGS;
/* Word for which stopper debug information should be printed to stdout */
static char *WordToDebug = NULL;
static char *WordToDebug_lengths = NULL;
/* flag used to disable accumulation of word choices during compound word
permutation */
BOOL8 KeepWordChoices = TRUE;
/* additional certainty padding allowed before a word is rejected */
static FLOAT32 RejectOffset = 0.0;
/* structures to keep track of viable word choices */
static VIABLE_CHOICE BestRawChoice = NULL;
static LIST BestChoices = NIL;
static PIECES_STATE CurrentSegmentation;
double_VAR(stopper_nondict_certainty_base, -2.50,
"Certainty threshold for non-dict words");
double_VAR(stopper_phase2_certainty_rejection_offset, 1.0,
"Reject certainty offset");
INT_VAR(stopper_smallword_size, 2,
"Size of dict word to be treated as non-dict word");
double_VAR(stopper_certainty_per_char, -0.50,
"Certainty to add for each dict char above small word size.");
double_VAR(stopper_allowable_character_badness, 3.0,
"Max certaintly variation allowed in a word (in sigma)");
INT_VAR(stopper_debug_level, 0,
"Stopper debug level");
double_VAR(stopper_ambiguity_threshold_gain, 8.0,
"Gain factor for ambiguity threshold");
double_VAR(stopper_ambiguity_threshold_offset, 1.5,
"Certainty offset for ambiguity threshold");
extern int first_pass;
INT_VAR (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
/**----------------------------------------------------------------------------
Public Code
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
namespace tesseract {
int Dict::AcceptableChoice(const BLOB_CHOICE_LIST_VECTOR &Choices,
const WERD_CHOICE &BestChoice,
const WERD_CHOICE &RawChoice,
DANGERR *fixpt,
ACCEPTABLE_CHOICE_CALLER caller) {
/*
** Parameters:
** Choices choices for current segmentation
** BestChoice best choice for current segmentation
** RawChoice best raw choice for current segmentation
** Globals:
** stopper_nondict_certainty_base certainty for a non-dict word
** stopper_smallword_size size of word to be treated as non-word
** stopper_certainty_per_char certainty to add for each dict char
** Operation: Return TRUE if the results from this segmentation are
** good enough to stop. Otherwise return FALSE.
** Return: TRUE or FALSE.
** Exceptions: none
** History: Mon Apr 29 14:57:32 1991, DSJ, Created.
*/
float CertaintyThreshold = stopper_nondict_certainty_base;
int WordSize;
if (fixpt != NULL)
fixpt->index = -1;
if (BestChoice.length() == 0)
return (FALSE);
if (caller == CHOPPER_CALLER && BestChoice.fragment_mark()) {
if (stopper_debug_level >= 1) {
cprintf("AcceptableChoice(): a choice with fragments beats BestChoice");
}
return false;
}
// TODO(daria): remove this conversion once dawg (valid_word)
// is switched to use unichar ids, valid_number is deprecated
// and DanAmbigs are fixed to work with unichar ids.
STRING word_str = BestChoice.unichar_string();
STRING word_lengths_str = BestChoice.unichar_lengths();
if (stopper_debug_level >= 1)
cprintf ("\nStopper: %s (word=%c, case=%c, punct=%c)\n",
word_str.string(),
(valid_word(word_str.string()) ? 'y' : 'n'),
(case_ok(word_str.string(), word_lengths_str.string()) ? 'y' : 'n'),
((punctuation_ok(word_str.string(),
word_lengths_str.string()) != -1) ? 'y' : 'n'));
if (valid_word(word_str.string()) &&
case_ok(word_str.string(), word_lengths_str.string()) &&
punctuation_ok(word_str.string(), word_lengths_str.string()) != -1) {
WordSize = LengthOfShortestAlphaRun(BestChoice);
WordSize -= stopper_smallword_size;
if (WordSize < 0)
WordSize = 0;
CertaintyThreshold += WordSize * stopper_certainty_per_char;
} else if (stopper_numbers_on &&
valid_number(word_str.string(), word_lengths_str.string())) {
CertaintyThreshold += stopper_numbers_on * stopper_certainty_per_char;
}
if (stopper_debug_level >= 1)
cprintf ("Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
BestChoice.certainty(), CertaintyThreshold);
if (NoDangerousAmbig(word_str.string(), word_lengths_str.string(), fixpt) &&
BestChoice.certainty() > CertaintyThreshold &&
UniformCertainties(Choices, BestChoice)) {
return (TRUE);
} else {
return (FALSE);
}
} /* AcceptableChoice */
/*---------------------------------------------------------------------------*/
int Dict::AcceptableResult(const WERD_CHOICE &BestChoice,
const WERD_CHOICE &RawChoice) {
/*
** Parameters:
** BestChoice best choice for current word
** RawChoice best raw choice for current word
** Globals:
** stopper_nondict_certainty_base certainty for a non-dict word
** stopper_smallword_size size of word to be treated as non-word
** stopper_certainty_per_char certainty to add for each dict char
** BestChoices list of all good choices found
** RejectOffset allowed offset before a word is rejected
** Operation: Return FALSE if the best choice for the current word
** is questionable and should be tried again on the second
** pass or should be flagged to the user.
** Return: TRUE or FALSE.
** Exceptions: none
** History: Thu May 9 14:05:05 1991, DSJ, Created.
*/
float CertaintyThreshold = stopper_nondict_certainty_base - RejectOffset;
int WordSize;
// TODO(daria): remove this conversion once dawg (valid_word)
// is switched to use unichar ids, valid_number is deprecated
// and DanAmbigs are fixed to work with unichar ids.
STRING word_str = BestChoice.unichar_string();
STRING word_lengths_str = BestChoice.unichar_lengths();
if (stopper_debug_level >= 1)
cprintf ("\nRejecter: %s (word=%c, case=%c, punct=%c, unambig=%c)\n",
word_str.string(),
(valid_word(word_str.string()) ? 'y' : 'n'),
(case_ok(word_str.string(), word_lengths_str.string()) ? 'y' : 'n'),
((punctuation_ok(word_str.string(),
word_lengths_str.string()) != -1) ? 'y' : 'n'),
((rest (BestChoices) != NIL) ? 'n' : 'y'));
if (BestChoice.length() == 0 || CurrentWordAmbig())
return (FALSE);
if (BestChoice.fragment_mark()) {
if (stopper_debug_level >= 1) {
cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n");
}
return false;
}
if (valid_word(word_str.string()) &&
case_ok(word_str.string(), word_lengths_str.string()) &&
(punctuation_ok(word_str.string(), word_lengths_str.string())) != -1) {
WordSize = LengthOfShortestAlphaRun(BestChoice);
WordSize -= stopper_smallword_size;
if (WordSize < 0)
WordSize = 0;
CertaintyThreshold += WordSize * stopper_certainty_per_char;
}
if (stopper_debug_level >= 1)
cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
BestChoice.certainty(), CertaintyThreshold);
if (BestChoice.certainty() > CertaintyThreshold) {
if (stopper_debug_level >= 1)
cprintf("ACCEPTED\n");
return (TRUE);
}
else {
if (stopper_debug_level >= 1)
cprintf("REJECTED\n");
return (FALSE);
}
} /* AcceptableResult */
} // namespace tesseract
/*---------------------------------------------------------------------------*/
int AlternativeChoicesWorseThan(FLOAT32 Threshold) {
/*
** Parameters:
** Threshold minimum adjust factor for alternative choices
** Globals:
** BestChoices alternative choices for current word
** Operation: This routine returns TRUE if there are no alternative
** choices for the current word OR if all alternatives have
** an adjust factor worse than Threshold.
** Return: TRUE or FALSE.
** Exceptions: none
** History: Mon Jun 3 09:36:31 1991, DSJ, Created.
*/
LIST Alternatives;
VIABLE_CHOICE Choice;
Alternatives = rest (BestChoices);
iterate(Alternatives) {
Choice = (VIABLE_CHOICE) first_node (Alternatives);
if (Choice->AdjustFactor <= Threshold)
return (FALSE);
}
return (TRUE);
} /* AlternativeChoicesWorseThan */
namespace tesseract {
/*---------------------------------------------------------------------------*/
int Dict::CurrentBestChoiceIs(const WERD_CHOICE &WordChoice) {
/*
** Parameters:
** Word word that will be compared to the best choice
** Globals:
** BestChoices set of best choices for current word
** Operation: Returns TRUE if Word is the same as the current best
** choice, FALSE otherwise.
** Return: TRUE or FALSE
** Exceptions: none
** History: Thu May 30 14:44:22 1991, DSJ, Created.
*/
return (BestChoices != NIL &&
StringSameAs(WordChoice, (VIABLE_CHOICE)first_node(BestChoices)));
} /* CurrentBestChoiceIs */
/*---------------------------------------------------------------------------*/
FLOAT32 Dict::CurrentBestChoiceAdjustFactor() {
/*
** Parameters: none
** Globals:
** BestChoices set of best choices for current word
** Operation: Return the adjustment factor for the best choice for
** the current word.
** Return: Adjust factor for current best choice.
** Exceptions: none
** History: Thu May 30 14:48:24 1991, DSJ, Created.
*/
VIABLE_CHOICE BestChoice;
if (BestChoices == NIL)
return (MAX_FLOAT32);
BestChoice = (VIABLE_CHOICE) first_node (BestChoices);
return (BestChoice->AdjustFactor);
} /* CurrentBestChoiceAdjustFactor */
/*---------------------------------------------------------------------------*/
int Dict::CurrentWordAmbig() {
/*
** Parameters: none
** Globals:
** BestChoices set of best choices for current word
** Operation: This routine returns TRUE if there are multiple good
** choices for the current word and FALSE otherwise.
** Return: TRUE or FALSE
** Exceptions: none
** History: Wed May 22 15:38:38 1991, DSJ, Created.
*/
return (rest (BestChoices) != NIL);
} /* CurrentWordAmbig */
/*---------------------------------------------------------------------------*/
void Dict::DebugWordChoices() {
/*
** Parameters: none
** Globals:
** BestRawChoice
** BestChoices
** Operation: Print the current choices for this word to stdout.
** Return: none
** Exceptions: none
** History: Wed May 15 13:52:08 1991, DSJ, Created.
*/
LIST Choices;
int i;
char LabelString[80];
VIABLE_CHOICE VChoice = (VIABLE_CHOICE)first_node(BestChoices);
bool force_debug =
fragments_debug && VChoice != NULL && VChoice->ComposedFromCharFragments;
if (stopper_debug_level >= 1 || force_debug ||
(WordToDebug && BestChoices &&
StringSameAs(WordToDebug, WordToDebug_lengths,
(VIABLE_CHOICE)first_node(BestChoices)))) {
if (BestRawChoice)
PrintViableChoice(stderr, "\nBest Raw Choice: ", BestRawChoice);
i = 1;
Choices = BestChoices;
if (Choices)
cprintf("\nBest Cooked Choices:\n");
iterate(Choices) {
sprintf(LabelString, "Cooked Choice #%d: ", i);
PrintViableChoice(stderr, LabelString,
(VIABLE_CHOICE)first_node(Choices));
i++;
}
}
} /* DebugWordChoices */
} // namespace tesseract
/*---------------------------------------------------------------------------*/
void FilterWordChoices() {
/*
** Parameters: none
** Globals:
** BestChoices set of choices for current word
** Operation: This routine removes from BestChoices all choices which
** are not within a reasonable range of the best choice.
** Return: none
** Exceptions: none
** History: Wed May 15 13:08:24 1991, DSJ, Created.
*/
EXPANDED_CHOICE BestChoice;
if (BestChoices == NIL || second_node (BestChoices) == NIL)
return;
/* compute certainties and class for each chunk in best choice */
ExpandChoice ((VIABLE_CHOICE_STRUCT *) first_node (BestChoices), &BestChoice);
set_rest (BestChoices, delete_d (rest (BestChoices),
&BestChoice, FreeBadChoice));
} /* FilterWordChoices */
/*---------------------------------------------------------------------------*/
void FindClassifierErrors(FLOAT32 MinRating,
FLOAT32 MaxRating,
FLOAT32 RatingMargin,
FLOAT32 Thresholds[]) {
/*
** Parameters:
** MinRating limits how tight to make a template
** MaxRating limits how loose to make a template
** RatingMargin amount of margin to put in template
** Thresholds[] place to put error thresholds
** Globals: none
** Operation: This routine compares the best choice for the current
** word to the best raw choice to determine which characters
** were classified incorrectly by the classifier. It then
** places a separate threshold into Thresholds for each
** character in the word. If the classifier was correct,
** MaxRating is placed into Thresholds. If the
** classifier was incorrect, the avg. match rating (error
** percentage) of the classifier's incorrect choice minus
** some margin is
** placed into thresholds. This can then be used by the
** caller to try to create a new template for the desired
** class that will classify the character with a rating better
** than the threshold value. The match rating placed into
** Thresholds is never allowed to be below MinRating in order
** to prevent trying to make overly tight templates.
** Return: none (results are placed in Thresholds)
** Exceptions: none
** History: Fri May 31 16:02:57 1991, DSJ, Created.
*/
EXPANDED_CHOICE BestRaw;
VIABLE_CHOICE Choice;
int i, j, Chunk;
FLOAT32 AvgRating;
int NumErrorChunks;
assert (BestChoices != NIL);
assert (BestRawChoice != NULL);
ExpandChoice(BestRawChoice, &BestRaw);
Choice = (VIABLE_CHOICE) first_node (BestChoices);
for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) {
AvgRating = 0.0;
NumErrorChunks = 0;
for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) {
AvgRating += BestRaw.ChunkCertainty[Chunk];
NumErrorChunks++;
}
}
if (NumErrorChunks > 0) {
AvgRating /= NumErrorChunks;
*Thresholds = (AvgRating / -certainty_scale) * (1.0 - RatingMargin);
}
else
*Thresholds = MaxRating;
if (*Thresholds > MaxRating)
*Thresholds = MaxRating;
if (*Thresholds < MinRating)
*Thresholds = MinRating;
}
} /* FindClassifierErrors */
/*---------------------------------------------------------------------------*/
void InitStopperVars() {
/*
** Parameters: none
** Globals: none
** Operation: Initializes the control variables used by the stopper.
** Return: none
** Exceptions: none
** History: Thu May 9 10:06:04 1991, DSJ, Created.
*/
VALUE dummy;
string_variable (DangerousAmbigs, "DangerousAmbigs", DANGEROUS_AMBIGS);
string_variable (WordToDebug, "WordToDebug", "");
string_variable (WordToDebug_lengths, "WordToDebug_lengths", "");
} /* InitStopperVars */
/*---------------------------------------------------------------------------*/
void InitChoiceAccum() {
/*
** Parameters: none
** Globals: none
** Operation: This routine initializes the data structures used to
** keep track the good word choices found for a word.
** Return: none
** Exceptions: none
** History: Fri May 17 07:59:00 1991, DSJ, Created.
*/
BLOB_WIDTH *BlobWidth, *End;
if (BestRawChoice)
memfree(BestRawChoice);
if (BestChoices)
destroy_nodes(BestChoices, memfree);
BestRawChoice = NULL;
BestChoices = NIL;
EnableChoiceAccum();
for (BlobWidth = CurrentSegmentation,
End = CurrentSegmentation + MAX_NUM_CHUNKS;
BlobWidth < End; *BlobWidth++ = 1);
} /* InitChoiceAccum */
/*---------------------------------------------------------------------------*/
namespace tesseract {
void Dict::LogNewRawChoice(const WERD_CHOICE &WordChoice,
FLOAT32 AdjustFactor,
const float Certainties[]) {
/*
** Parameters:
** Choice new raw choice for current word
** AdjustFactor adjustment factor which was applied to choice
** Certainties certainties for each char in new choice
** Globals:
** BestRawChoice best raw choice so far for current word
** Operation: This routine compares Choice to the best raw (non-dict)
** choice so far and replaces it if the new choice is better.
** Return: none
** Exceptions: none
** History: Wed May 15 09:57:19 1991, DSJ, Created.
*/
if (!KeepWordChoices)
return;
if (!BestRawChoice)
BestRawChoice = NewViableChoice (WordChoice, AdjustFactor, Certainties);
else if (WordChoice.rating() < BestRawChoice->Rating) {
if (ChoiceSameAs(WordChoice, BestRawChoice))
FillViableChoice(WordChoice, AdjustFactor, Certainties, true,
BestRawChoice);
else {
memfree(BestRawChoice);
BestRawChoice = NewViableChoice(WordChoice, AdjustFactor, Certainties);
}
}
} /* LogNewRawChoice */
} // namespace tesseract
/*---------------------------------------------------------------------------*/
void LogNewSegmentation(PIECES_STATE BlobWidth) {
/*
** Parameters:
** BlobWidth[] number of chunks in each blob in segmentation
** Globals:
** CurrentSegmentation blob widths for current segmentation
** Operation: This routine updates the blob widths in CurrentSegmentation
** to be the same as provided in BlobWidth.
** Return: none
** Exceptions: none
** History: Mon May 20 11:52:26 1991, DSJ, Created.
*/
BLOB_WIDTH *Segmentation;
for (Segmentation = CurrentSegmentation; *BlobWidth != 0;
BlobWidth++, Segmentation++)
*Segmentation = *BlobWidth;
*Segmentation = 0;
} /* LogNewSegmentation */
/*---------------------------------------------------------------------------*/
void LogNewSplit(int Blob) {
/*
** Parameters:
** Blob index of blob that was split
** Globals:
** BestRawChoice current best raw choice
** BestChoices list of best choices found so far
** Operation: This routine adds 1 chunk to the specified blob for each
** choice in BestChoices and for the BestRawChoice.
** Return: none
** Exceptions: none
** History: Mon May 20 11:38:56 1991, DSJ, Created.
*/
LIST Choices;
if (BestRawChoice) {
AddNewChunk(BestRawChoice, Blob);
}
Choices = BestChoices;
iterate(Choices) {
AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
}
} /* LogNewSplit */
/*---------------------------------------------------------------------------*/
namespace tesseract {
void Dict::LogNewWordChoice(const WERD_CHOICE &WordChoice,
FLOAT32 AdjustFactor,
const float Certainties[]) {
/*
** Parameters:
** Choice new choice for current word
** AdjustFactor adjustment factor which was applied to choice
** Certainties certainties for each char in new choice
** Globals:
** BestChoices best choices so far for current word
** Operation: This routine adds Choice to BestChoices if the
** adjusted certainty for Choice is within a reasonable range
** of the best choice in BestChoices. The BestChoices
** list is kept in sorted order by rating. Duplicates are
** removed.
** Return: none
** Exceptions: none
** History: Wed May 15 09:57:19 1991, DSJ, Created.
*/
VIABLE_CHOICE NewChoice;
LIST Choices;
FLOAT32 Threshold;
if (!KeepWordChoices)
return;
/* throw out obviously bad choices to save some work */
if (BestChoices != NIL) {
Threshold = AmbigThreshold (BestFactor (BestChoices), AdjustFactor);
if (Threshold > -stopper_ambiguity_threshold_offset)
Threshold = -stopper_ambiguity_threshold_offset;
if (WordChoice.certainty() - BestCertainty (BestChoices) < Threshold)
return;
}
/* see if a choice with the same text string has already been found */
NewChoice = NULL;
Choices = BestChoices;
iterate(Choices) {
if (ChoiceSameAs (WordChoice, (VIABLE_CHOICE) first_node (Choices))) {
if (WordChoice.rating() < BestRating (Choices)) {
NewChoice = (VIABLE_CHOICE) first_node (Choices);
} else {
return;
}
}
}
if (NewChoice) {
FillViableChoice(WordChoice, AdjustFactor, Certainties, true, NewChoice);
BestChoices = delete_d(BestChoices, NewChoice, is_same_node);
}
else {
NewChoice = NewViableChoice (WordChoice, AdjustFactor, Certainties);
}
BestChoices = s_adjoin (BestChoices, NewChoice, CmpChoiceRatings);
if (stopper_debug_level >= 2)
PrintViableChoice (stderr, "New Word Choice: ", NewChoice);
if (count (BestChoices) > tessedit_truncate_wordchoice_log) {
Choices =
(LIST) nth_cell (BestChoices, tessedit_truncate_wordchoice_log);
destroy_nodes (rest (Choices), Efree);
set_rest(Choices, NIL);
}
} /* LogNewWordChoice */
/*---------------------------------------------------------------------------*/
static AMBIG_TABLE *AmbigFor = NULL;
int Dict::NoDangerousAmbig(const char *Word,
const char *Word_lengths,
DANGERR *fixpt) {
/*
** Parameters:
** Word word to check for dangerous ambiguities
** Word_lengths lengths of unichars in Word
** Globals: none
** Operation: This word checks each letter in word against a list
** of potentially ambiguous characters. If a match is found
** that letter is replaced with its ambiguity and tested in
** the dictionary. If the ambiguous word is found in the
** dictionary, FALSE is returned. Otherwise, the search
** continues for other ambiguities. If no ambiguities that
** match in the dictionary are found, TRUE is returned.
** Return: TRUE if Word contains no dangerous ambiguities.
** Exceptions: none
** History: Mon May 6 16:28:56 1991, DSJ, Created.
*/
char NewWord[MAX_WERD_SIZE * UNICHAR_LEN + 1];
char *NextNewChar;
int bad_index = 0;
if (!AmbigFor)
AmbigFor = FillAmbigTable ();
NextNewChar = NewWord;
while (*Word) {
if (AmbigsFound (NewWord, NextNewChar,
Word + *Word_lengths, Word_lengths + 1,
AmbigFor[getUnicharset().unichar_to_id(
Word, *Word_lengths)],
fixpt)) {
if (fixpt != NULL)
fixpt->index = bad_index;
return (FALSE);
} else {
strncpy(NextNewChar, Word, *Word_lengths);
NextNewChar += *Word_lengths;
Word += *Word_lengths;
Word_lengths++;
bad_index++;
}
}
return (TRUE);
} /* NoDangerousAmbig */
void Dict::EndDangerousAmbigs() {
if (AmbigFor != NULL) {
for (int i = 0; i <= MAX_CLASS_ID; ++i) {
destroy_nodes(AmbigFor[i], Efree);
}
Efree(AmbigFor);
AmbigFor = NULL;
}
}
} // namespace tesseract
/*---------------------------------------------------------------------------*/
void SettupStopperPass1() {
/*
** Parameters: none
** Globals:
** RejectOffset offset allowed before word is rejected
** Operation: This routine performs any settup of stopper variables
** that is needed in preparation for the first pass.
** Return: none
** Exceptions: none
** History: Mon Jun 3 12:32:00 1991, DSJ, Created.
*/
RejectOffset = 0.0;
} /* SettupStopperPass1 */
/*---------------------------------------------------------------------------*/
void SettupStopperPass2() {
/*
** Parameters: none
** Globals:
** RejectOffset offset allowed before word is rejected
** Operation: This routine performs any settup of stopper variables
** that is needed in preparation for the second pass.
** Return: none
** Exceptions: none
** History: Mon Jun 3 12:32:00 1991, DSJ, Created.
*/
RejectOffset = stopper_phase2_certainty_rejection_offset;
} /* SettupStopperPass2 */
/**----------------------------------------------------------------------------
Private Code
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
void AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
/*
** Parameters:
** Choice choice to add a new chunk to
** Blob index of blob being split
** Globals: none
** Operation: This routine increments the chunk count of the character
** in Choice which corresponds to Blob.
** Return: none
** Exceptions: none
** History: Mon May 20 11:43:27 1991, DSJ, Created.
*/
int i, LastChunk;
for (i = 0, LastChunk = 0; i < Choice->Length; i++) {
LastChunk += Choice->Blob[i].NumChunks;
if (Blob < LastChunk) {
(Choice->Blob[i].NumChunks)++;
return;
}
}
mem_tidy (1);
cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
Choice->Length, LastChunk, Blob);
assert(FALSE); /* this should never get executed */
} /* AddNewChunk */
/*---------------------------------------------------------------------------*/
namespace tesseract {
int Dict::AmbigsFound(char *Word,
char *CurrentChar,
const char *Tail,
const char *Tail_lengths,
LIST Ambigs,
DANGERR *fixpt) {
/*
** Parameters:
** Word word being tested for ambiguities
** CurrentChar position in Word to put ambig replacement
** Tail end of word to place after ambiguity
** Tail_lengths lengths of the unichars in Tail
** Ambigs list of ambiguities to test at this position
** Globals: none
** Operation: For each ambiguity in Ambigs, see if the remainder of
** the test string matches the start of Tail. If it does,
** construct a word consisting of the contents of Word up to,
** but not including, CurrentChar followed by the replacement
** string for the ambiguity followed by the unmatched
** contents of Tail. Then test this word to see if it
** is a dictionary word. If it is return TRUE. If none of
** the ambiguities result in a dictionary word, return FALSE.
** Return: TRUE if the Word is ambiguous at the specified position
** Exceptions: none
** History: Thu May 9 10:10:28 1991, DSJ, Created.
*/
AMBIG_SPEC *AmbigSpec;
char *ambig;
char *ambig_lengths;
const char *UnmatchedTail;
const char *UnmatchedTail_lengths;
int Matches;
int bad_length;
iterate(Ambigs) {
AmbigSpec = (AMBIG_SPEC *) first_node (Ambigs);
ambig = AmbigSpec->ambig;
ambig_lengths = AmbigSpec->lengths;
bad_length = 1;
UnmatchedTail = Tail;
UnmatchedTail_lengths = Tail_lengths;
Matches = TRUE;
while (*ambig != ' ' && Matches)
if (*UnmatchedTail_lengths == *ambig_lengths &&
strncmp(ambig, UnmatchedTail, *ambig_lengths) == 0) {
ambig += *(ambig_lengths++);
UnmatchedTail += *(UnmatchedTail_lengths++);
bad_length++;
}
else
Matches = FALSE;
if (Matches) {
ambig += *(ambig_lengths++); /* skip over the space */
/* insert replacement string */
strcpy(CurrentChar, ambig);
/* add tail */
strcat(Word, UnmatchedTail);
if (valid_word (Word)) {
if (stopper_debug_level >= 1)
cprintf ("Stopper: Possible ambiguous word = %s\n", Word);
if (fixpt != NULL) {
fixpt->good_length = strlen (ambig_lengths);
fixpt->bad_length = bad_length;
}
return (TRUE);
}
}
}
return (FALSE);
} /* AmbigsFound */
/*---------------------------------------------------------------------------*/
int Dict::ChoiceSameAs(const WERD_CHOICE &WordChoice,
VIABLE_CHOICE ViableChoice) {
/*
** Parameters:
** Choice choice to compare to ViableChoice
** ViableChoice viable choice to compare to Choice
** Globals: none
** Operation: This routine compares the corresponding strings of
** Choice and ViableChoice and returns TRUE if they are the
** same, FALSE otherwise.
** Return: TRUE or FALSE.
** Exceptions: none
** History: Fri May 17 08:48:04 1991, DSJ, Created.
*/
return (StringSameAs(WordChoice, ViableChoice));
} /* ChoiceSameAs */
} // namespace tesseract
/*---------------------------------------------------------------------------*/
int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1,
void *arg2) { //VIABLE_CHOICE Choice2)
/*
** Parameters:
** Choice1, Choice2 choices to compare ratings for
** Globals: none
** Operation: Return -1 if the rating for Choice1 is less than the
** rating for Choice2, otherwise return (1).
** Return: -1 or 1
** Exceptions: none
** History: Wed May 15 13:02:37 1991, DSJ, Created.
*/
float R1, R2;
VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;
R1 = Choice1->Rating;
R2 = Choice2->Rating;
if (R1 < R2)
return (-1);
else
return (1);
} /* CmpChoiceRatings */
/*---------------------------------------------------------------------------*/
void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice) {
/*
** Parameters:
** Choice choice to be expanded
** ExpandedChoice place to put resulting expanded choice
** Globals: none
** Operation: This routine expands Choice and places the results
** in ExpandedChoice. The primary function of expansion
** is to create an two arrays, one which holds the corresponding
** certainty for each chunk in Choice, and one which holds
** the class for each chunk.
** Return: none (results are placed in ExpandedChoice)
** Exceptions: none
** History: Fri May 31 15:21:57 1991, DSJ, Created.
*/
int i, j, Chunk;
ExpandedChoice->Choice = Choice;
for (i = 0, Chunk = 0; i < Choice->Length; i++)
for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty;
ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class;
}
} /* ExpandChoice */
/*---------------------------------------------------------------------------*/
namespace tesseract {
AMBIG_TABLE *Dict::FillAmbigTable() {
/*
** Parameters: none
** Globals:
** DangerousAmbigs filename of dangerous ambig info
** Operation: This routine allocates a new ambiguity table and fills
** it in from the file specified by DangerousAmbigs. An
** ambiguity table is an array of lists. The array is indexed
** by a class id. Therefore, each entry in the table provides
** a list of potential ambiguities which can start with the
** corresponding character. Each potential ambiguity is
** described by a string which contains the remainder of the
** test string followed by a space followed by the replacement
** string. For example the ambiguity "rn -> m", would be
** located in the table at index 'r'. The string corresponding
** to this ambiguity would be "n m".
** Return: Pointer to new ambiguity table.
** Exceptions: none
** History: Thu May 9 09:20:57 1991, DSJ, Created.
*/
FILE *AmbigFile;
AMBIG_TABLE *NewTable;
int i;
int AmbigPartSize;
char buffer[256 * UNICHAR_LEN];
char TestString[256 * UNICHAR_LEN];
char TestString_lengths[256];
char ReplacementString[256 * UNICHAR_LEN];
char ReplacementString_lengths[256];
STRING name;
char lengths[2];
AMBIG_SPEC *AmbigSpec;
UNICHAR_ID unichar_id;
lengths[1] = 0;
name = getImage()->getCCUtil()->language_data_path_prefix;
name += DangerousAmbigs;
AmbigFile = Efopen (name.string(), "r");
NewTable = (AMBIG_TABLE *) Emalloc (sizeof (LIST) * (MAX_CLASS_ID + 1));
for (i = 0; i <= MAX_CLASS_ID; i++)
NewTable[i] = NIL;
while (fscanf (AmbigFile, "%d", &AmbigPartSize) == 1) {
TestString[0] = '\0';
TestString_lengths[0] = 0;
ReplacementString[0] = '\0';
ReplacementString_lengths[0] = 0;
bool illegal_char = false;
for (i = 0; i < AmbigPartSize; ++i) {
fscanf (AmbigFile, "%s", buffer);
strcat(TestString, buffer);
lengths[0] = strlen(buffer);
strcat(TestString_lengths, lengths);
if (!getUnicharset().contains_unichar(buffer))
illegal_char = true;
}
fscanf (AmbigFile, "%d", &AmbigPartSize);
for (i = 0; i < AmbigPartSize; ++i) {
fscanf (AmbigFile, "%s", buffer);
strcat(ReplacementString, buffer);
lengths[0] = strlen(buffer);
strcat(ReplacementString_lengths, lengths);
if (!getUnicharset().contains_unichar(buffer))
illegal_char = true;
}
if (strlen (TestString_lengths) > MAX_AMBIG_SIZE ||
strlen (ReplacementString_lengths) > MAX_AMBIG_SIZE)
DoError (0, "Illegal ambiguity specification!");
if (illegal_char) {
continue;
}
AmbigSpec = (AMBIG_SPEC *) Emalloc (sizeof (AMBIG_SPEC));
strcpy(AmbigSpec->ambig, TestString + TestString_lengths[0]);
strcat(AmbigSpec->ambig, " ");
strcat(AmbigSpec->ambig, ReplacementString);
strcpy(AmbigSpec->lengths, TestString_lengths + 1);
lengths[0] = 1;
strcat(AmbigSpec->lengths, lengths);
strcat(AmbigSpec->lengths, ReplacementString_lengths);
unichar_id = getUnicharset().unichar_to_id(TestString,
TestString_lengths[0]);
NewTable[unichar_id] = push_last (NewTable[unichar_id], AmbigSpec);
}
fclose(AmbigFile);
return (NewTable);
} /* FillAmbigTable */
} // namespace tesseract
/*---------------------------------------------------------------------------*/
int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice,
void *item2) { //EXPANDED_CHOICE *BestChoice)
/*
** Parameters:
** Choice choice to be tested
** BestChoice best choice found
** Globals:
** stopper_ambiguity_threshold_gain
** stopper_ambiguity_threshold_offset
** Operation: If the certainty of any chunk in Choice is not ambiguous
** with the corresponding chunk in the best choice, free
** Choice and return TRUE. Otherwise, return FALSE.
** Return: TRUE or FALSE.
** Exceptions: none
** History: Wed May 15 13:20:26 1991, DSJ, Created.
*/
int i, j, Chunk;
FLOAT32 Threshold;
VIABLE_CHOICE Choice;
EXPANDED_CHOICE *BestChoice;
Choice = (VIABLE_CHOICE) item1;
BestChoice = (EXPANDED_CHOICE *) item2;
Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor,
Choice->AdjustFactor);
for (i = 0, Chunk = 0; i < Choice->Length; i++)
for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
Threshold) {
memfree(Choice);
return (TRUE);
}
return (FALSE);
} /* FreeBadChoice */
/*---------------------------------------------------------------------------*/
namespace tesseract {
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
/*
** Parameters:
** Word word to be tested
** Globals: none
** Operation: Return the length of the shortest alpha run in Word.
** Return: Return the length of the shortest alpha run in Word.
** Exceptions: none
** History: Tue May 14 07:50:45 1991, DSJ, Created.
*/
register int Shortest = MAXINT;
register int Length;
int x;
int y;
for (x = 0; x < WordChoice.length(); ++x) {
if (getUnicharset().get_isalpha(WordChoice.unichar_id(x))) {
for (y = x + 1, Length = 1;
y < WordChoice.length() &&
getUnicharset().get_isalpha(WordChoice.unichar_id(y));
++y, ++Length);
if (Length < Shortest) {
Shortest = Length;
}
if (y == WordChoice.length()) {
break;
}
}
}
if (Shortest == MAXINT)
Shortest = 0;
return (Shortest);
} /* LengthOfShortestAlphaRun */
/*---------------------------------------------------------------------------*/
VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice,
FLOAT32 AdjustFactor,
const float Certainties[]) {
/*
** Parameters:
** Choice choice to be converted to a viable choice
** AdjustFactor factor used to adjust ratings for Choice
** Certainties certainty for each character in Choice
** Globals:
** CurrentSegmentation segmentation corresponding to Choice
** Operation: Allocate a new viable choice data structure, copy
** Choice, Certainties, and CurrentSegmentation into it,
** and return a pointer to it.
** Return: Ptr to new viable choice.
** Exceptions: none
** History: Thu May 16 15:28:29 1991, DSJ, Created.
*/
int Length = WordChoice.length();
assert (Length <= MAX_NUM_CHUNKS && Length > 0);
VIABLE_CHOICE NewChoice = (VIABLE_CHOICE) Emalloc (
sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof (CHAR_CHOICE));
FillViableChoice(WordChoice, AdjustFactor, Certainties, false, NewChoice);
return (NewChoice);
} /* NewViableChoice */
/*---------------------------------------------------------------------------*/
void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
/*
** Parameters:
** File open text file to print Choice to
** Label text label to be printed with Choice
** Choice choice to be printed
** Globals: none
** Operation: This routine dumps a text representation of the
** specified Choice to File.
** Return: none
** Exceptions: none
** History: Mon May 20 11:16:44 1991, DSJ, Created.
*/
int i, j;
fprintf (File, "%s", Label);
fprintf(File, "(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d) ",
Choice->Rating, Choice->Certainty,
Choice->AdjustFactor, Choice->ComposedFromCharFragments);
for (i = 0; i < Choice->Length; i++)
fprintf(File, "%s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
fprintf(File, "\n");
for (i = 0; i < Choice->Length; i++) {
fprintf(File, " %s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
fprintf(File, " ");
}
fprintf(File, "\n");
for (i = 0; i < Choice->Length; i++) {
for (j = 0; j < Choice->Blob[i].NumChunks; j++)
fprintf(File, "%3d ", (int) (Choice->Blob[i].Certainty * -10.0));
}
fprintf(File, "\n");
for (i = 0; i < Choice->Length; i++) {
for (j = 0; j < Choice->Blob[i].NumChunks; j++)
fprintf(File, "%3d ", Choice->Blob[i].NumChunks);
}
fprintf(File, "\n");
} /* PrintViableChoice */
/*---------------------------------------------------------------------------*/
void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
FLOAT32 AdjustFactor, const float Certainties[],
bool SameString, VIABLE_CHOICE ViableChoice) {
/*
** Parameters:
** WordChoice a choice with info that will be copied
** AdjustFactor factor used to adjust ratings for AChoice
** Certainties certainty for each character in AChoice
** SameString if true the string in the viable choice
** will not be changed
** ViableChoice existing viable choice to fill in
** Globals:
** CurrentSegmentation segmentation for NewChoice
** Operation:
** Fill ViableChoice with information from AChoice,
** AdjustFactor, and Certainties.
** Return: none
** Exceptions: none
** History: Fri May 17 13:35:58 1991, DSJ, Created.
*/
CHAR_CHOICE *NewChar;
BLOB_WIDTH *BlobWidth;
int x;
ViableChoice->Rating = WordChoice.rating();
ViableChoice->Certainty = WordChoice.certainty();
ViableChoice->AdjustFactor = AdjustFactor;
ViableChoice->ComposedFromCharFragments = false;
if (!SameString) {
ViableChoice->Length = WordChoice.length();
}
for (x = 0,
NewChar = &(ViableChoice->Blob[0]),
BlobWidth = CurrentSegmentation;
x < WordChoice.length();
x++, NewChar++, Certainties++, BlobWidth++) {
if (!SameString) {
NewChar->Class = WordChoice.unichar_id(x);
}
NewChar->NumChunks = *BlobWidth;
NewChar->Certainty = *Certainties;
for (int i = 1; i < WordChoice.fragment_length(x); ++i) {
BlobWidth++;
assert(*BlobWidth > 0);
NewChar->NumChunks += *BlobWidth;
ViableChoice->ComposedFromCharFragments = true;
}
}
} /* FillViableChoice */
// Compares unichar ids in word_choice to those in viable_choice,
// returns true if they are the same, false otherwise.
bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,
VIABLE_CHOICE ViableChoice) {
if (WordChoice.length() != ViableChoice->Length) {
return false;
}
int i;
CHAR_CHOICE *CharChoice;
for (i = 0, CharChoice = &(ViableChoice->Blob[0]);
i < ViableChoice->Length; CharChoice++, i++) {
if (CharChoice->Class != WordChoice.unichar_id(i)) {
return false;
}
}
return true;
}
/*---------------------------------------------------------------------------*/
int Dict::StringSameAs(const char *String,
const char *String_lengths,
VIABLE_CHOICE ViableChoice) {
/*
** Parameters:
** String string to compare to ViableChoice
** String_lengths lengths of unichars in String
** ViableChoice viable choice to compare to String
** Globals: none
** Operation: This routine compares String to ViableChoice and
** returns TRUE if they are the same, FALSE otherwise.
** Return: TRUE or FALSE.
** Exceptions: none
** History: Fri May 17 08:48:04 1991, DSJ, Created.
*/
CHAR_CHOICE *Char;
int i;
int current_unichar_length;
for (Char = &(ViableChoice->Blob[0]), i = 0;
i < ViableChoice->Length;
String += *(String_lengths++), Char++, i++) {
current_unichar_length = strlen(getUnicharset().id_to_unichar(Char->Class));
if (current_unichar_length != *String_lengths ||
strncmp(String, getUnicharset().id_to_unichar(Char->Class),
current_unichar_length) != 0)
return (FALSE);
}
if (*String == 0)
return (TRUE);
else
return (FALSE);
} /* StringSameAs */
} // namespace tesseract
/*---------------------------------------------------------------------------*/
int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
const WERD_CHOICE &BestChoice) {
/*
** Parameters:
** Choices choices for current segmentation
** BestChoice best choice for current segmentation
** Globals:
** stopper_allowable_character_badness max allowed certainty variation
** Operation: This routine returns TRUE if the certainty of the
** BestChoice word is within a reasonable range of the average
** certainties for the best choices for each character in
** the segmentation. This test is used to catch words in which
** one character is much worse than the other characters in
** the word (i.e. FALSE will be returned in that case).
** The algorithm computes the mean and std deviation of the
** certainties in the word with the worst certainty thrown out.
** Return: TRUE or FALSE.
** Exceptions: none
** History: Tue May 14 08:23:21 1991, DSJ, Created.
*/
float Certainty;
float WorstCertainty = MAX_FLOAT32;
float CertaintyThreshold;
FLOAT64 TotalCertainty;
FLOAT64 TotalCertaintySquared;
FLOAT64 Variance;
FLOAT32 Mean, StdDev;
int WordLength;
WordLength = Choices.length();
if (WordLength < 3)
return (TRUE);
TotalCertainty = TotalCertaintySquared = 0.0;
BLOB_CHOICE_IT BlobChoiceIt;
for (int i = 0; i < Choices.length(); ++i) {
BlobChoiceIt.set_to_list(Choices.get(i));
Certainty = BlobChoiceIt.data()->certainty();
TotalCertainty += Certainty;
TotalCertaintySquared += Certainty * Certainty;
if (Certainty < WorstCertainty)
WorstCertainty = Certainty;
}
/* subtract off worst certainty from statistics */
WordLength--;
TotalCertainty -= WorstCertainty;
TotalCertaintySquared -= WorstCertainty * WorstCertainty;
Mean = TotalCertainty / WordLength;
Variance = ((WordLength * TotalCertaintySquared -
TotalCertainty * TotalCertainty) /
(WordLength * (WordLength - 1)));
if (Variance < 0.0)
Variance = 0.0;
StdDev = sqrt (Variance);
CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
if (CertaintyThreshold > stopper_nondict_certainty_base)
CertaintyThreshold = stopper_nondict_certainty_base;
if (BestChoice.certainty() < CertaintyThreshold) {
if (stopper_debug_level >= 1)
cprintf("Stopper: Non-uniform certainty = %4.1f"
" (m=%4.1f, s=%4.1f, t=%4.1f)\n",
BestChoice.certainty(), Mean, StdDev, CertaintyThreshold);
return (FALSE);
} else {
return (TRUE);
}
} /* UniformCertainties */