| /****************************************************************************** |
| ** Filename: stopper.c |
| ** Purpose: Stopping criteria for word classifier. |
| ** Author: Dan Johnson |
| ** History: Mon Apr 29 14:56:49 1991, DSJ, Created. |
| ** |
| ** (c) Copyright Hewlett-Packard Company, 1988. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| ******************************************************************************/ |
| /**---------------------------------------------------------------------------- |
| Include Files and Type Defines |
| ----------------------------------------------------------------------------**/ |
| #include "stopper.h" |
| #include "emalloc.h" |
| #include "matchdefs.h" |
| #include "callcpp.h" |
| #include "permute.h" |
| #include "context.h" |
| #include "danerror.h" |
| #include "const.h" |
| #include "freelist.h" |
| #include "efio.h" |
| #include "scanutils.h" |
| #include "unichar.h" |
| #include "varable.h" |
| #include "dict.h" |
| #include "image.h" |
| #include "ccutil.h" |
| #include "ratngs.h" |
| #include "ambigs.h" |
| |
| #include <stdio.h> |
| #include <string.h> |
| #include <ctype.h> |
| #include <math.h> |
| #ifdef __UNIX__ |
| #include <assert.h> |
| #endif |
| |
| /* these are kludges - add appropriate .h file later */ |
| /* from adaptmatch.cpp */ |
| #define MAX_WERD_SIZE 100 |
| |
| typedef struct |
| { |
| VIABLE_CHOICE Choice; |
| float ChunkCertainty[MAX_NUM_CHUNKS]; |
| UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS]; |
| } EXPANDED_CHOICE; |
| |
| /**---------------------------------------------------------------------------- |
| Macros |
| ----------------------------------------------------------------------------**/ |
| #define BestCertainty(Choices) (((VIABLE_CHOICE) first_node (Choices))->Certainty) |
| #define BestRating(Choices) (((VIABLE_CHOICE) first_node (Choices))->Rating) |
| #define BestFactor(Choices) (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor) |
| |
| #define AmbigThreshold(F1,F2) (((F2) - (F1)) * stopper_ambiguity_threshold_gain - \ |
| stopper_ambiguity_threshold_offset) |
| |
| /*--------------------------------------------------------------------------- |
| Private Function Prototoypes |
| ----------------------------------------------------------------------------*/ |
| void AddNewChunk(VIABLE_CHOICE Choice, int Blob); |
| |
| int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1, |
| void *arg2); //VIABLE_CHOICE Choice2); |
| |
| void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice); |
| |
| int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice, |
| void *item2); //EXPANDED_CHOICE *BestChoice); |
| |
| int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices, |
| const WERD_CHOICE &BestChoice); |
| |
| /**---------------------------------------------------------------------- |
| V a r i a b l e s |
| ----------------------------------------------------------------------**/ |
| double_VAR(certainty_scale, 20.0, "Certainty scaling factor"); |
| |
| double_VAR(stopper_nondict_certainty_base, -2.50, |
| "Certainty threshold for non-dict words"); |
| |
| double_VAR(stopper_phase2_certainty_rejection_offset, 1.0, |
| "Reject certainty offset"); |
| |
| INT_VAR(stopper_smallword_size, 2, |
| "Size of dict word to be treated as non-dict word"); |
| |
| double_VAR(stopper_certainty_per_char, -0.50, |
| "Certainty to add for each dict char above small word size."); |
| |
| double_VAR(stopper_allowable_character_badness, 3.0, |
| "Max certaintly variation allowed in a word (in sigma)"); |
| |
| INT_VAR(stopper_debug_level, 0, "Stopper debug level"); |
| |
| double_VAR(stopper_ambiguity_threshold_gain, 8.0, |
| "Gain factor for ambiguity threshold"); |
| |
| double_VAR(stopper_ambiguity_threshold_offset, 1.5, |
| "Certainty offset for ambiguity threshold"); |
| |
| BOOL_VAR(stopper_no_acceptable_choices, false, |
| "Make AcceptableChoice() always return false. Useful" |
| " when there is a need to explore all segmentations"); |
| |
| BOOL_VAR(save_raw_choices, false, "Save all explored raw choices"); |
| |
| INT_VAR (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list"); |
| |
| STRING_VAR(word_to_debug, "", "Word for which stopper debug information" |
| " should be printed to stdout"); |
| |
| STRING_VAR(word_to_debug_lengths, "", "Lengths of unichars in word_to_debug"); |
| |
| /**---------------------------------------------------------------------------- |
| Public Code |
| ----------------------------------------------------------------------------**/ |
| /*---------------------------------------------------------------------------*/ |
| namespace tesseract { |
| int Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, |
| WERD_CHOICE *BestChoice, |
| const WERD_CHOICE &RawChoice, |
| DANGERR *fixpt, |
| ACCEPTABLE_CHOICE_CALLER caller, |
| bool *modified_blobs) { |
| /* |
| ** Parameters: |
| ** Choices choices for current segmentation |
| ** BestChoice best choice for current segmentation |
| ** RawChoice best raw choice for current segmentation |
| ** Variables Used: |
| ** stopper_nondict_certainty_base certainty for a non-dict word |
| ** stopper_smallword_size size of word to be treated as non-word |
| ** stopper_certainty_per_char certainty to add for each dict char |
| ** Operation: Return TRUE if the results from this segmentation are |
| ** good enough to stop. Otherwise return FALSE. |
| ** Return: TRUE or FALSE. |
| ** Exceptions: none |
| ** History: Mon Apr 29 14:57:32 1991, DSJ, Created. |
| */ |
| float CertaintyThreshold = stopper_nondict_certainty_base; |
| int WordSize; |
| |
| if (stopper_no_acceptable_choices) return false; |
| |
| if (fixpt != NULL) |
| fixpt->index = -1; |
| if (BestChoice->length() == 0) |
| return (FALSE); |
| if (caller == CHOPPER_CALLER && BestChoice->fragment_mark()) { |
| if (stopper_debug_level >= 1) { |
| cprintf("AcceptableChoice(): a choice with fragments beats BestChoice"); |
| } |
| return false; |
| } |
| |
| bool no_dang_ambigs = |
| NoDangerousAmbig(BestChoice, fixpt, true, Choices, modified_blobs); |
| |
| if (stopper_debug_level >= 1) |
| tprintf("\nStopper: %s (word=%c, case=%c)\n", |
| BestChoice->debug_string(getUnicharset()).string(), |
| (valid_word(*BestChoice) ? 'y' : 'n'), |
| (Context::case_ok(*BestChoice, getUnicharset()) ? 'y' : 'n')); |
| |
| if (valid_word(*BestChoice) && |
| Context::case_ok(*BestChoice, getUnicharset())) { |
| WordSize = LengthOfShortestAlphaRun(*BestChoice); |
| WordSize -= stopper_smallword_size; |
| if (WordSize < 0) |
| WordSize = 0; |
| CertaintyThreshold += WordSize * stopper_certainty_per_char; |
| } |
| |
| if (stopper_debug_level >= 1) |
| tprintf("Stopper: Certainty = %4.1f, Threshold = %4.1f\n", |
| BestChoice->certainty(), CertaintyThreshold); |
| |
| if (no_dang_ambigs && |
| BestChoice->certainty() > CertaintyThreshold && |
| UniformCertainties(*Choices, *BestChoice)) { |
| return (TRUE); |
| } else { |
| return (FALSE); |
| } |
| } /* AcceptableChoice */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| int Dict::AcceptableResult(const WERD_CHOICE &BestChoice, |
| const WERD_CHOICE &RawChoice) { |
| /* |
| ** Parameters: |
| ** BestChoice best choice for current word |
| ** RawChoice best raw choice for current word |
| ** Variables Used: |
| ** stopper_nondict_certainty_base certainty for a non-dict word |
| ** stopper_smallword_size size of word to be treated as non-word |
| ** stopper_certainty_per_char certainty to add for each dict char |
| ** best_choices_ list of all good choices found |
| ** reject_offset_ allowed offset before a word is rejected |
| ** Operation: Return FALSE if the best choice for the current word |
| ** is questionable and should be tried again on the second |
| ** pass or should be flagged to the user. |
| ** Return: TRUE or FALSE. |
| ** Exceptions: none |
| ** History: Thu May 9 14:05:05 1991, DSJ, Created. |
| */ |
| float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_; |
| int WordSize; |
| |
| if (stopper_debug_level >= 1) { |
| tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n", |
| BestChoice.debug_string(getUnicharset()).string(), |
| (valid_word(BestChoice) ? 'y' : 'n'), |
| (Context::case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'), |
| ((rest (best_choices_) != NIL) ? 'n' : 'y')); |
| } |
| |
| if (BestChoice.length() == 0 || CurrentWordAmbig()) |
| return (FALSE); |
| if (BestChoice.fragment_mark()) { |
| if (stopper_debug_level >= 1) { |
| cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n"); |
| } |
| return false; |
| } |
| if (valid_word(BestChoice) && |
| Context::case_ok(BestChoice, getUnicharset())) { |
| WordSize = LengthOfShortestAlphaRun(BestChoice); |
| WordSize -= stopper_smallword_size; |
| if (WordSize < 0) |
| WordSize = 0; |
| CertaintyThreshold += WordSize * stopper_certainty_per_char; |
| } |
| |
| if (stopper_debug_level >= 1) |
| cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", |
| BestChoice.certainty(), CertaintyThreshold); |
| |
| if (BestChoice.certainty() > CertaintyThreshold && |
| !stopper_no_acceptable_choices) { |
| if (stopper_debug_level >= 1) |
| cprintf("ACCEPTED\n"); |
| return (TRUE); |
| } |
| else { |
| if (stopper_debug_level >= 1) |
| cprintf("REJECTED\n"); |
| return (FALSE); |
| } |
| } /* AcceptableResult */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| int Dict::AlternativeChoicesWorseThan(FLOAT32 Threshold) { |
| /* |
| ** Parameters: |
| ** Threshold minimum adjust factor for alternative choices |
| ** Variables Used: |
| ** best_choices_ alternative choices for current word |
| ** Operation: This routine returns TRUE if there are no alternative |
| ** choices for the current word OR if all alternatives have |
| ** an adjust factor worse than Threshold. |
| ** Return: TRUE or FALSE. |
| ** Exceptions: none |
| ** History: Mon Jun 3 09:36:31 1991, DSJ, Created. |
| */ |
| LIST Alternatives; |
| VIABLE_CHOICE Choice; |
| |
| Alternatives = rest (best_choices_); |
| iterate(Alternatives) { |
| Choice = (VIABLE_CHOICE) first_node (Alternatives); |
| if (Choice->AdjustFactor <= Threshold) |
| return (FALSE); |
| } |
| |
| return (TRUE); |
| |
| } /* AlternativeChoicesWorseThan */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| int Dict::CurrentBestChoiceIs(const WERD_CHOICE &WordChoice) { |
| /* |
| ** Parameters: |
| ** Word word that will be compared to the best choice |
| ** Variables Used: |
| ** best_choices_ set of best choices for current word |
| ** Operation: Returns TRUE if Word is the same as the current best |
| ** choice, FALSE otherwise. |
| ** Return: TRUE or FALSE |
| ** Exceptions: none |
| ** History: Thu May 30 14:44:22 1991, DSJ, Created. |
| */ |
| return (best_choices_ != NIL && |
| StringSameAs(WordChoice, (VIABLE_CHOICE)first_node(best_choices_))); |
| } /* CurrentBestChoiceIs */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| FLOAT32 Dict::CurrentBestChoiceAdjustFactor() { |
| /* |
| ** Parameters: none |
| ** Variables Used: |
| ** best_choices_ set of best choices for current word |
| ** Operation: Return the adjustment factor for the best choice for |
| ** the current word. |
| ** Return: Adjust factor for current best choice. |
| ** Exceptions: none |
| ** History: Thu May 30 14:48:24 1991, DSJ, Created. |
| */ |
| VIABLE_CHOICE BestChoice; |
| |
| if (best_choices_ == NIL) |
| return (MAX_FLOAT32); |
| |
| BestChoice = (VIABLE_CHOICE) first_node (best_choices_); |
| return (BestChoice->AdjustFactor); |
| |
| } /* CurrentBestChoiceAdjustFactor */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| int Dict::CurrentWordAmbig() { |
| /* |
| ** Parameters: none |
| ** Variables Used: |
| ** best_choices_ set of best choices for current word |
| ** Operation: This routine returns TRUE if there are multiple good |
| ** choices for the current word and FALSE otherwise. |
| ** Return: TRUE or FALSE |
| ** Exceptions: none |
| ** History: Wed May 22 15:38:38 1991, DSJ, Created. |
| */ |
| return (rest (best_choices_) != NIL); |
| |
| } /* CurrentWordAmbig */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::DebugWordChoices() { |
| /* |
| ** Parameters: none |
| ** Variables Used: |
| ** best_raw_choice_ |
| ** best_choices_ |
| ** Operation: Print the current choices for this word to stdout. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Wed May 15 13:52:08 1991, DSJ, Created. |
| */ |
| LIST Choices; |
| int i; |
| char LabelString[80]; |
| VIABLE_CHOICE VChoice = (VIABLE_CHOICE)first_node(best_choices_); |
| bool force_debug = |
| fragments_debug && VChoice != NULL && VChoice->ComposedFromCharFragments; |
| |
| if (stopper_debug_level >= 1 || force_debug || |
| (((STRING)word_to_debug).length() > 0 && best_choices_ && |
| StringSameAs(word_to_debug.string(), word_to_debug_lengths.string(), |
| (VIABLE_CHOICE)first_node(best_choices_)))) { |
| if (best_raw_choice_) |
| PrintViableChoice(stderr, "\nBest Raw Choice: ", best_raw_choice_); |
| |
| i = 1; |
| Choices = best_choices_; |
| if (Choices) |
| cprintf("\nBest Cooked Choices:\n"); |
| iterate(Choices) { |
| sprintf(LabelString, "Cooked Choice #%d: ", i); |
| PrintViableChoice(stderr, LabelString, |
| (VIABLE_CHOICE)first_node(Choices)); |
| i++; |
| } |
| } |
| } /* DebugWordChoices */ |
| |
| // Print all the choices in raw_choices_ list for non 1-1 ambiguities. |
| void Dict::PrintAmbigAlternatives(FILE *file, const char *label, |
| int label_num_unichars) { |
| iterate(raw_choices_) { |
| VIABLE_CHOICE Choice = (VIABLE_CHOICE)first_node(raw_choices_); |
| if (Choice->Length > 0 && |
| (label_num_unichars > 1 || Choice->Length > 1)) { |
| for (int i = 0; i < Choice->Length; i++) { |
| fprintf(file, "%s", |
| getUnicharset().id_to_unichar(Choice->Blob[i].Class)); |
| } |
| fflush(file); |
| fprintf(file, "\t%s\t%.4f\t%.4f\n", label, |
| Choice->Rating, Choice->Certainty); |
| } |
| } |
| } |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::FilterWordChoices() { |
| /* |
| ** Parameters: none |
| ** Variables Used: |
| ** best_choices_ set of choices for current word |
| ** Operation: This routine removes from best_choices_ all choices which |
| ** are not within a reasonable range of the best choice. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Wed May 15 13:08:24 1991, DSJ, Created. |
| */ |
| EXPANDED_CHOICE BestChoice; |
| |
| if (best_choices_ == NIL || second_node (best_choices_) == NIL) |
| return; |
| |
| /* compute certainties and class for each chunk in best choice */ |
| ExpandChoice((VIABLE_CHOICE_STRUCT *)first_node(best_choices_), &BestChoice); |
| |
| set_rest (best_choices_, delete_d (rest (best_choices_), |
| &BestChoice, FreeBadChoice)); |
| |
| } /* FilterWordChoices */ |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::FindClassifierErrors(FLOAT32 MinRating, |
| FLOAT32 MaxRating, |
| FLOAT32 RatingMargin, |
| FLOAT32 Thresholds[]) { |
| /* |
| ** Parameters: |
| ** MinRating limits how tight to make a template |
| ** MaxRating limits how loose to make a template |
| ** RatingMargin amount of margin to put in template |
| ** Thresholds[] place to put error thresholds |
| ** Operation: This routine compares the best choice for the current |
| ** word to the best raw choice to determine which characters |
| ** were classified incorrectly by the classifier. It then |
| ** places a separate threshold into Thresholds for each |
| ** character in the word. If the classifier was correct, |
| ** MaxRating is placed into Thresholds. If the |
| ** classifier was incorrect, the avg. match rating (error |
| ** percentage) of the classifier's incorrect choice minus |
| ** some margin is |
| ** placed into thresholds. This can then be used by the |
| ** caller to try to create a new template for the desired |
| ** class that will classify the character with a rating better |
| ** than the threshold value. The match rating placed into |
| ** Thresholds is never allowed to be below MinRating in order |
| ** to prevent trying to make overly tight templates. |
| ** Return: none (results are placed in Thresholds) |
| ** Exceptions: none |
| ** History: Fri May 31 16:02:57 1991, DSJ, Created. |
| */ |
| EXPANDED_CHOICE BestRaw; |
| VIABLE_CHOICE Choice; |
| int i, j, Chunk; |
| FLOAT32 AvgRating; |
| int NumErrorChunks; |
| |
| assert (best_choices_ != NIL); |
| assert (best_raw_choice_ != NULL); |
| |
| ExpandChoice(best_raw_choice_, &BestRaw); |
| Choice = (VIABLE_CHOICE) first_node (best_choices_); |
| |
| for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) { |
| AvgRating = 0.0; |
| NumErrorChunks = 0; |
| |
| for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) { |
| if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) { |
| AvgRating += BestRaw.ChunkCertainty[Chunk]; |
| NumErrorChunks++; |
| } |
| } |
| |
| if (NumErrorChunks > 0) { |
| AvgRating /= NumErrorChunks; |
| *Thresholds = (AvgRating / -certainty_scale) * (1.0 - RatingMargin); |
| } |
| else |
| *Thresholds = MaxRating; |
| |
| if (*Thresholds > MaxRating) |
| *Thresholds = MaxRating; |
| if (*Thresholds < MinRating) |
| *Thresholds = MinRating; |
| } |
| } /* FindClassifierErrors */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::InitChoiceAccum() { |
| /* |
| ** Parameters: none |
| ** Operation: This routine initializes the data structures used to |
| ** keep track the good word choices found for a word. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Fri May 17 07:59:00 1991, DSJ, Created. |
| */ |
| BLOB_WIDTH *BlobWidth, *End; |
| |
| if (best_raw_choice_) |
| memfree(best_raw_choice_); |
| best_raw_choice_ = NULL; |
| |
| if (best_choices_) |
| destroy_nodes(best_choices_, memfree); |
| best_choices_ = NIL; |
| |
| if (raw_choices_) |
| destroy_nodes(raw_choices_, memfree); |
| raw_choices_ = NIL; |
| |
| EnableChoiceAccum(); |
| |
| for (BlobWidth = current_segmentation_, |
| End = current_segmentation_ + MAX_NUM_CHUNKS; |
| BlobWidth < End; *BlobWidth++ = 1); |
| |
| } /* InitChoiceAccum */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::LogNewSegmentation(PIECES_STATE BlobWidth) { |
| /* |
| ** Parameters: |
| ** BlobWidth[] number of chunks in each blob in segmentation |
| ** Variables Used: |
| ** current_segmentation blob widths for current segmentation |
| ** Operation: This routine updates the blob widths in current_segmentation |
| ** to be the same as provided in BlobWidth. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Mon May 20 11:52:26 1991, DSJ, Created. |
| */ |
| BLOB_WIDTH *Segmentation; |
| |
| for (Segmentation = current_segmentation_; *BlobWidth != 0; |
| BlobWidth++, Segmentation++) |
| *Segmentation = *BlobWidth; |
| *Segmentation = 0; |
| |
| } /* LogNewSegmentation */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::LogNewSplit(int Blob) { |
| /* |
| ** Parameters: |
| ** Blob index of blob that was split |
| ** Variables Used: |
| ** best_raw_choice_ current best raw choice |
| ** best_choices_ list of best choices found so far |
| ** Operation: This routine adds 1 chunk to the specified blob for each |
| ** choice in best_choices_ and for the best_raw_choice_. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Mon May 20 11:38:56 1991, DSJ, Created. |
| */ |
| LIST Choices; |
| |
| if (best_raw_choice_) { |
| AddNewChunk(best_raw_choice_, Blob); |
| } |
| |
| Choices = best_choices_; |
| iterate(Choices) { |
| AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob); |
| } |
| Choices = raw_choices_; |
| iterate(Choices) { |
| AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob); |
| } |
| } /* LogNewSplit */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::LogNewChoice(const WERD_CHOICE &WordChoice, |
| FLOAT32 AdjustFactor, |
| const float Certainties[], |
| bool raw_choice) { |
| /* |
| ** Parameters: |
| ** Choice new choice for current word |
| ** AdjustFactor adjustment factor which was applied to choice |
| ** Certainties certainties for each char in new choice |
| ** ChoicesList list with choices seen so far |
| ** Variables Used: |
| ** best_raw_choice_ best raw choice so far for current word |
| ** Operation: This routine adds Choice to ChoicesList if the |
| ** adjusted certainty for Choice is within a reasonable range |
| ** of the best choice in ChoicesList. The ChoicesList |
| ** list is kept in sorted order by rating. Duplicates are |
| ** removed. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Wed May 15 09:57:19 1991, DSJ, Created. |
| */ |
| VIABLE_CHOICE NewChoice; |
| LIST ChoicesList; |
| LIST Choices; |
| FLOAT32 Threshold; |
| |
| if (!keep_word_choices_) |
| return; |
| |
| if (raw_choice) { |
| if (!best_raw_choice_) |
| best_raw_choice_ = NewViableChoice(WordChoice, AdjustFactor, Certainties); |
| else if (WordChoice.rating() < best_raw_choice_->Rating) { |
| if (ChoiceSameAs(WordChoice, best_raw_choice_)) |
| FillViableChoice(WordChoice, AdjustFactor, Certainties, true, |
| best_raw_choice_); |
| else { |
| memfree(best_raw_choice_); |
| best_raw_choice_ = |
| NewViableChoice(WordChoice, AdjustFactor, Certainties); |
| } |
| } |
| if (!save_raw_choices) return; |
| ChoicesList = raw_choices_; |
| } else { |
| ChoicesList = best_choices_; |
| } |
| |
| /* throw out obviously bad choices to save some work */ |
| if (ChoicesList != NIL) { |
| Threshold = AmbigThreshold (BestFactor (ChoicesList), AdjustFactor); |
| if (Threshold > -stopper_ambiguity_threshold_offset) |
| Threshold = -stopper_ambiguity_threshold_offset; |
| if (WordChoice.certainty() - BestCertainty (ChoicesList) < Threshold) |
| return; |
| } |
| |
| /* see if a choice with the same text string has already been found */ |
| NewChoice = NULL; |
| Choices = ChoicesList; |
| |
| iterate(Choices) { |
| if (ChoiceSameAs (WordChoice, (VIABLE_CHOICE) first_node (Choices))) { |
| if (WordChoice.rating() < BestRating (Choices)) { |
| NewChoice = (VIABLE_CHOICE) first_node (Choices); |
| } else { |
| return; |
| } |
| } |
| } |
| |
| if (NewChoice) { |
| FillViableChoice(WordChoice, AdjustFactor, Certainties, true, NewChoice); |
| ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node); |
| } |
| else { |
| NewChoice = NewViableChoice (WordChoice, AdjustFactor, Certainties); |
| } |
| |
| ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings); |
| if (stopper_debug_level >= 2) |
| raw_choice ? PrintViableChoice (stderr, "New Raw Choice: ", NewChoice) : |
| PrintViableChoice (stderr, "New Word Choice: ", NewChoice); |
| if (count (ChoicesList) > tessedit_truncate_wordchoice_log) { |
| Choices = |
| (LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log); |
| destroy_nodes (rest (Choices), Efree); |
| set_rest(Choices, NIL); |
| } |
| |
| // Update raw_choices_/best_choices_ pointer. |
| if (raw_choice) { |
| raw_choices_ = ChoicesList; |
| } else { |
| best_choices_ = ChoicesList; |
| } |
| } /* LogNewChoice */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| int Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, |
| DANGERR *fix_pt, |
| bool fix_replaceable, |
| BLOB_CHOICE_LIST_VECTOR *blob_choices, |
| bool *modified_blobs) { |
| if (stopper_debug_level > 2) { |
| tprintf("\nRunning NoDangerousAmbig() for %s\n", |
| best_choice->debug_string(getUnicharset()).string()); |
| } |
| |
| // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities |
| // for each unichar id in BestChoice. |
| BLOB_CHOICE_LIST_VECTOR ambig_blob_choices; |
| int i; |
| bool modified_best_choice = false; |
| bool ambigs_found = false; |
| // For each position in best_choice: |
| // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i] |
| // -- initialize wrong_ngram with a single unichar_id at best_choice[i] |
| // -- look for ambiguities corresponding to wrong_ngram in the list while |
| // adding the following unichar_ids from best_choice to wrong_ngram |
| // |
| // Repeat the above procedure twice: first time look through |
| // ambigs to be replaced and replace all the ambiguities found; |
| // second time look through dangerous ambiguities and construct |
| // ambig_blob_choices with fake a blob choice for each ambiguity |
| // and pass them to dawg_permute_and_select() to search for |
| // ambiguous words in the dictionaries. |
| // |
| // Note that during the execution of the for loop (on the first pass) |
| // if replacements are made the length of best_choice might change. |
| for (int pass = 0; pass < 2; ++pass) { |
| bool replace = (pass == 0); |
| const UnicharAmbigsVector &table = replace ? |
| getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs(); |
| if (!replace) { |
| // Initialize ambig_blob_choices with lists containing a single |
| // unichar id for the correspoding position in best_choice. |
| // best_choice consisting from only the original letters will |
| // have a rating of 0.0. |
| for (i = 0; i < best_choice->length(); ++i) { |
| BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST(); |
| BLOB_CHOICE_IT lst_it(lst); |
| lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i), |
| 0.0, 0.0, 0, -1)); |
| ambig_blob_choices.push_back(lst); |
| } |
| } |
| UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; |
| int wrong_ngram_index; |
| int next_index; |
| for (i = 0; i < best_choice->length(); ++i) { |
| UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i); |
| if (stopper_debug_level > 2) { |
| tprintf("Looking for %s ngrams starting with %s:\n", |
| replace ? "replaceable" : "ambiguous", |
| getUnicharset().debug_str(curr_unichar_id).string()); |
| } |
| wrong_ngram_index = 0; |
| wrong_ngram[wrong_ngram_index] = curr_unichar_id; |
| if (curr_unichar_id == INVALID_UNICHAR_ID || |
| curr_unichar_id >= table.size() || |
| table[curr_unichar_id] == NULL) { |
| continue; // there is no ambig spec for this unichar id |
| } |
| AmbigSpec_IT spec_it(table[curr_unichar_id]); |
| for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) { |
| const AmbigSpec *ambig_spec = spec_it.data(); |
| wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID; |
| int compare = UnicharIdArrayUtils::compare(wrong_ngram, |
| ambig_spec->wrong_ngram); |
| if (stopper_debug_level > 2) { |
| tprintf("candidate ngram: "); |
| UnicharIdArrayUtils::print(wrong_ngram, getUnicharset()); |
| tprintf("current ngram from spec: "); |
| UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset()); |
| tprintf("comparison result: %d\n", compare); |
| } |
| if (compare == 0) { |
| if (replace) { |
| if (stopper_debug_level > 2) { |
| tprintf("replace ambiguity with: "); |
| UnicharIdArrayUtils::print( |
| ambig_spec->correct_fragments, getUnicharset()); |
| } |
| ReplaceAmbig(i, ambig_spec->wrong_ngram_size, |
| ambig_spec->correct_ngram_id, |
| best_choice, blob_choices, modified_blobs); |
| modified_best_choice = true; |
| } else if (i > 0 || ambig_spec->type != CASE_AMBIG) { |
| // We found dang ambig - update ambig_blob_choices. |
| if (stopper_debug_level > 2) { |
| tprintf("found ambiguity: "); |
| UnicharIdArrayUtils::print( |
| ambig_spec->correct_fragments, getUnicharset()); |
| } |
| ambigs_found = true; |
| for (int tmp_index = 0; tmp_index <= wrong_ngram_index; |
| ++tmp_index) { |
| // Add a blob choice for the corresponding fragment of the |
| // ambiguity. These fake blob choices are initialized with |
| // negative ratings (which are not possible for real blob |
| // choices), so that dawg_permute_and_select() considers any |
| // word not consisting of only the original letters a better |
| // choice and stops searching for alternatives once such a |
| // choice is found. |
| BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]); |
| bc_it.add_to_end(new BLOB_CHOICE( |
| ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, 0, -1)); |
| } |
| } |
| spec_it.forward(); |
| } else if (compare == -1) { |
| if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size && |
| ((next_index = wrong_ngram_index+1+i) < best_choice->length())) { |
| // Add the next unichar id to wrong_ngram and keep looking for |
| // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST. |
| wrong_ngram[++wrong_ngram_index] = |
| best_choice->unichar_id(next_index); |
| } else { |
| break; // no more matching ambigs in this AMBIG_SPEC_LIST |
| } |
| } else { |
| spec_it.forward(); |
| } |
| } // end searching AmbigSpec_LIST |
| } // end searching best_choice |
| } // end searching replace and dangerous ambigs |
| if (modified_best_choice) best_choice->populate_unichars(getUnicharset()); |
| // If any ambiguities were found permute the constructed ambig_blob_choices |
| // to see if an alternative dictionary word can be found. |
| if (ambigs_found) { |
| if (stopper_debug_level > 2) { |
| tprintf("\nResulting ambig_blob_choices:\n"); |
| for (i = 0; i < ambig_blob_choices.length(); ++i) { |
| print_ratings_list("", ambig_blob_choices.get(i), getUnicharset()); |
| tprintf("\n"); |
| } |
| } |
| WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0); |
| ambigs_found = (alt_word->rating() < 0.0); |
| if (ambigs_found && stopper_debug_level >= 1) { |
| tprintf ("Stopper: Possible ambiguous word = %s\n", |
| alt_word->debug_string(getUnicharset()).string()); |
| } |
| delete alt_word; |
| } |
| ambig_blob_choices.delete_data_pointers(); |
| return !ambigs_found; |
| } |
| |
| void Dict::EndDangerousAmbigs() {} |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::SettupStopperPass1() { |
| /* |
| ** Parameters: none |
| ** Variables Used: |
| ** reject_offset_ offset allowed before word is rejected |
| ** Operation: This routine performs any settup of stopper variables |
| ** that is needed in preparation for the first pass. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Mon Jun 3 12:32:00 1991, DSJ, Created. |
| */ |
| reject_offset_ = 0.0; |
| } /* SettupStopperPass1 */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::SettupStopperPass2() { |
| /* |
| ** Parameters: none |
| ** Variables Used: |
| ** reject_offset_ offset allowed before word is rejected |
| ** Operation: This routine performs any settup of stopper variables |
| ** that is needed in preparation for the second pass. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Mon Jun 3 12:32:00 1991, DSJ, Created. |
| */ |
| reject_offset_ = stopper_phase2_certainty_rejection_offset; |
| } /* SettupStopperPass2 */ |
| } // namespace tesseract |
| |
| |
| /**---------------------------------------------------------------------------- |
| Private Code |
| ----------------------------------------------------------------------------**/ |
| /*---------------------------------------------------------------------------*/ |
| void AddNewChunk(VIABLE_CHOICE Choice, int Blob) { |
| /* |
| ** Parameters: |
| ** Choice choice to add a new chunk to |
| ** Blob index of blob being split |
| ** Operation: This routine increments the chunk count of the character |
| ** in Choice which corresponds to Blob. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Mon May 20 11:43:27 1991, DSJ, Created. |
| */ |
| int i, LastChunk; |
| |
| for (i = 0, LastChunk = 0; i < Choice->Length; i++) { |
| LastChunk += Choice->Blob[i].NumChunks; |
| if (Blob < LastChunk) { |
| (Choice->Blob[i].NumChunks)++; |
| return; |
| } |
| } |
| mem_tidy (1); |
| cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n", |
| Choice->Length, LastChunk, Blob); |
| assert(FALSE); /* this should never get executed */ |
| |
| } /* AddNewChunk */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| namespace tesseract { |
| // Replaces the corresponding wrong ngram in werd_choice with the correct one. |
| // We indicate that this newly inserted ngram unichar is composed from several |
| // fragments and modify the corresponding entries in blob_choices to contain |
| // fragments of the correct ngram unichar instead of the original unichars. |
| // Ratings and certainties of entries in blob_choices and werd_choice are |
| // unichaged. E.g. for werd_choice mystring'' and ambiguity ''->": |
| // werd_choice becomes mystring", first ' in blob_choices becomes |"|0|2, |
| // second one is set to |"|1|2. |
| void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, |
| UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, |
| BLOB_CHOICE_LIST_VECTOR *blob_choices, |
| bool *modified_blobs) { |
| int num_blobs_to_replace = 0; |
| int begin_blob_index = 0; |
| int i; |
| for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) { |
| if (i >= wrong_ngram_begin_index) { |
| num_blobs_to_replace += werd_choice->fragment_length(i); |
| } else { |
| begin_blob_index += werd_choice->fragment_length(i); |
| } |
| } |
| BLOB_CHOICE_IT bit; |
| int temp_blob_index = begin_blob_index; |
| const char *temp_uch = NULL; |
| const char *correct_ngram_str = |
| getUnicharset().id_to_unichar(correct_ngram_id); |
| for (int replaced_count = 0; replaced_count < wrong_ngram_size; |
| ++replaced_count) { |
| if (blob_choices != NULL) { |
| UNICHAR_ID uch_id = werd_choice->unichar_id(wrong_ngram_begin_index); |
| int fraglen = werd_choice->fragment_length(wrong_ngram_begin_index); |
| if (fraglen > 1) temp_uch = getUnicharset().id_to_unichar(uch_id); |
| for (i = 0; i < fraglen; ++i) { |
| if (fraglen > 1) { |
| STRING frag_str = |
| CHAR_FRAGMENT::to_string(temp_uch, i, fraglen); |
| getUnicharset().unichar_insert(frag_str.string()); |
| uch_id = getUnicharset().unichar_to_id(frag_str.string()); |
| } |
| bit.set_to_list(blob_choices->get(temp_blob_index)); |
| STRING correct_frag_uch = |
| CHAR_FRAGMENT::to_string(correct_ngram_str, |
| temp_blob_index - begin_blob_index, |
| num_blobs_to_replace); |
| getUnicharset().unichar_insert(correct_frag_uch.string()); |
| UNICHAR_ID correct_frag_uch_id = |
| getUnicharset().unichar_to_id(correct_frag_uch.string()); |
| // Find the WERD_CHOICE corresponding to the original unichar in |
| // the list of blob choices, add the derived character fragment |
| // before it with the same rating and certainty. |
| for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) { |
| if (bit.data()->unichar_id() == correct_frag_uch_id) { |
| break; // the unichar we want to insert is already there |
| } |
| if (bit.data()->unichar_id() == uch_id) { |
| bit.add_before_then_move(new BLOB_CHOICE(*(bit.data()))); |
| bit.data()->set_unichar_id(correct_frag_uch_id); |
| if (modified_blobs != NULL) *modified_blobs = true; |
| break; |
| } |
| } |
| temp_blob_index++; |
| } |
| } |
| // Remove current unichar from werd_choice. On the last iteration |
| // set the correct replacement unichar instead of removing a unichar. |
| if (replaced_count + 1 == wrong_ngram_size) { |
| werd_choice->set_unichar_id(correct_ngram_id, |
| num_blobs_to_replace, 0.0, 0.0, wrong_ngram_begin_index); |
| } else { |
| werd_choice->remove_unichar_id(wrong_ngram_begin_index); |
| } |
| } |
| if (stopper_debug_level >= 1) { |
| tprintf("ReplaceAmbigs() modified werd_choice: %s\n", |
| werd_choice->debug_string(getUnicharset()).string()); |
| werd_choice->print(); |
| if (modified_blobs != NULL && *modified_blobs && blob_choices != NULL) { |
| tprintf("Modified blob_choices: "); |
| for (int i = 0; i < blob_choices->size(); ++i) { |
| print_ratings_list("\n", blob_choices->get(i), getUnicharset()); |
| } |
| } |
| } |
| } |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| int Dict::ChoiceSameAs(const WERD_CHOICE &WordChoice, |
| VIABLE_CHOICE ViableChoice) { |
| /* |
| ** Parameters: |
| ** Choice choice to compare to ViableChoice |
| ** ViableChoice viable choice to compare to Choice |
| ** Operation: This routine compares the corresponding strings of |
| ** Choice and ViableChoice and returns TRUE if they are the |
| ** same, FALSE otherwise. |
| ** Return: TRUE or FALSE. |
| ** Exceptions: none |
| ** History: Fri May 17 08:48:04 1991, DSJ, Created. |
| */ |
| return (StringSameAs(WordChoice, ViableChoice)); |
| |
| } /* ChoiceSameAs */ |
| } // namespace tesseract |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1, |
| void *arg2) { //VIABLE_CHOICE Choice2) |
| /* |
| ** Parameters: |
| ** Choice1, Choice2 choices to compare ratings for |
| ** Operation: Return -1 if the rating for Choice1 is less than the |
| ** rating for Choice2, otherwise return (1). |
| ** Return: -1 or 1 |
| ** Exceptions: none |
| ** History: Wed May 15 13:02:37 1991, DSJ, Created. |
| */ |
| float R1, R2; |
| VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1; |
| VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2; |
| |
| R1 = Choice1->Rating; |
| R2 = Choice2->Rating; |
| |
| if (R1 < R2) |
| return (-1); |
| else |
| return (1); |
| |
| } /* CmpChoiceRatings */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice) { |
| /* |
| ** Parameters: |
| ** Choice choice to be expanded |
| ** ExpandedChoice place to put resulting expanded choice |
| ** Operation: This routine expands Choice and places the results |
| ** in ExpandedChoice. The primary function of expansion |
| ** is to create an two arrays, one which holds the corresponding |
| ** certainty for each chunk in Choice, and one which holds |
| ** the class for each chunk. |
| ** Return: none (results are placed in ExpandedChoice) |
| ** Exceptions: none |
| ** History: Fri May 31 15:21:57 1991, DSJ, Created. |
| */ |
| int i, j, Chunk; |
| |
| ExpandedChoice->Choice = Choice; |
| for (i = 0, Chunk = 0; i < Choice->Length; i++) |
| for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) { |
| ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty; |
| ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class; |
| } |
| } /* ExpandChoice */ |
| |
| /*---------------------------------------------------------------------------*/ |
| int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice, |
| void *item2) { //EXPANDED_CHOICE *BestChoice) |
| /* |
| ** Parameters: |
| ** Choice choice to be tested |
| ** BestChoice best choice found |
| ** Variables Used: |
| ** stopper_ambiguity_threshold_gain |
| ** stopper_ambiguity_threshold_offset |
| ** Operation: If the certainty of any chunk in Choice is not ambiguous |
| ** with the corresponding chunk in the best choice, free |
| ** Choice and return TRUE. Otherwise, return FALSE. |
| ** Return: TRUE or FALSE. |
| ** Exceptions: none |
| ** History: Wed May 15 13:20:26 1991, DSJ, Created. |
| */ |
| int i, j, Chunk; |
| FLOAT32 Threshold; |
| VIABLE_CHOICE Choice; |
| EXPANDED_CHOICE *BestChoice; |
| |
| Choice = (VIABLE_CHOICE) item1; |
| BestChoice = (EXPANDED_CHOICE *) item2; |
| |
| Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor, |
| Choice->AdjustFactor); |
| |
| for (i = 0, Chunk = 0; i < Choice->Length; i++) |
| for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) |
| if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] && |
| Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] < |
| Threshold) { |
| memfree(Choice); |
| return (TRUE); |
| } |
| |
| return (FALSE); |
| |
| } /* FreeBadChoice */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| namespace tesseract { |
| int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) { |
| /* |
| ** Parameters: |
| ** Word word to be tested |
| ** Operation: Return the length of the shortest alpha run in Word. |
| ** Return: Return the length of the shortest alpha run in Word. |
| ** Exceptions: none |
| ** History: Tue May 14 07:50:45 1991, DSJ, Created. |
| */ |
| register int Shortest = MAX_INT32; |
| register int Length; |
| int x; |
| int y; |
| |
| for (x = 0; x < WordChoice.length(); ++x) { |
| if (getUnicharset().get_isalpha(WordChoice.unichar_id(x))) { |
| for (y = x + 1, Length = 1; |
| y < WordChoice.length() && |
| getUnicharset().get_isalpha(WordChoice.unichar_id(y)); |
| ++y, ++Length); |
| if (Length < Shortest) { |
| Shortest = Length; |
| } |
| if (y == WordChoice.length()) { |
| break; |
| } |
| } |
| } |
| if (Shortest == MAX_INT32) |
| Shortest = 0; |
| |
| return (Shortest); |
| |
| } /* LengthOfShortestAlphaRun */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice, |
| FLOAT32 AdjustFactor, |
| const float Certainties[]) { |
| /* |
| ** Parameters: |
| ** Choice choice to be converted to a viable choice |
| ** AdjustFactor factor used to adjust ratings for Choice |
| ** Certainties certainty for each character in Choice |
| ** Variables Used: |
| ** current_segmentation segmentation corresponding to Choice |
| ** Operation: Allocate a new viable choice data structure, copy |
| ** Choice, Certainties, and current_segmentation_ into it, |
| ** and return a pointer to it. |
| ** Return: Ptr to new viable choice. |
| ** Exceptions: none |
| ** History: Thu May 16 15:28:29 1991, DSJ, Created. |
| */ |
| int Length = WordChoice.length(); |
| assert (Length <= MAX_NUM_CHUNKS && Length > 0); |
| VIABLE_CHOICE NewChoice = (VIABLE_CHOICE) Emalloc ( |
| sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof (CHAR_CHOICE)); |
| FillViableChoice(WordChoice, AdjustFactor, Certainties, false, NewChoice); |
| return (NewChoice); |
| } /* NewViableChoice */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) { |
| /* |
| ** Parameters: |
| ** File open text file to print Choice to |
| ** Label text label to be printed with Choice |
| ** Choice choice to be printed |
| ** Operation: This routine dumps a text representation of the |
| ** specified Choice to File. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Mon May 20 11:16:44 1991, DSJ, Created. |
| */ |
| int i, j; |
| |
| fprintf (File, "%s", Label); |
| |
| fprintf(File, "(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d) ", |
| Choice->Rating, Choice->Certainty, |
| Choice->AdjustFactor, Choice->ComposedFromCharFragments); |
| |
| for (i = 0; i < Choice->Length; i++) |
| fprintf(File, "%s", getUnicharset().id_to_unichar(Choice->Blob[i].Class)); |
| fprintf(File, "\n"); |
| |
| for (i = 0; i < Choice->Length; i++) { |
| fprintf(File, " %s", getUnicharset().id_to_unichar(Choice->Blob[i].Class)); |
| for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++) |
| fprintf(File, " "); |
| } |
| fprintf(File, "\n"); |
| |
| for (i = 0; i < Choice->Length; i++) { |
| for (j = 0; j < Choice->Blob[i].NumChunks; j++) |
| fprintf(File, "%3d ", (int) (Choice->Blob[i].Certainty * -10.0)); |
| } |
| fprintf(File, "\n"); |
| |
| for (i = 0; i < Choice->Length; i++) { |
| for (j = 0; j < Choice->Blob[i].NumChunks; j++) |
| fprintf(File, "%3d ", Choice->Blob[i].NumChunks); |
| } |
| fprintf(File, "\n"); |
| } /* PrintViableChoice */ |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| void Dict::FillViableChoice(const WERD_CHOICE &WordChoice, |
| FLOAT32 AdjustFactor, const float Certainties[], |
| bool SameString, VIABLE_CHOICE ViableChoice) { |
| /* |
| ** Parameters: |
| ** WordChoice a choice with info that will be copied |
| ** AdjustFactor factor used to adjust ratings for AChoice |
| ** Certainties certainty for each character in AChoice |
| ** SameString if true the string in the viable choice |
| ** will not be changed |
| ** ViableChoice existing viable choice to fill in |
| ** Variables Used: |
| ** current_segmentation_ segmentation for NewChoice |
| ** Operation: |
| ** Fill ViableChoice with information from AChoice, |
| ** AdjustFactor, and Certainties. |
| ** Return: none |
| ** Exceptions: none |
| ** History: Fri May 17 13:35:58 1991, DSJ, Created. |
| */ |
| CHAR_CHOICE *NewChar; |
| BLOB_WIDTH *BlobWidth; |
| int x; |
| |
| ViableChoice->Rating = WordChoice.rating(); |
| ViableChoice->Certainty = WordChoice.certainty(); |
| ViableChoice->AdjustFactor = AdjustFactor; |
| ViableChoice->ComposedFromCharFragments = false; |
| if (!SameString) { |
| ViableChoice->Length = WordChoice.length(); |
| } |
| for (x = 0, |
| NewChar = &(ViableChoice->Blob[0]), |
| BlobWidth = current_segmentation_; |
| x < WordChoice.length(); |
| x++, NewChar++, Certainties++, BlobWidth++) { |
| if (!SameString) { |
| NewChar->Class = WordChoice.unichar_id(x); |
| } |
| NewChar->NumChunks = *BlobWidth; |
| NewChar->Certainty = *Certainties; |
| for (int i = 1; i < WordChoice.fragment_length(x); ++i) { |
| BlobWidth++; |
| assert(*BlobWidth > 0); |
| NewChar->NumChunks += *BlobWidth; |
| ViableChoice->ComposedFromCharFragments = true; |
| } |
| } |
| } /* FillViableChoice */ |
| |
| |
| // Compares unichar ids in word_choice to those in viable_choice, |
| // returns true if they are the same, false otherwise. |
| bool Dict::StringSameAs(const WERD_CHOICE &WordChoice, |
| VIABLE_CHOICE ViableChoice) { |
| if (WordChoice.length() != ViableChoice->Length) { |
| return false; |
| } |
| int i; |
| CHAR_CHOICE *CharChoice; |
| for (i = 0, CharChoice = &(ViableChoice->Blob[0]); |
| i < ViableChoice->Length; CharChoice++, i++) { |
| if (CharChoice->Class != WordChoice.unichar_id(i)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /*---------------------------------------------------------------------------*/ |
| int Dict::StringSameAs(const char *String, |
| const char *String_lengths, |
| VIABLE_CHOICE ViableChoice) { |
| /* |
| ** Parameters: |
| ** String string to compare to ViableChoice |
| ** String_lengths lengths of unichars in String |
| ** ViableChoice viable choice to compare to String |
| ** Operation: This routine compares String to ViableChoice and |
| ** returns TRUE if they are the same, FALSE otherwise. |
| ** Return: TRUE or FALSE. |
| ** Exceptions: none |
| ** History: Fri May 17 08:48:04 1991, DSJ, Created. |
| */ |
| CHAR_CHOICE *Char; |
| int i; |
| int current_unichar_length; |
| |
| for (Char = &(ViableChoice->Blob[0]), i = 0; |
| i < ViableChoice->Length; |
| String += *(String_lengths++), Char++, i++) { |
| current_unichar_length = strlen(getUnicharset().id_to_unichar(Char->Class)); |
| if (current_unichar_length != *String_lengths || |
| strncmp(String, getUnicharset().id_to_unichar(Char->Class), |
| current_unichar_length) != 0) |
| return (FALSE); |
| } |
| |
| if (*String == 0) |
| return (TRUE); |
| else |
| return (FALSE); |
| |
| } /* StringSameAs */ |
| } // namespace tesseract |
| |
| /*---------------------------------------------------------------------------*/ |
| int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices, |
| const WERD_CHOICE &BestChoice) { |
| /* |
| ** Parameters: |
| ** Choices choices for current segmentation |
| ** BestChoice best choice for current segmentation |
| ** Variables Used: |
| ** stopper_allowable_character_badness |
| ** max allowed certainty variation |
| ** Operation: This routine returns TRUE if the certainty of the |
| ** BestChoice word is within a reasonable range of the average |
| ** certainties for the best choices for each character in |
| ** the segmentation. This test is used to catch words in which |
| ** one character is much worse than the other characters in |
| ** the word (i.e. FALSE will be returned in that case). |
| ** The algorithm computes the mean and std deviation of the |
| ** certainties in the word with the worst certainty thrown out. |
| ** Return: TRUE or FALSE. |
| ** Exceptions: none |
| ** History: Tue May 14 08:23:21 1991, DSJ, Created. |
| */ |
| float Certainty; |
| float WorstCertainty = MAX_FLOAT32; |
| float CertaintyThreshold; |
| FLOAT64 TotalCertainty; |
| FLOAT64 TotalCertaintySquared; |
| FLOAT64 Variance; |
| FLOAT32 Mean, StdDev; |
| int WordLength; |
| |
| WordLength = Choices.length(); |
| if (WordLength < 3) |
| return (TRUE); |
| |
| TotalCertainty = TotalCertaintySquared = 0.0; |
| BLOB_CHOICE_IT BlobChoiceIt; |
| for (int i = 0; i < Choices.length(); ++i) { |
| BlobChoiceIt.set_to_list(Choices.get(i)); |
| Certainty = BlobChoiceIt.data()->certainty(); |
| TotalCertainty += Certainty; |
| TotalCertaintySquared += Certainty * Certainty; |
| if (Certainty < WorstCertainty) |
| WorstCertainty = Certainty; |
| } |
| |
| /* subtract off worst certainty from statistics */ |
| WordLength--; |
| TotalCertainty -= WorstCertainty; |
| TotalCertaintySquared -= WorstCertainty * WorstCertainty; |
| |
| Mean = TotalCertainty / WordLength; |
| Variance = ((WordLength * TotalCertaintySquared - |
| TotalCertainty * TotalCertainty) / |
| (WordLength * (WordLength - 1))); |
| if (Variance < 0.0) |
| Variance = 0.0; |
| StdDev = sqrt (Variance); |
| |
| CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev; |
| if (CertaintyThreshold > stopper_nondict_certainty_base) |
| CertaintyThreshold = stopper_nondict_certainty_base; |
| |
| if (BestChoice.certainty() < CertaintyThreshold) { |
| if (stopper_debug_level >= 1) |
| cprintf("Stopper: Non-uniform certainty = %4.1f" |
| " (m=%4.1f, s=%4.1f, t=%4.1f)\n", |
| BestChoice.certainty(), Mean, StdDev, CertaintyThreshold); |
| return (FALSE); |
| } else { |
| return (TRUE); |
| } |
| } /* UniformCertainties */ |