| /* -*-C-*- |
| ******************************************************************************** |
| * |
| * File: permdawg.c (Formerly permdawg.c) |
| * Description: Scale word choices by a dictionary |
| * Author: Mark Seaman, OCR Technology |
| * Created: Fri Oct 16 14:37:00 1987 |
| * Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt |
| * Language: C |
| * Package: N/A |
| * Status: Reusable Software Component |
| * |
| * (c) Copyright 1987, Hewlett-Packard Company. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| * |
| *********************************************************************************/ |
| /*---------------------------------------------------------------------- |
| I n c l u d e s |
| ----------------------------------------------------------------------*/ |
| #include "permdawg.h" |
| #include "debug.h" |
| #include "hyphen.h" |
| #include "permute.h" |
| #include "tordvars.h" |
| #include "context.h" |
| #include "stopper.h" |
| #include "freelist.h" |
| #include "globals.h" |
| #include "tprintf.h" |
| #include "cutil.h" |
| #include "dawg.h" |
| #include "ndminx.h" |
| #include "varable.h" |
| #include "conversion.h" |
| |
| #include <ctype.h> |
| #include "dict.h" |
| #include "image.h" |
| |
| /*---------------------------------------------------------------------- |
| T y p e s |
| ----------------------------------------------------------------------*/ |
| #define FREQ_WERD 1.0 |
| #define GOOD_WERD 1.1 |
| #define OK_WERD 1.3125 |
| #define MAX_FREQ_EDGES 1500 |
| #define NO_RATING -1 |
| |
| /*---------------------------------------------------------------------- |
| V a r i a b l e s |
| ----------------------------------------------------------------------*/ |
| static EDGE_ARRAY frequent_words; |
| static float rating_margin; |
| static float rating_pad = 5.0; |
| |
| BOOL_VAR(segment_dawg_debug, 0, "Debug mode for word segmentation"); |
| |
| double_VAR(segment_penalty_dict_punc_bad, OK_WERD, |
| "Default score multiplier for word matches, which may have case or " |
| "punctuation issues (lower is better)."); |
| |
| double_VAR(segment_penalty_dict_punc_ok, GOOD_WERD, |
| "Score multiplier for word matches that have good case " |
| "(lower is better)."); |
| |
| double_VAR(segment_penalty_dict_frequent_word, FREQ_WERD, |
| "Score multiplier for word matches which have good case and are " |
| "frequent in the given language (lower is better)."); |
| |
| /*---------------------------------------------------------------------- |
| F u n c t i o n s |
| ----------------------------------------------------------------------*/ |
| /********************************************************************** |
| * adjust_word |
| * |
| * Assign an adjusted value to a string that is a word. The value |
| * that this word choice has is based on case and punctuation rules. |
| **********************************************************************/ |
| namespace tesseract { |
| void Dict::adjust_word(A_CHOICE *best_choice, float *certainty_array) { |
| char *this_word; |
| int punct_status; |
| float adjust_factor; |
| |
| if (segment_adjust_debug) |
| tprintf ("Word: %s %4.2f ", |
| class_string (best_choice), class_rating (best_choice)); |
| |
| this_word = class_string (best_choice); |
| punct_status = punctuation_ok (this_word, class_lengths (best_choice)); |
| |
| class_rating (best_choice) += RATING_PAD; |
| if (case_ok (this_word, class_lengths (best_choice)) |
| && punct_status != -1) { |
| if (punct_status < 1 && word_in_dawg (frequent_words, this_word)) { |
| class_rating (best_choice) *= segment_penalty_dict_frequent_word; |
| class_permuter (best_choice) = FREQ_DAWG_PERM; |
| adjust_factor = segment_penalty_dict_frequent_word; |
| if (segment_adjust_debug) |
| tprintf(", F, %4.2f ", (double)segment_penalty_dict_frequent_word); |
| } |
| else { |
| class_rating (best_choice) *= segment_penalty_dict_punc_ok; |
| adjust_factor = segment_penalty_dict_punc_ok; |
| if (segment_adjust_debug) |
| tprintf(", %4.2f ", (double)segment_penalty_dict_punc_ok); |
| } |
| } |
| else { |
| class_rating (best_choice) *= segment_penalty_dict_punc_bad; |
| adjust_factor = segment_penalty_dict_punc_bad; |
| if (segment_adjust_debug) { |
| if (!case_ok (this_word, class_lengths (best_choice))) |
| tprintf(", C"); |
| if (punctuation_ok (this_word, class_lengths (best_choice)) == -1) |
| tprintf(", P"); |
| tprintf(", %4.2f ", (double)segment_penalty_dict_punc_bad); |
| } |
| } |
| |
| class_rating (best_choice) -= RATING_PAD; |
| |
| LogNewWordChoice(best_choice, adjust_factor, |
| certainty_array, getUnicharset()); |
| |
| if (segment_adjust_debug) |
| tprintf(" --> %4.2f\n", class_rating (best_choice)); |
| } |
| |
| |
| /********************************************************************** |
| * append_next_choice |
| * |
| * Check to see whether or not the next choice is worth appending to |
| * the string being generated. If so then keep going deeper into the |
| * word. |
| **********************************************************************/ |
| void Dict::append_next_choice( /*previous option */ |
| EDGE_ARRAY dawg, |
| NODE_REF node, |
| char permuter, |
| char *word, |
| char unichar_lengths[], |
| int unichar_offsets[], |
| CHOICES_LIST choices, |
| int char_choice_index, |
| int word_index, |
| A_CHOICE *this_choice, |
| const char *prevchar, |
| float *limit, |
| float rating, |
| float certainty, |
| float *rating_array, |
| float *certainty_array, |
| int word_ending, |
| int last_word, |
| const CHAR_FRAGMENT_INFO *prev_char_frag_info, |
| char fragment_lengths[], |
| CHOICES *result) { |
| A_CHOICE *better_choice; |
| CHAR_FRAGMENT_INFO char_frag_info; |
| const char *ch = NULL; |
| |
| /* Deal with fragments */ |
| if (!fragment_state_okay( |
| getUnicharset().unichar_to_id(class_string(this_choice)), |
| class_rating(this_choice), class_certainty(this_choice), |
| prev_char_frag_info, |
| (segment_dawg_debug && (fragments_debug > 1)) ? "dawg_debug" : NULL, |
| word_ending, &char_frag_info)) { |
| return; // this_choice must be an invalid fragment |
| } |
| if (char_frag_info.unichar_id != INVALID_UNICHAR_ID) { |
| ch = getUnicharset().id_to_unichar(char_frag_info.unichar_id); |
| } |
| if (ch == NULL) { // this character is a fragment |
| JOIN_ON(*result, // so search the next letter |
| dawg_permute(dawg, node, permuter, choices, |
| char_choice_index + 1, word_index, |
| limit, word, unichar_lengths, unichar_offsets, |
| rating, certainty, rating_array, certainty_array, |
| last_word, &char_frag_info, fragment_lengths)); |
| return; |
| } |
| |
| /* Add new character */ |
| strcpy(word + unichar_offsets[word_index], ch); |
| |
| unichar_lengths[word_index] = strlen(ch); |
| unichar_lengths[word_index + 1] = 0; |
| fragment_lengths[word_index] = char_frag_info.num_fragments; |
| fragment_lengths[word_index + 1] = 0; |
| unichar_offsets[word_index + 1] = |
| unichar_offsets[word_index] + unichar_lengths[word_index]; |
| if (word[unichar_offsets[word_index]] == '\0') { |
| word[unichar_offsets[word_index]] = ' '; |
| word[unichar_offsets[word_index] + 1] = '\0'; |
| unichar_lengths[word_index] = 1; |
| unichar_lengths[word_index + 1] = 0; |
| fragment_lengths[word_index] = 1; |
| fragment_lengths[word_index + 1] = 0; |
| unichar_offsets[word_index + 1] = unichar_offsets[word_index] + |
| unichar_lengths[word_index]; |
| } |
| certainty_array[word_index] = char_frag_info.certainty; |
| rating += char_frag_info.rating; |
| certainty = MIN (char_frag_info.certainty, certainty); |
| |
| /* Prune bad subwords */ |
| if (rating_array[char_choice_index] == NO_RATING) { |
| rating_array[char_choice_index] = rating; |
| } else { |
| if (rating_array[word_index] * rating_margin + rating_pad < rating) { |
| if (segment_dawg_debug) { |
| tprintf("early pruned word rating=%4.2f, limit=%4.2f", rating, *limit); |
| print_word_string(word); |
| tprintf("\n"); |
| } |
| return; |
| } |
| } |
| |
| /* Deal with hyphens */ |
| if (word_ending && last_word && word[unichar_offsets[word_index]] == '-' && |
| word_index > 0) { |
| *limit = rating; |
| if (segment_dawg_debug) |
| tprintf("new hyphen choice = %s\n", word); |
| better_choice = new_choice (word, unichar_lengths, rating, certainty, |
| -1, NULL, permuter, false, fragment_lengths); |
| |
| adjust_word(better_choice, certainty_array); |
| push_on(*result, better_choice); |
| if (permuter == SYSTEM_DAWG_PERM) { |
| // Until dawg is a class, it is only safe to use set_hyphen_word |
| // on a single dawg type, as it saves permuter state to use on another |
| // word in hyphen_state, and this *must* apply to the same dawg. |
| set_hyphen_word(word, unichar_lengths, unichar_offsets, |
| rating, node, char_choice_index, fragment_lengths); |
| } |
| } |
| /* Look up char in DAWG */ |
| else { |
| int sub_offset = 0; |
| NODE_REF node_saved = node; |
| while (sub_offset < unichar_lengths[word_index] && |
| (this->*letter_is_okay_)(dawg, &node, |
| unichar_offsets[word_index] + sub_offset, |
| *prevchar, word, word_ending && |
| sub_offset == |
| unichar_lengths[word_index] - 1)) { |
| ++sub_offset; |
| } |
| if (sub_offset == unichar_lengths[word_index]) { |
| /* Add a new word choice */ |
| if (word_ending) { |
| if (segment_dawg_debug == 1) |
| tprintf("new choice = %s\n", word); |
| *limit = rating; |
| |
| better_choice = |
| new_choice (hyphen_tail (word), unichar_lengths + hyphen_base_size(), |
| rating, certainty, -1, NULL, permuter, false, |
| fragment_lengths + hyphen_base_size()); |
| adjust_word (better_choice, &certainty_array[hyphen_base_size()]); |
| push_on(*result, better_choice); |
| } |
| else { |
| /* Search the next letter */ |
| JOIN_ON (*result, |
| dawg_permute (dawg, node, permuter, choices, |
| char_choice_index + 1, word_index + 1, limit, |
| word, unichar_lengths, unichar_offsets, rating, |
| certainty, rating_array, certainty_array, |
| last_word, &char_frag_info, fragment_lengths)); |
| } |
| } else { |
| if (segment_dawg_debug == 1) { |
| tprintf("letter not OK at char %d, index %d + sub index %d/%d\n", |
| word_index, unichar_offsets[word_index], |
| sub_offset, unichar_lengths[word_index]); |
| tprintf("Word"); |
| print_word_string(word); |
| tprintf("\nRejected tail"); |
| print_word_string(word + unichar_offsets[word_index]); |
| tprintf("\n"); |
| } |
| if (node != 0) |
| node = node_saved; |
| } |
| } |
| } |
| |
| |
| /********************************************************************** |
| * dawg_permute |
| * |
| * Permute all the valid words that can be created with this starting |
| * point. The node (in the DAWG) and the word string define a base |
| * from which to start adding the remaining character choices. |
| **********************************************************************/ |
| CHOICES Dict::dawg_permute(EDGE_ARRAY dawg, |
| NODE_REF node, |
| char permuter, |
| CHOICES_LIST choices, |
| int char_choice_index, |
| int word_index, |
| float *limit, |
| char *word, |
| char unichar_lengths[], |
| int unichar_offsets[], |
| float rating, |
| float certainty, |
| float *rating_array, |
| float *certainty_array, |
| int last_word, |
| const CHAR_FRAGMENT_INFO *prev_char_frag_info, |
| char fragment_lengths[]) { |
| CHOICES result = NIL; |
| CHOICES c; |
| char *prevchar; |
| int word_ending = FALSE; |
| |
| if (segment_dawg_debug) { |
| tprintf("dawg_permute (node=" REFFORMAT ", char_choice_index=%d," |
| " word_index=%d, limit=%f, word=", |
| node, char_choice_index, word_index, *limit); |
| print_word_string(word); |
| tprintf(", rating=%4.2f, certainty=%4.2f)\n", rating, certainty); |
| } |
| |
| /* Check for EOW */ |
| if (1 + char_choice_index == array_count (choices) + hyphen_char_choice_size()) { |
| word_ending = TRUE; |
| } |
| |
| if (char_choice_index < array_count (choices) + hyphen_char_choice_size()) { |
| prevchar = NULL; |
| iterate_list (c, (CHOICES) array_index (choices, |
| char_choice_index - hyphen_char_choice_size())) { |
| append_next_choice (dawg, node, permuter, word, unichar_lengths, |
| unichar_offsets, choices, char_choice_index, |
| word_index, (A_CHOICE *) first_node (c), |
| prevchar != NULL ? prevchar : "", limit, |
| rating, certainty, rating_array, certainty_array, |
| word_ending, last_word, prev_char_frag_info, |
| fragment_lengths, &result); |
| prevchar = best_string (c); |
| } |
| } |
| |
| if (result && (segment_dawg_debug == 1)) |
| print_choices ("dawg_permute", result); |
| |
| return (result); |
| } |
| |
| /********************************************************************** |
| * dawg_permute_and_select |
| * |
| * Use a DAWG type data structure to enumerate all the valid strings |
| * in some gramar. Compare each of the choices against the best choice |
| * so far. Update the best choice if needed. |
| **********************************************************************/ |
| void Dict::dawg_permute_and_select(const char *string, |
| EDGE_ARRAY dawg, |
| char permuter, |
| CHOICES_LIST character_choices, |
| A_CHOICE *best_choice) { |
| CHOICES result = NIL; |
| char word[UNICHAR_LEN * MAX_WERD_LENGTH + 1]; |
| char unichar_lengths[MAX_WERD_LENGTH + 1]; |
| int unichar_offsets[MAX_WERD_LENGTH + 1]; |
| float certainty_array[MAX_WERD_LENGTH + 1]; |
| float rating_array[MAX_WERD_LENGTH + 1]; |
| char fragment_lengths[MAX_WERD_LENGTH + 1]; // num fragments from which |
| // each char was constructed |
| float rating; |
| int char_choice_index = 0; |
| int word_index = 0; |
| NODE_REF dawg_node = 0; |
| |
| /* Pruning margin ratio */ |
| rating_margin = segment_penalty_dict_punc_bad / segment_penalty_dict_punc_ok; |
| |
| word[0] = '\0'; |
| unichar_lengths[0] = 0; |
| fragment_lengths[0] = 0; |
| unichar_offsets[0] = 0; |
| rating = class_rating (best_choice); |
| |
| for (int i = 0; i < MAX_WERD_LENGTH + 1; ++i) { |
| rating_array[i] = NO_RATING; |
| } |
| |
| if (!is_last_word () && hyphen_string) { |
| strcpy(word, hyphen_string); |
| strcpy(unichar_lengths, hyphen_unichar_lengths); |
| strcpy(fragment_lengths, hyphen_fragment_lengths); |
| memcpy(unichar_offsets, hyphen_unichar_offsets, |
| (hyphen_base_size()) * sizeof (int)); |
| unichar_offsets[hyphen_base_size()] = |
| unichar_offsets[hyphen_base_size() - 1] + |
| unichar_lengths[hyphen_base_size() - 1]; |
| char_choice_index = hyphen_char_choice_size(); |
| word_index = strlen (hyphen_unichar_lengths); |
| if (permuter == SYSTEM_DAWG_PERM) |
| // Until dawg is a class, it is only safe to use set_hyphen_word |
| // on a single dawg type, as it saves permuter state to use on another |
| // word in hyphen_state, and this *must* apply to the same dawg. |
| dawg_node = hyphen_state; |
| } |
| |
| result = dawg_permute (dawg, dawg_node, permuter, character_choices, |
| char_choice_index, word_index, &rating, word, unichar_lengths, |
| unichar_offsets, 0.0, 0.0, rating_array, certainty_array, |
| is_last_word(), NULL, fragment_lengths); |
| |
| if (display_ratings && result) { |
| print_choices(string, result); |
| } |
| |
| while (result != NIL) { |
| if (best_rating (result) < class_rating (best_choice)) { |
| clone_choice (best_choice, (A_CHOICE *) first_node (result)); |
| } |
| free_choice (first_node (result)); |
| pop_off(result); |
| } |
| } |
| |
| |
| /********************************************************************** |
| * init_permdawg |
| * |
| * Initialize the variables needed by this file. |
| **********************************************************************/ |
| void Dict::init_permdawg() { |
| STRING name; |
| name = getImage()->getCCUtil()->language_data_path_prefix; |
| name += "freq-dawg"; |
| frequent_words = read_squished_dawg(name.string()); |
| } |
| |
| void Dict::end_permdawg() { |
| memfree(frequent_words); |
| frequent_words = NULL; |
| } |
| |
| |
| /********************************************************************** |
| * test_freq_words() |
| * |
| * Tests a word against the frequent word dawg |
| **********************************************************************/ |
| int Dict::test_freq_words(const char *word) { |
| return (word_in_dawg (frequent_words, word)); |
| } |
| } // namespace tesseract |