| /////////////////////////////////////////////////////////////////////// |
| // File: permngram.cpp |
| // Description: Character n-gram permuter |
| // Author: Thomas Kielbus |
| // Created: Wed Sep 12 11:26:43 PDT 2007 |
| // |
| // (C) Copyright 2007, Google Inc. |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| /////////////////////////////////////////////////////////////////////// |
| |
| #include "const.h" |
| #include "permngram.h" |
| #include "permute.h" |
| #include "dawg.h" |
| #include "tordvars.h" |
| #include "stopper.h" |
| #include "globals.h" |
| #include "context.h" |
| #include "ndminx.h" |
| #include "dict.h" |
| #include "conversion.h" |
| |
| #include <math.h> |
| #include <ctype.h> |
| |
| // Ratio to control the relative importance of the classifier and the ngram |
| // in the final score of a classification unit. Must be >= 0 and <= 1. |
| // A value of 1.0 uses only the shape classifier score. |
| // A value of 0.0 uses only the ngram score. |
| double_VAR(classifier_score_ngram_score_ratio, |
| 0.7, |
| ""); |
| |
| // Rating adjustment multiplier for words not in the DAWG. Must be >= 1. |
| double_VAR(non_dawg_prefix_rating_adjustment, |
| 1.5, |
| ""); |
| |
| // HypothesisPrefix represents a word prefix during the search of the |
| // character-level n-gram model based permuter. |
| // It holds the data needed to create the corresponding A_CHOICE. |
| // Note that the string stored in the _word data member always begin with a |
| // space character. This is used by the n-gram model to score the word. |
| // HypothesisPrefix also contains the node in the DAWG that is reached when |
| // searching for the corresponding prefix. |
| class HypothesisPrefix { |
| public: |
| HypothesisPrefix(); |
| HypothesisPrefix(const HypothesisPrefix& prefix, |
| A_CHOICE* choice, |
| bool end_of_word, |
| const tesseract::Dawg *dawg, |
| tesseract::Dict* dict); |
| |
| double rating() const {return rating_;} |
| double certainty() const {return certainty_;} |
| const char* word() const {return word_;} |
| const char* unichar_lengths() const {return unichar_lengths_;} |
| const float* certainty_array() const {return certainty_array_;} |
| bool is_dawg_prefix() const {return is_dawg_prefix_;} |
| NODE_REF dawg_node() const {return dawg_node_;} |
| |
| private: |
| double rating_; |
| double certainty_; |
| char word_[UNICHAR_LEN * MAX_WERD_LENGTH + 2]; |
| char unichar_lengths_[MAX_WERD_LENGTH + 1]; |
| float certainty_array_[MAX_WERD_LENGTH + 1]; |
| NODE_REF dawg_node_; |
| bool is_dawg_prefix_; |
| }; |
| |
| // HypothesisPrefix is the class used as nodes in HypothesisPrefixLists |
| typedef HypothesisPrefix HypothesisPrefixListNode; |
| |
| // HypothesisPrefixList maintains a sorted list of HypothesisPrefixes. The size |
| // is bounded by the argument given to the constructor. |
| // For the sake of simplicity, current implementation is not as efficient as it |
| // could be. The list is represented by a static array of pointers to its |
| // elements. All nodes are stored in positions from 0 to (size() - 1). |
| class HypothesisPrefixList { |
| public: |
| HypothesisPrefixList(int size_bound); |
| ~HypothesisPrefixList(); |
| |
| void add_node(HypothesisPrefix* node); |
| int size() const {return _size;} |
| void clear(); |
| const HypothesisPrefix& node(int index) {return *_list_nodes[index];} |
| |
| private: |
| HypothesisPrefix** _list_nodes; |
| int _size_bound; |
| int _size; |
| }; |
| |
| // Return the classifier_score_ngram_score_ratio for a given choice string. |
| // The classification decision for characters like comma and period should |
| // be based only on shape rather than on shape and n-gram score. |
| // Return 1.0 for them, the default classifier_score_ngram_score_ratio |
| // otherwise. |
| static double get_classifier_score_ngram_score_ratio(const char* choice); |
| |
| // Permute the given char_choices using a character level n-gram model and |
| // return the best word choice found. |
| // This is performed by maintaining a HypothesisPrefixList of HypothesisPrefixes. |
| // For each character position, each possible character choice is appended to |
| // the best current prefixes to create the list of best prefixes at the next |
| // character position. |
| namespace tesseract { |
| A_CHOICE *Dict::ngram_permute_and_select(CHOICES_LIST char_choices, |
| float rating_limit, |
| const Dawg *dawg) { |
| if (array_count (char_choices) <= MAX_WERD_LENGTH) { |
| CHOICES choices; |
| int char_index_max = array_count(char_choices); |
| HypothesisPrefixList list_1(20); |
| HypothesisPrefixList list_2(20); |
| HypothesisPrefixList* current_list = &list_1; |
| HypothesisPrefixList* next_list = &list_2; |
| HypothesisPrefix* initial_node = new HypothesisPrefix(); |
| current_list->add_node(initial_node); |
| for (int char_index = 0; char_index < char_index_max; ++char_index) { |
| iterate_list(choices, (CHOICES) array_index(char_choices, char_index)) { |
| A_CHOICE* choice = (A_CHOICE *) first_node(choices); |
| for (int node_index = 0; |
| node_index < current_list->size(); |
| ++node_index) { |
| // Append this choice to the current node |
| HypothesisPrefix* new_node = new HypothesisPrefix( |
| current_list->node(node_index), |
| choice, |
| char_index == char_index_max - 1, |
| dawg, this); |
| next_list->add_node(new_node); |
| } |
| } |
| // Clear current list and switch lists |
| current_list->clear(); |
| HypothesisPrefixList* temp_list = current_list; |
| current_list = next_list; |
| next_list = temp_list; |
| |
| // Give up if the current best rating is worse than rating_limit |
| if (current_list->node(0).rating() > rating_limit) |
| return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM); |
| } |
| const HypothesisPrefix& best_word = current_list->node(0); |
| A_CHOICE* best_choice = new_choice (best_word.word() + 1, |
| best_word.unichar_lengths(), |
| best_word.rating(), |
| best_word.certainty(), -1, |
| valid_word(best_word.word() + 1) ? |
| SYSTEM_DAWG_PERM : TOP_CHOICE_PERM); |
| LogNewWordChoice(best_choice, best_word.is_dawg_prefix() ? |
| 1.0 : non_dawg_prefix_rating_adjustment, |
| const_cast<float*>(best_word.certainty_array()), |
| getUnicharset()); |
| return best_choice; |
| } else { |
| return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM); |
| } |
| } |
| } // namespace tesseract |
| |
| double get_classifier_score_ngram_score_ratio(const char* choice) { |
| if (!strcmp(",", choice) || |
| !strcmp(".", choice)) |
| return 1.0; |
| else |
| return classifier_score_ngram_score_ratio; |
| } |
| |
| // Initial HypothesisPrefix constructor used to create the first state of the |
| // search. |
| HypothesisPrefix::HypothesisPrefix() { |
| rating_ = 0; |
| certainty_ = MAXFLOAT; |
| strcpy(word_, " "); |
| unichar_lengths_[0] = '\0'; |
| dawg_node_ = 0; |
| is_dawg_prefix_ = true; |
| } |
| |
| // Main constructor to create a new HypothesisPrefix by appending a character |
| // choice (A_CHOICE) to an existing HypothesisPrefix. This constructor takes |
| // care of copying the original prefix's data members, appends the character |
| // choice to the word and updates its rating using a character-level n-gram |
| // model. The state in the DAWG is also updated. |
| HypothesisPrefix::HypothesisPrefix(const HypothesisPrefix& prefix, |
| A_CHOICE* choice, |
| bool end_of_word, |
| const tesseract::Dawg *dawg, |
| tesseract::Dict* dict) { |
| char* word_ptr = word_; |
| const char* prefix_word_ptr = prefix.word_; |
| |
| // Copy first space character |
| *(word_ptr++) = *(prefix_word_ptr++); |
| |
| // Copy existing word, unichar_lengths, certainty_array |
| int char_index; |
| for (char_index = 0; |
| prefix.unichar_lengths_[char_index] != '\0'; |
| ++char_index) { |
| for (int char_subindex = 0; |
| char_subindex < prefix.unichar_lengths_[char_index]; |
| ++char_subindex) { |
| *(word_ptr++) = *(prefix_word_ptr++); |
| } |
| unichar_lengths_[char_index] = prefix.unichar_lengths_[char_index]; |
| certainty_array_[char_index] = prefix.certainty_array_[char_index]; |
| } |
| |
| // If choice is empty, use a space character instead |
| const char* class_string_choice = *class_string(choice) == '\0' ? |
| " " : class_string(choice); |
| |
| // Update certainty |
| certainty_ = MIN(prefix.certainty_, class_certainty(choice)); |
| |
| // Apprend choice to the word |
| strcpy(word_ptr, class_string_choice); |
| unichar_lengths_[char_index] = strlen(class_string_choice); |
| unichar_lengths_[char_index + 1] = '\0'; |
| |
| // Append choice certainty to the certainty array |
| certainty_array_[char_index] = class_certainty(choice); |
| |
| // Copy DAWG node state |
| dawg_node_ = prefix.dawg_node_; |
| is_dawg_prefix_ = prefix.is_dawg_prefix_; |
| |
| // Verify DAWG and update dawg_node_ if the current prefix is already valid |
| if (is_dawg_prefix_) { |
| for (int char_subindex = 0; |
| class_string_choice[char_subindex] != '\0'; |
| ++char_subindex) { |
| |
| // TODO(daria): update this code (and the rest of ngram permuter code |
| // to deal with unichar ids, make use of the new parallel dawg search |
| // and use WERD_CHOICE, BLOB_CHOICE_LIST_VECTOR instead of the deprecated |
| // A_CHOICE. |
| tprintf("Error: ngram permuter functionality is not available\n"); |
| exit(1); |
| |
| // Verify each byte of the appended character. Note that word_ptr points |
| // to the first byte so (word_ptr - (word_ + 1)) is the index of the first |
| // new byte in the string that starts at (word_ + 1). |
| /* |
| int current_byte_index = word_ptr - (word_ + 1) + char_subindex; |
| if (!(dict->*dict->letter_is_okay_)( |
| dawg, &dawg_node_, current_byte_index, word_ + 1, |
| end_of_word && class_string_choice[char_subindex + 1] == '\0')) { |
| dawg_node_ = NO_EDGE; |
| is_dawg_prefix_ = false; |
| break; |
| } |
| */ |
| } |
| } |
| |
| // Copy the prefix rating |
| rating_ = prefix.rating_; |
| |
| // Compute rating of current character |
| double probability = probability_in_context(prefix.word_, -1, |
| class_string_choice, -1); |
| |
| // If last character of the word, take the following space into account |
| if (end_of_word) |
| probability *= probability_in_context(word_, -1, " ", -1); |
| |
| double local_classifier_score_ngram_score_ratio = |
| get_classifier_score_ngram_score_ratio(class_string_choice); |
| |
| double classifier_rating = class_rating(choice); |
| double ngram_rating = -log(probability) / log(2.0); |
| double mixed_rating = |
| local_classifier_score_ngram_score_ratio * classifier_rating + |
| (1 - local_classifier_score_ngram_score_ratio) * ngram_rating; |
| |
| // If the current word is not a valid prefix, adjust the rating of the |
| // character being appended. If it used to be a valid prefix, compensate for |
| // previous adjustments. |
| if (!is_dawg_prefix_) { |
| if (prefix.is_dawg_prefix_) |
| rating_ *= non_dawg_prefix_rating_adjustment; |
| mixed_rating *= non_dawg_prefix_rating_adjustment; |
| } |
| |
| // Update rating by adding the rating of the character being appended. |
| rating_ += mixed_rating; |
| } |
| |
| // Create an empty HypothesisPrefixList. Its maximum size is set to the given |
| // bound. |
| HypothesisPrefixList::HypothesisPrefixList(int size_bound): |
| _size_bound(size_bound), |
| _size(0) { |
| _list_nodes = new HypothesisPrefix*[_size_bound]; |
| for (int i = 0; i < _size_bound; ++i) |
| _list_nodes[i] = NULL; |
| } |
| |
| // Destroy a HypothesisPrefixList all contained nodes are deleted as well. |
| HypothesisPrefixList::~HypothesisPrefixList() { |
| this->clear(); |
| delete[] _list_nodes; |
| } |
| |
| // Add a node to the HypothesisPrefixList. Maintains the sorted list property. |
| // Note that the HypothesisPrefixList takes ownership of the given node and |
| // might delete it if needed. It must therefore have been allocated on the heap. |
| void HypothesisPrefixList::add_node(HypothesisPrefix* node) { |
| // Detect nodes that have a worst rating that the current maximum and treat |
| // them separately. |
| if (_size > 0 && _list_nodes[_size - 1]->rating() < node->rating()) { |
| if (_size == _size_bound) { |
| // The list is already full. This node will not be added |
| delete node; |
| } else { |
| // The list is not full. Add the node at the last position. |
| _list_nodes[_size] = node; |
| ++_size; |
| } |
| return; |
| } |
| // Find the correct position |
| int node_index_target = 0; |
| while (node_index_target < _size_bound && |
| _list_nodes[node_index_target] != NULL && |
| _list_nodes[node_index_target]->rating() < node->rating()) { |
| ++node_index_target; |
| } |
| if (node_index_target >= _size_bound) { |
| delete node; |
| } else { |
| // Move next states by 1. Starting from the last one. |
| int node_index_move = _size - 1; |
| while (node_index_move >= node_index_target) { |
| if (node_index_move == _size_bound - 1) |
| delete _list_nodes[node_index_move]; |
| else |
| _list_nodes[node_index_move + 1] = _list_nodes[node_index_move]; |
| _list_nodes[node_index_move] = NULL; |
| --node_index_move; |
| } |
| // Insert new node |
| _list_nodes[node_index_target] = node; |
| // Increment size if it has changed |
| if (_size < _size_bound) |
| ++_size; |
| } |
| } |
| |
| // Delete all contained nodes and set the size of the HypothesisPrefixList to 0 |
| void HypothesisPrefixList::clear() { |
| for (int i = 0; i < _size_bound && _list_nodes[i] != NULL; ++i) { |
| delete _list_nodes[i]; |
| _list_nodes[i] = NULL; |
| } |
| _size = 0; |
| } |