| /////////////////////////////////////////////////////////////////////// |
| // File: dict.cpp |
| // Description: dict class. |
| // Author: Samuel Charron |
| // |
| // (C) Copyright 2006, Google Inc. |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| /////////////////////////////////////////////////////////////////////// |
| |
| #include "dict.h" |
| |
| namespace tesseract { |
| |
| class Image; |
| |
| Dict::Dict(Image* image_ptr) |
| : letter_is_okay_(&tesseract::Dict::def_letter_is_okay), |
| image_ptr_(image_ptr) { |
| dang_ambigs_table_ = NULL; |
| replace_ambigs_table_ = NULL; |
| keep_word_choices_ = false; |
| reject_offset_ = 0.0; |
| best_raw_choice_ = NULL; |
| best_choices_ = NIL; |
| raw_choices_ = NIL; |
| go_deeper_fxn_ = NULL; |
| hyphen_word_ = NULL; |
| last_word_on_line_ = false; |
| hyphen_unichar_id_ = INVALID_UNICHAR_ID; |
| document_words_ = NULL; |
| pending_words_ = NULL; |
| freq_dawg_ = NULL; |
| } |
| |
| Dict::~Dict() { |
| if (hyphen_word_ != NULL) delete hyphen_word_; |
| } |
| |
| // Returns true if in light of the current state the letter at word_index |
| // in the given word is allowed according to at least one of the dawgs in |
| // dawgs_. |
| // |
| // See more extensive comments in dict.h where this function is declared. |
| // |
| int Dict::def_letter_is_okay(void* void_dawg_args, int word_index, |
| const void *void_word, bool word_end) { |
| DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args); |
| const WERD_CHOICE *word = reinterpret_cast<const WERD_CHOICE*>(void_word); |
| |
| if (dawg_debug_level >= 3) { |
| tprintf("def_letter_is_okay: word_index=%d word_end=%d" |
| " word=%s num active dawgs=%d num constraints=%d\n", |
| word_index, word_end, |
| word->debug_string(getUnicharset()).string(), |
| dawg_args->active_dawgs->length(), |
| dawg_args->constraints->length()); |
| } |
| |
| // Do not accept words that contain kPatternUnicharID. |
| // (otherwise pattern dawgs would not function correctly). |
| // Do not accept words containing INVALID_UNICHAR_IDs. |
| UNICHAR_ID unichar_id = word->unichar_id(word_index); |
| if (unichar_id == Dawg::kPatternUnicharID || |
| unichar_id == INVALID_UNICHAR_ID) { |
| dawg_args->permuter = NO_PERM; |
| return NO_PERM; |
| } |
| |
| // Initialization. |
| PermuterType current_permuter = NO_PERM; |
| dawg_args->updated_active_dawgs->clear(); |
| const DawgInfoVector &constraints = *(dawg_args->constraints); |
| *dawg_args->updated_constraints = constraints; |
| |
| // Go over the active_dawgs vector and insert DawgInfo records with the |
| // updated ref (an edge with the corresponding unichar id) into |
| // dawg_args->updated_active_dawgs. |
| for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) { |
| const DawgInfo &info = (*dawg_args->active_dawgs)[a]; |
| const Dawg *dawg = dawgs_[info.dawg_index]; |
| // Obtain unichar_id at this position (could be changed later, so this |
| // needs to be inside the loop over all active dawgs). |
| unichar_id = word->unichar_id(word_index); |
| // The number dawg generalizes all digits to be kPatternUnicharID, |
| // so try to match kPatternUnicharID if the current unichar is a digit. |
| if (dawg->type() == DAWG_TYPE_NUMBER && |
| getUnicharset().get_isdigit(unichar_id)) { |
| unichar_id = Dawg::kPatternUnicharID; |
| } |
| // Get the starting node for this letter. |
| NODE_REF node; |
| if (info.ref == NO_EDGE) { |
| node = 0; // beginning to explore this dawg |
| } else { |
| node = dawg->next_node(info.ref); |
| if (node == 0) node = NO_EDGE; // end of word |
| } |
| // Find the edge out of the node for the curent unichar_id. |
| EDGE_REF edge = (node != NO_EDGE) ? |
| dawg->edge_char_of(node, unichar_id, word_end) : NO_EDGE; |
| |
| if (dawg_debug_level >= 3) { |
| tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", |
| info.dawg_index, node, edge); |
| } |
| |
| if (edge != NO_EDGE) { // the unichar was found in the current dawg |
| if (ConstraintsOk(*(dawg_args->updated_constraints), |
| word_end, dawg->type())) { |
| UpdatePermuter(dawg->permuter(), ¤t_permuter); |
| dawg_args->updated_active_dawgs->add_unique( |
| DawgInfo(info.dawg_index, edge), |
| "Append current dawg to updated active dawgs: "); |
| } |
| } else { // the unichar was not found in the current dawg |
| // Handle leading/trailing punctuation dawgs that denote a word pattern |
| // as an edge with kPatternUnicharID. If such an edge is found we add a |
| // constraint denoting the state of the dawg before the word pattern. |
| // This constraint will be applied later when this dawg is found among |
| // successor dawgs as well potentially at the end of the word. |
| if (dawg->type() == DAWG_TYPE_PUNCTUATION) { |
| edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end); |
| if (edge != NO_EDGE) { |
| dawg_args->updated_constraints->add_unique( |
| DawgInfo(info.dawg_index, edge), "Recording constraint: "); |
| } else { |
| // Do not explore successors of this dawg, since this |
| // must be invalid leading or trailing punctuation. |
| if (dawg_debug_level >= 3) { |
| tprintf("Invalid punctuation from dawg %d\n", info.dawg_index); |
| } |
| continue; |
| } |
| } |
| |
| if (info.ref == NO_EDGE) { |
| if (dawg_debug_level >= 3) { |
| tprintf("No letters matched in dawg %d\n", info.dawg_index); |
| } |
| continue; |
| } |
| |
| // Discard the dawg if the pattern can not end at previous letter. |
| if (edge == NO_EDGE && // previous part is not leading punctuation |
| !dawg->end_of_word(info.ref)) { |
| if (dawg_debug_level >= 3) { |
| tprintf("No valid pattern end in dawg %d\n", info.dawg_index); |
| } |
| continue; |
| } |
| |
| // Look for the unichar in each of this dawg's successors |
| // and append those in which it is found to active_dawgs. |
| const SuccessorList &slist = *(successors_[info.dawg_index]); |
| for (int s = 0; s < slist.length(); ++s) { |
| int sdawg_index = slist[s]; |
| const Dawg *sdawg = dawgs_[sdawg_index]; |
| NODE_REF snode = 0; |
| // Apply constraints to the successor dawg. |
| for (int c = 0; c < constraints.length(); ++c) { |
| // If the successor dawg is described in the constraints change |
| // the start ref from 0 to the one recorded as the constraint. |
| const DawgInfo &cinfo = constraints[c]; |
| if (cinfo.dawg_index == sdawg_index) { |
| snode = sdawg->next_node(cinfo.ref); |
| // Make sure we do not search the successor dawg if after |
| // applying the saved constraint we are at the end of the word. |
| if (snode == 0) snode = NO_EDGE; |
| if (dawg_debug_level >= 3) { |
| tprintf("Applying constraint [%d, " REFFORMAT "]\n", |
| sdawg_index, snode); |
| } |
| } |
| } |
| // Look for the letter in this successor dawg. |
| EDGE_REF sedge = sdawg->edge_char_of( |
| snode, word->unichar_id(word_index), word_end); |
| // If we found the letter append sdawg to the active_dawgs list. |
| if (sedge != NO_EDGE && |
| ConstraintsOk(*(dawg_args->updated_constraints), word_end, |
| dawgs_[sdawg_index]->type())) { |
| UpdatePermuter(sdawg->permuter(), ¤t_permuter); |
| if (sdawg->next_node(sedge) != 0) { // if not word end |
| dawg_args->updated_active_dawgs->add_unique( |
| DawgInfo(sdawg_index, sedge), |
| "Append successor to updated active dawgs: "); |
| } |
| } |
| } // end successors loop |
| } // end if/else |
| } // end for |
| // Update dawg_args->permuter if it used to be NO_PERM or if we found |
| // the current letter in a non-punctuation dawg. This allows preserving |
| // information on which dawg the "core" word came from. |
| if ((current_permuter == PUNC_PERM && |
| current_permuter > dawg_args->permuter) || |
| current_permuter != PUNC_PERM) { |
| dawg_args->permuter = current_permuter; |
| } |
| return dawg_args->permuter; |
| } |
| |
| } // namespace tesseract |