// dict/dict.cpp - platform/external/tesseract - Git at Google

 ///////////////////////////////////////////////////////////////////////
 // File:        dict.cpp
 // Description: dict class.
 // Author:      Samuel Charron
 //
 // (C) Copyright 2006, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 ///////////////////////////////////////////////////////////////////////

 #include "dict.h"

 namespace tesseract {

 class Image;

 // Constructs a Dict for the given Image pointer. letter_is_okay_ is pointed
 // at def_letter_is_okay and image_ptr_ stores image_ptr; every other member
 // assigned here is reset to a NULL / NIL / false / zero / invalid value.
 Dict::Dict(Image* image_ptr)
     : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
       image_ptr_(image_ptr) {
   // Members that start out as NULL.
   dang_ambigs_table_ = NULL;
   replace_ambigs_table_ = NULL;
   best_raw_choice_ = NULL;
   go_deeper_fxn_ = NULL;
   hyphen_word_ = NULL;
   document_words_ = NULL;
   pending_words_ = NULL;
   freq_dawg_ = NULL;

   // Members that start out as NIL.
   best_choices_ = NIL;
   raw_choices_ = NIL;

   // Flags and scalar values.
   keep_word_choices_ = false;
   last_word_on_line_ = false;
   reject_offset_ = 0.0;
   hyphen_unichar_id_ = INVALID_UNICHAR_ID;
 }

 // Deletes the object held in hyphen_word_, if there is one.
 Dict::~Dict() {
   if (hyphen_word_ == NULL) return;  // nothing to release
   delete hyphen_word_;
 }

 // Returns true if in light of the current state the letter at word_index
 // in the given word is allowed according to at least one of the dawgs in
 // dawgs_.
 //
 // See more extensive comments in dict.h where this function is declared.
 //
 int Dict::def_letter_is_okay(void* void_dawg_args, int word_index,
                              const void *void_word, bool word_end) {
   DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
   const WERD_CHOICE *word = reinterpret_cast<const WERD_CHOICE*>(void_word);

   if (dawg_debug_level >= 3) {
     tprintf("def_letter_is_okay: word_index=%d word_end=%d"
             " word=%s num active dawgs=%d num constraints=%d\n",
             word_index, word_end,
             word->debug_string(getUnicharset()).string(),
             dawg_args->active_dawgs->length(),
             dawg_args->constraints->length());
   }

   // Do not accept words that contain kPatternUnicharID.
   // (otherwise pattern dawgs would not function correctly).
   // Do not accept words containing INVALID_UNICHAR_IDs.
   UNICHAR_ID unichar_id = word->unichar_id(word_index);
   if (unichar_id == Dawg::kPatternUnicharID ||
       unichar_id == INVALID_UNICHAR_ID) {
     dawg_args->permuter = NO_PERM;
     return NO_PERM;
   }

   // Initialization.
   PermuterType current_permuter = NO_PERM;
   dawg_args->updated_active_dawgs->clear();
   const DawgInfoVector &constraints = *(dawg_args->constraints);
   *dawg_args->updated_constraints = constraints;

   // Go over the active_dawgs vector and insert DawgInfo records with the
   // updated ref (an edge with the corresponding unichar id) into
   // dawg_args->updated_active_dawgs.
   for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
     const DawgInfo &info = (*dawg_args->active_dawgs)[a];
     const Dawg *dawg = dawgs_[info.dawg_index];
     // Obtain unichar_id at this position (could be changed later, so this
     // needs to be inside the loop over all active dawgs).
      unichar_id = word->unichar_id(word_index);
     // The number dawg generalizes all digits to be kPatternUnicharID,
     // so try to match kPatternUnicharID if the current unichar is a digit.
     if (dawg->type() == DAWG_TYPE_NUMBER &&
         getUnicharset().get_isdigit(unichar_id)) {
       unichar_id = Dawg::kPatternUnicharID;
     }
     // Get the starting node for this letter.
     NODE_REF node;
     if (info.ref == NO_EDGE) {
       node = 0;  // beginning to explore this dawg
     } else {
       node = dawg->next_node(info.ref);
       if (node == 0) node = NO_EDGE;  // end of word
     }
     // Find the edge out of the node for the curent unichar_id.
     EDGE_REF edge = (node != NO_EDGE) ?
       dawg->edge_char_of(node, unichar_id, word_end) : NO_EDGE;

     if (dawg_debug_level >= 3) {
       tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
               info.dawg_index, node, edge);
     }

     if (edge != NO_EDGE) {  // the unichar was found in the current dawg
       if (ConstraintsOk(*(dawg_args->updated_constraints),
                         word_end, dawg->type())) {
         UpdatePermuter(dawg->permuter(), &current_permuter);
         dawg_args->updated_active_dawgs->add_unique(
             DawgInfo(info.dawg_index, edge),
             "Append current dawg to updated active dawgs: ");
       }
     } else {                // the unichar was not found in the current dawg
       // Handle leading/trailing punctuation dawgs that denote a word pattern
       // as an edge with kPatternUnicharID. If such an edge is found we add a
       // constraint denoting the state of the dawg before the word pattern.
       // This constraint will be applied later when this dawg is found among
       // successor dawgs as well potentially at the end of the word.
       if (dawg->type() == DAWG_TYPE_PUNCTUATION) {
         edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end);
         if (edge != NO_EDGE) {
           dawg_args->updated_constraints->add_unique(
               DawgInfo(info.dawg_index, edge), "Recording constraint: ");
         } else {
           // Do not explore successors of this dawg, since this
           // must be invalid leading or trailing punctuation.
           if (dawg_debug_level >= 3) {
             tprintf("Invalid punctuation from dawg %d\n", info.dawg_index);
           }
           continue;
         }
       }

       if (info.ref == NO_EDGE) {
         if (dawg_debug_level >= 3) {
           tprintf("No letters matched in dawg %d\n", info.dawg_index);
         }
         continue;
       }

       // Discard the dawg if the pattern can not end at previous letter.
       if (edge == NO_EDGE &&  // previous part is not leading punctuation
           !dawg->end_of_word(info.ref)) {
         if (dawg_debug_level >= 3) {
           tprintf("No valid pattern end in dawg %d\n", info.dawg_index);
         }
         continue;
       }

       // Look for the unichar in each of this dawg's successors
       // and append those in which it is found to active_dawgs.
       const SuccessorList &slist = *(successors_[info.dawg_index]);
       for (int s = 0; s < slist.length(); ++s) {
         int sdawg_index = slist[s];
         const Dawg *sdawg = dawgs_[sdawg_index];
         NODE_REF snode = 0;
         // Apply constraints to the successor dawg.
         for (int c = 0; c < constraints.length(); ++c) {
           // If the successor dawg is described in the constraints change
           // the start ref from 0 to the one recorded as the constraint.
           const DawgInfo &cinfo = constraints[c];
           if (cinfo.dawg_index == sdawg_index) {
             snode = sdawg->next_node(cinfo.ref);
             // Make sure we do not search the successor dawg if after
             // applying the saved constraint we are at the end of the word.
             if (snode == 0) snode = NO_EDGE;
             if (dawg_debug_level >= 3) {
                tprintf("Applying constraint [%d, " REFFORMAT "]\n",
                        sdawg_index, snode);
             }
           }
         }
         // Look for the letter in this successor dawg.
         EDGE_REF sedge = sdawg->edge_char_of(
             snode, word->unichar_id(word_index), word_end);
         // If we found the letter append sdawg to the active_dawgs list.
         if (sedge != NO_EDGE &&
             ConstraintsOk(*(dawg_args->updated_constraints), word_end,
                           dawgs_[sdawg_index]->type())) {
           UpdatePermuter(sdawg->permuter(), &current_permuter);
           if (sdawg->next_node(sedge) != 0) {  // if not word end
             dawg_args->updated_active_dawgs->add_unique(
               DawgInfo(sdawg_index, sedge),
               "Append successor to updated active dawgs: ");
           }
         }
       }  // end successors loop
     }  // end if/else
   }  // end for
   // Update dawg_args->permuter if it used to be NO_PERM or if we found
   // the current letter in a non-punctuation dawg. This allows preserving
   // information on which dawg the "core" word came from.
   if ((current_permuter == PUNC_PERM &&
        current_permuter > dawg_args->permuter) ||
       current_permuter != PUNC_PERM) {
     dawg_args->permuter = current_permuter;
   }
   return dawg_args->permuter;
 }

 }  // namespace tesseract
	///////////////////////////////////////////////////////////////////////
	// File: dict.cpp
	// Description: dict class.
	// Author: Samuel Charron
	//
	// (C) Copyright 2006, Google Inc.
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	// http://www.apache.org/licenses/LICENSE-2.0
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	///////////////////////////////////////////////////////////////////////

	#include "dict.h"

	namespace tesseract {

	class Image;

	// Builds a Dict around image_ptr: letter_is_okay_ is set to
	// def_letter_is_okay, image_ptr_ keeps the pointer, and the remaining
	// members assigned below get their NULL / NIL / false / zero defaults.
	Dict::Dict(Image* image_ptr)
	    : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
	      image_ptr_(image_ptr) {
	  // Boolean flags.
	  keep_word_choices_ = false;
	  last_word_on_line_ = false;

	  // Numeric and id values.
	  reject_offset_ = 0.0;
	  hyphen_unichar_id_ = INVALID_UNICHAR_ID;

	  // Members reset to NIL.
	  best_choices_ = NIL;
	  raw_choices_ = NIL;

	  // Members reset to NULL.
	  dang_ambigs_table_ = NULL;
	  replace_ambigs_table_ = NULL;
	  best_raw_choice_ = NULL;
	  go_deeper_fxn_ = NULL;
	  hyphen_word_ = NULL;
	  document_words_ = NULL;
	  pending_words_ = NULL;
	  freq_dawg_ = NULL;
	}

	// Destroys the Dict, deleting hyphen_word_ when it is not NULL.
	Dict::~Dict() {
	  if (hyphen_word_ != NULL) {
	    delete hyphen_word_;
	  }
	}

	// Returns true if in light of the current state the letter at word_index
	// in the given word is allowed according to at least one of the dawgs in
	// dawgs_.
	//
	// See more extensive comments in dict.h where this function is declared.
	//
	int Dict::def_letter_is_okay(void* void_dawg_args, int word_index,
	const void *void_word, bool word_end) {
	DawgArgs dawg_args = reinterpret_cast<DawgArgs>(void_dawg_args);
	const WERD_CHOICE word = reinterpret_cast<const WERD_CHOICE>(void_word);

	if (dawg_debug_level >= 3) {
	tprintf("def_letter_is_okay: word_index=%d word_end=%d"
	" word=%s num active dawgs=%d num constraints=%d\n",
	word_index, word_end,
	word->debug_string(getUnicharset()).string(),
	dawg_args->active_dawgs->length(),
	dawg_args->constraints->length());
	}

	// Do not accept words that contain kPatternUnicharID.
	// (otherwise pattern dawgs would not function correctly).
	// Do not accept words containing INVALID_UNICHAR_IDs.
	UNICHAR_ID unichar_id = word->unichar_id(word_index);
	if (unichar_id == Dawg::kPatternUnicharID \|\|
	unichar_id == INVALID_UNICHAR_ID) {
	dawg_args->permuter = NO_PERM;
	return NO_PERM;
	}

	// Initialization.
	PermuterType current_permuter = NO_PERM;
	dawg_args->updated_active_dawgs->clear();
	const DawgInfoVector &constraints = *(dawg_args->constraints);
	*dawg_args->updated_constraints = constraints;

	// Go over the active_dawgs vector and insert DawgInfo records with the
	// updated ref (an edge with the corresponding unichar id) into
	// dawg_args->updated_active_dawgs.
	for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
	const DawgInfo &info = (*dawg_args->active_dawgs)[a];
	const Dawg *dawg = dawgs_[info.dawg_index];
	// Obtain unichar_id at this position (could be changed later, so this
	// needs to be inside the loop over all active dawgs).
	unichar_id = word->unichar_id(word_index);
	// The number dawg generalizes all digits to be kPatternUnicharID,
	// so try to match kPatternUnicharID if the current unichar is a digit.
	if (dawg->type() == DAWG_TYPE_NUMBER &&
	getUnicharset().get_isdigit(unichar_id)) {
	unichar_id = Dawg::kPatternUnicharID;
	}
	// Get the starting node for this letter.
	NODE_REF node;
	if (info.ref == NO_EDGE) {
	node = 0; // beginning to explore this dawg
	} else {
	node = dawg->next_node(info.ref);
	if (node == 0) node = NO_EDGE; // end of word
	}
	// Find the edge out of the node for the curent unichar_id.
	EDGE_REF edge = (node != NO_EDGE) ?
	dawg->edge_char_of(node, unichar_id, word_end) : NO_EDGE;

	if (dawg_debug_level >= 3) {
	tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
	info.dawg_index, node, edge);
	}

	if (edge != NO_EDGE) { // the unichar was found in the current dawg
	if (ConstraintsOk(*(dawg_args->updated_constraints),
	word_end, dawg->type())) {
	UpdatePermuter(dawg->permuter(), &current_permuter);
	dawg_args->updated_active_dawgs->add_unique(
	DawgInfo(info.dawg_index, edge),
	"Append current dawg to updated active dawgs: ");
	}
	} else { // the unichar was not found in the current dawg
	// Handle leading/trailing punctuation dawgs that denote a word pattern
	// as an edge with kPatternUnicharID. If such an edge is found we add a
	// constraint denoting the state of the dawg before the word pattern.
	// This constraint will be applied later when this dawg is found among
	// successor dawgs as well potentially at the end of the word.
	if (dawg->type() == DAWG_TYPE_PUNCTUATION) {
	edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end);
	if (edge != NO_EDGE) {
	dawg_args->updated_constraints->add_unique(
	DawgInfo(info.dawg_index, edge), "Recording constraint: ");
	} else {
	// Do not explore successors of this dawg, since this
	// must be invalid leading or trailing punctuation.
	if (dawg_debug_level >= 3) {
	tprintf("Invalid punctuation from dawg %d\n", info.dawg_index);
	}
	continue;
	}
	}

	if (info.ref == NO_EDGE) {
	if (dawg_debug_level >= 3) {
	tprintf("No letters matched in dawg %d\n", info.dawg_index);
	}
	continue;
	}

	// Discard the dawg if the pattern can not end at previous letter.
	if (edge == NO_EDGE && // previous part is not leading punctuation
	!dawg->end_of_word(info.ref)) {
	if (dawg_debug_level >= 3) {
	tprintf("No valid pattern end in dawg %d\n", info.dawg_index);
	}
	continue;
	}

	// Look for the unichar in each of this dawg's successors
	// and append those in which it is found to active_dawgs.
	const SuccessorList &slist = *(successors_[info.dawg_index]);
	for (int s = 0; s < slist.length(); ++s) {
	int sdawg_index = slist[s];
	const Dawg *sdawg = dawgs_[sdawg_index];
	NODE_REF snode = 0;
	// Apply constraints to the successor dawg.
	for (int c = 0; c < constraints.length(); ++c) {
	// If the successor dawg is described in the constraints change
	// the start ref from 0 to the one recorded as the constraint.
	const DawgInfo &cinfo = constraints[c];
	if (cinfo.dawg_index == sdawg_index) {
	snode = sdawg->next_node(cinfo.ref);
	// Make sure we do not search the successor dawg if after
	// applying the saved constraint we are at the end of the word.
	if (snode == 0) snode = NO_EDGE;
	if (dawg_debug_level >= 3) {
	tprintf("Applying constraint [%d, " REFFORMAT "]\n",
	sdawg_index, snode);
	}
	}
	}
	// Look for the letter in this successor dawg.
	EDGE_REF sedge = sdawg->edge_char_of(
	snode, word->unichar_id(word_index), word_end);
	// If we found the letter append sdawg to the active_dawgs list.
	if (sedge != NO_EDGE &&
	ConstraintsOk(*(dawg_args->updated_constraints), word_end,
	dawgs_[sdawg_index]->type())) {
	UpdatePermuter(sdawg->permuter(), &current_permuter);
	if (sdawg->next_node(sedge) != 0) { // if not word end
	dawg_args->updated_active_dawgs->add_unique(
	DawgInfo(sdawg_index, sedge),
	"Append successor to updated active dawgs: ");
	}
	}
	} // end successors loop
	} // end if/else
	} // end for
	// Update dawg_args->permuter if it used to be NO_PERM or if we found
	// the current letter in a non-punctuation dawg. This allows preserving
	// information on which dawg the "core" word came from.
	if ((current_permuter == PUNC_PERM &&
	current_permuter > dawg_args->permuter) \|\|
	current_permuter != PUNC_PERM) {
	dawg_args->permuter = current_permuter;
	}
	return dawg_args->permuter;
	}

	} // namespace tesseract