ccmain/ambigsrecog.cpp - platform/external/tesseract - Git at Google

 ///////////////////////////////////////////////////////////////////////
 // File:        ambigsrecog.cpp
 // Description: Functions for producing classifications
 //              for the input to ambigstraining.
 // Author:      Daria Antonova
 // Created:     Mon Jun 23 11:26:43 PDT 2008
 //
 // (C) Copyright 2007, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 ///////////////////////////////////////////////////////////////////////

 #include "ambigs.h"

 #include "applybox.h"
 #include "boxread.h"
 #include "control.h"
 #include "permute.h"
 #include "ratngs.h"
 #include "reject.h"
 #include "stopper.h"
 #include "tesseractclass.h"

 namespace tesseract {

 // Prepares this Tesseract instance for ambigs training and opens the file
 // that the training output is written to.
 //
 // fname is the input file name. The output file name is formed by cutting
 // fname at its last '.' (if there is one) and appending ".txt". The file is
 // opened in "a+" mode, so existing contents are kept and new output is
 // appended.
 //
 // Returns the opened FILE pointer; this function does not close it. A failed
 // open is reported through CANTOPENFILE.error() with the EXIT action.
 FILE *Tesseract::init_ambigs_training(const STRING &fname) {
   permute_only_top = 1;                        // use only top choice permuter
   tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
   tessedit_ok_mode.set_value(0);               // turn off context checking
   tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
   save_best_choices.set_value(1);              // save individual char choices
   stopper_no_acceptable_choices.set_value(1);  // explore all segmentations
   save_raw_choices.set_value(1);               // save raw choices

   // Build the output file name: <fname without extension>.txt
   // NOTE: strrchr() searches the whole string, so a '.' in a directory
   // component is matched when the file name itself contains none.
   STRING output_fname = fname;
   const char *lastdot = strrchr(output_fname.string(), '.');
   if (lastdot != NULL) {
     output_fname[lastdot - output_fname.string()] = '\0';
   }
   output_fname += ".txt";

   // Open ambigs output file.
   FILE *output_file = fopen(output_fname.string(), "a+");
   if (output_file == NULL) {
     // This is the training output file, not the box file, so say so.
     CANTOPENFILE.error("ambigs_training", EXIT,
                        "Can't open output file %s\n", output_fname.string());
   }
   return output_file;
 }

 // This function takes tif/box pair of files and runs recognition on the image,
 // while making sure that the word bounds that tesseract identified roughly
 // match to those specified by the input box file. For each word (ngram in a
 // single bounding box from the input box file) it outputs the ocred result,
 // the correct label, rating and certainty.
 void Tesseract::ambigs_training_segmented(const STRING &fname,
                                           PAGE_RES *page_res,
                                           volatile ETEXT_DESC *monitor,
                                           FILE *output_file) {
   STRING box_fname = fname;
   const char *lastdot = strrchr(box_fname.string(), '.');
   if (lastdot != NULL) {
     box_fname[lastdot - box_fname.string()] = '\0';
   }
   box_fname += ".box";
   FILE *box_file;
   if (!(box_file = fopen(box_fname.string(), "r"))) {
     CANTOPENFILE.error("ambigs_training", EXIT,
                        "Can't open box file %s\n", box_fname.string());
   }

   static PAGE_RES_IT page_res_it;
   page_res_it.page_res = page_res;
   page_res_it.restart_page();
   int x_min, y_min, x_max, y_max;
   char label[UNICHAR_LEN * 10];

   // Process all the words on this page.
   while (page_res_it.word() != NULL &&
          read_next_box(applybox_page, box_file, label,
                        &x_min, &y_min, &x_max, &y_max)) {
     // Init bounding box of the current word bounding box and from box file.
     TBOX box = TBOX(ICOORD(x_min, y_min), ICOORD(x_max, y_max));
     TBOX word_box(page_res_it.word()->word->bounding_box());
     bool one_word = true;
     // Check whether the bounding box of the next word overlaps with the
     // current box from box file.
     while (page_res_it.next_word() != NULL &&
            box.x_overlap(page_res_it.next_word()->word->bounding_box())) {
       word_box = word_box.bounding_union(
           page_res_it.next_word()->word->bounding_box());
       page_res_it.forward();
       one_word = false;
     }
     if (!word_box.major_overlap(box)) {
       if (!word_box.x_overlap(box)) {
         // We must be looking at the word that belongs in the "next" bounding
         // box from the box file. The ngram that was supposed to appear in
         // the current box read from the box file must have been dropped by
         // tesseract as noise.
         tprintf("Word %s was dropped as noise.\n", label);
         continue;  // stay on this blob, but read next box from box file
       } else {
         tprintf("Error: Insufficient overlap for word box"
                 " and box from file for %s\n", label);
         word_box.print();
         box.print();
         exit(1);
       }
     }
     // Skip recognizing the ngram if tesseract is sure it's not
     // one word, otherwise run one recognition pass on this word.
     if (!one_word) {
       tprintf("Tesseract segmented %s as multiple words\n", label);
     } else {
       ambigs_classify_and_output(&page_res_it, label, output_file);
     }
     page_res_it.forward();
   }
   fclose(box_file);
 }

 // Runs classify_word_pass1() on the current word of page_res_it and writes
 // the outcome to output_file. If the label is a single unichar and the best
 // choice holds exactly one blob choice list, every entry of that list with a
 // valid unichar id is written as
 //   <unichar> TAB <label> TAB <rating> TAB <certainty>
 // Afterwards the raw choice is output through
 // getDict().PrintAmbigAlternatives(). Nothing is written when the label
 // contains a unichar for which the unicharset reports a step of 0.
 void Tesseract::ambigs_classify_and_output(PAGE_RES_IT *page_res_it,
                                            const char *label,
                                            FILE *output_file) {
   // Classify word.
   classify_word_pass1(page_res_it->word(), page_res_it->row()->row,
                       page_res_it->block()->block,
                       FALSE, NULL, NULL);
   WERD_CHOICE *best_choice = page_res_it->word()->best_choice;
   ASSERT_HOST(best_choice != NULL);
   ASSERT_HOST(best_choice->blob_choices() != NULL);

   // Count the unichars in the label by stepping through it one unichar at a
   // time. A step of 0 marks an illegal unichar and ends the walk.
   int label_num_unichars = 0;
   int step = 1;  // must be positive so that the first iteration runs
   int offset = 0;
   while (label[offset] != '\0' && step > 0) {
     step = getDict().getUnicharset().step(label + offset);
     offset += step;
     ++label_num_unichars;
   }
   if (step == 0) {
     tprintf("Not outputting illegal unichar %s\n", label);
     return;
   }

   // Output all classifier choices for the unigrams (1-1 classifications).
   const bool is_unigram =
       label_num_unichars == 1 && best_choice->blob_choices()->length() == 1;
   if (is_unigram) {
     BLOB_CHOICE_LIST_C_IT list_it;
     list_it.set_to_list(best_choice->blob_choices());
     BLOB_CHOICE_IT choice_it;
     choice_it.set_to_list(list_it.data());
     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
          choice_it.forward()) {
       BLOB_CHOICE *choice = choice_it.data();
       if (choice->unichar_id() == INVALID_UNICHAR_ID) {
         continue;  // skip entries without a valid unichar
       }
       fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
               unicharset.id_to_unichar(choice->unichar_id()),
               label, choice->rating(), choice->certainty());
     }
   }

   // Output the raw choice for successful non 1-1 classifications.
   getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
 }

 }  // namespace tesseract
	///////////////////////////////////////////////////////////////////////
	// File: ambigsrecog.cpp
	// Description: Functions for producing classifications
	// for the input to ambigstraining.
	// Author: Daria Antonova
	// Created: Mon Jun 23 11:26:43 PDT 2008
	//
	// (C) Copyright 2007, Google Inc.
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	// http://www.apache.org/licenses/LICENSE-2.0
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	///////////////////////////////////////////////////////////////////////

	#include "ambigs.h"

	#include "applybox.h"
	#include "boxread.h"
	#include "control.h"
	#include "permute.h"
	#include "ratngs.h"
	#include "reject.h"
	#include "stopper.h"
	#include "tesseractclass.h"

	namespace tesseract {

	// Prepares this Tesseract instance for ambigs training and opens the file
	// that the training output is written to.
	//
	// fname is the input file name. The output file name is formed by cutting
	// fname at its last '.' (if there is one) and appending ".txt". The file is
	// opened in "a+" mode, so existing contents are kept and new output is
	// appended.
	//
	// Returns the opened FILE pointer; this function does not close it. A failed
	// open is reported through CANTOPENFILE.error() with the EXIT action.
	FILE *Tesseract::init_ambigs_training(const STRING &fname) {
	  permute_only_top = 1;                        // use only top choice permuter
	  tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
	  tessedit_ok_mode.set_value(0);               // turn off context checking
	  tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
	  save_best_choices.set_value(1);              // save individual char choices
	  stopper_no_acceptable_choices.set_value(1);  // explore all segmentations
	  save_raw_choices.set_value(1);               // save raw choices

	  // Build the output file name: <fname without extension>.txt
	  // NOTE: strrchr() searches the whole string, so a '.' in a directory
	  // component is matched when the file name itself contains none.
	  STRING output_fname = fname;
	  const char *lastdot = strrchr(output_fname.string(), '.');
	  if (lastdot != NULL) {
	    output_fname[lastdot - output_fname.string()] = '\0';
	  }
	  output_fname += ".txt";

	  // Open ambigs output file.
	  FILE *output_file = fopen(output_fname.string(), "a+");
	  if (output_file == NULL) {
	    // This is the training output file, not the box file, so say so.
	    CANTOPENFILE.error("ambigs_training", EXIT,
	                       "Can't open output file %s\n", output_fname.string());
	  }
	  return output_file;
	}

	// This function takes tif/box pair of files and runs recognition on the image,
	// while making sure that the word bounds that tesseract identified roughly
	// match to those specified by the input box file. For each word (ngram in a
	// single bounding box from the input box file) it outputs the ocred result,
	// the correct label, rating and certainty.
	void Tesseract::ambigs_training_segmented(const STRING &fname,
	PAGE_RES *page_res,
	volatile ETEXT_DESC *monitor,
	FILE *output_file) {
	STRING box_fname = fname;
	const char *lastdot = strrchr(box_fname.string(), '.');
	if (lastdot != NULL) {
	box_fname[lastdot - box_fname.string()] = '\0';
	}
	box_fname += ".box";
	FILE *box_file;
	if (!(box_file = fopen(box_fname.string(), "r"))) {
	CANTOPENFILE.error("ambigs_training", EXIT,
	"Can't open box file %s\n", box_fname.string());
	}

	static PAGE_RES_IT page_res_it;
	page_res_it.page_res = page_res;
	page_res_it.restart_page();
	int x_min, y_min, x_max, y_max;
	char label[UNICHAR_LEN * 10];

	// Process all the words on this page.
	while (page_res_it.word() != NULL &&
	read_next_box(applybox_page, box_file, label,
	&x_min, &y_min, &x_max, &y_max)) {
	// Init bounding box of the current word bounding box and from box file.
	TBOX box = TBOX(ICOORD(x_min, y_min), ICOORD(x_max, y_max));
	TBOX word_box(page_res_it.word()->word->bounding_box());
	bool one_word = true;
	// Check whether the bounding box of the next word overlaps with the
	// current box from box file.
	while (page_res_it.next_word() != NULL &&
	box.x_overlap(page_res_it.next_word()->word->bounding_box())) {
	word_box = word_box.bounding_union(
	page_res_it.next_word()->word->bounding_box());
	page_res_it.forward();
	one_word = false;
	}
	if (!word_box.major_overlap(box)) {
	if (!word_box.x_overlap(box)) {
	// We must be looking at the word that belongs in the "next" bounding
	// box from the box file. The ngram that was supposed to appear in
	// the current box read from the box file must have been dropped by
	// tesseract as noise.
	tprintf("Word %s was dropped as noise.\n", label);
	continue; // stay on this blob, but read next box from box file
	} else {
	tprintf("Error: Insufficient overlap for word box"
	" and box from file for %s\n", label);
	word_box.print();
	box.print();
	exit(1);
	}
	}
	// Skip recognizing the ngram if tesseract is sure it's not
	// one word, otherwise run one recognition pass on this word.
	if (!one_word) {
	tprintf("Tesseract segmented %s as multiple words\n", label);
	} else {
	ambigs_classify_and_output(&page_res_it, label, output_file);
	}
	page_res_it.forward();
	}
	fclose(box_file);
	}

	// Runs classify_word_pass1() on the current word of page_res_it and writes
	// the outcome to output_file. If the label is a single unichar and the best
	// choice holds exactly one blob choice list, every entry of that list with a
	// valid unichar id is written as
	//   <unichar> TAB <label> TAB <rating> TAB <certainty>
	// Afterwards the raw choice is output through
	// getDict().PrintAmbigAlternatives(). Nothing is written when the label
	// contains a unichar for which the unicharset reports a step of 0.
	void Tesseract::ambigs_classify_and_output(PAGE_RES_IT *page_res_it,
	                                           const char *label,
	                                           FILE *output_file) {
	  // Classify word.
	  classify_word_pass1(page_res_it->word(), page_res_it->row()->row,
	                      page_res_it->block()->block,
	                      FALSE, NULL, NULL);
	  WERD_CHOICE *best_choice = page_res_it->word()->best_choice;
	  ASSERT_HOST(best_choice != NULL);
	  ASSERT_HOST(best_choice->blob_choices() != NULL);

	  // Count the unichars in the label by stepping through it one unichar at a
	  // time. A step of 0 marks an illegal unichar and ends the walk.
	  int label_num_unichars = 0;
	  int step = 1;  // must be positive so that the first iteration runs
	  int offset = 0;
	  while (label[offset] != '\0' && step > 0) {
	    step = getDict().getUnicharset().step(label + offset);
	    offset += step;
	    ++label_num_unichars;
	  }
	  if (step == 0) {
	    tprintf("Not outputting illegal unichar %s\n", label);
	    return;
	  }

	  // Output all classifier choices for the unigrams (1-1 classifications).
	  const bool is_unigram =
	      label_num_unichars == 1 && best_choice->blob_choices()->length() == 1;
	  if (is_unigram) {
	    BLOB_CHOICE_LIST_C_IT list_it;
	    list_it.set_to_list(best_choice->blob_choices());
	    BLOB_CHOICE_IT choice_it;
	    choice_it.set_to_list(list_it.data());
	    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
	         choice_it.forward()) {
	      BLOB_CHOICE *choice = choice_it.data();
	      if (choice->unichar_id() == INVALID_UNICHAR_ID) {
	        continue;  // skip entries without a valid unichar
	      }
	      fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
	              unicharset.id_to_unichar(choice->unichar_id()),
	              label, choice->rating(), choice->certainty());
	    }
	  }

	  // Output the raw choice for successful non 1-1 classifications.
	  getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
	}

	} // namespace tesseract