// ccmain/control.cpp - platform/external/tesseract - Git at Google

 /******************************************************************
  * File:        control.cpp  (Formerly control.c)
  * Description: Module-independent matcher controller.
  * Author:					Ray Smith
  * Created:					Thu Apr 23 11:09:58 BST 1992
  * ReHacked:    Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
  *
  * (C) Copyright 1992, Hewlett-Packard Ltd.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #include          "mfcpch.h"
 #include          "mainblk.h"
 #include          <string.h>
 #include          <math.h>
 #ifdef __UNIX__
 #include          <assert.h>
 #include          <unistd.h>
 #include                    <errno.h>
 #endif
 #include          <ctype.h>
 #include          "ocrclass.h"
 #include          "werdit.h"
 #include          "drawfx.h"
 #include          "tfacep.h"
 #include          "tessbox.h"
 #include          "tessvars.h"
 //#include                                      "fxtop.h"
 #include          "pgedit.h"
 #include          "reject.h"
 #include          "adaptions.h"
 #include          "charcut.h"
 #include          "fixxht.h"
 #include          "fixspace.h"
 #include          "genblob.h"
 #include          "docqual.h"
 #include          "control.h"
 #include          "secname.h"
 #include          "output.h"
 #include          "callcpp.h"
 #include          "notdll.h"
 #include "tordvars.h"
 #include "adaptmatch.h"
 #include "globals.h"
 #include "tesseractclass.h"

 // Tuning constants.
 // NOTE(review): neither macro is referenced in the part of the file visible
 // here; presumably they belong to font / x-height code further down - confirm.
 #define MIN_FONT_ROW_COUNT  8
 #define MAX_XHEIGHT_DIFF  3

 // EXTERN is defined empty, so the EXTERN prefix on the variable declarations
 // below expands to nothing in this file.
 #define EXTERN
 //extern "C" {
 //EXTERN BOOL_VAR(tessedit_small_match,FALSE,"Use small matrix matcher");

 //extern FILE*                          matcher_fp;
 //extern FILE*                          correct_fp;
 //};
 // Run-time control variables. Each *_VAR line names the variable, gives a
 // value (presumably its default - the macros are defined elsewhere) and a
 // help string.

 // General debugging, training and word-fixing switches.
 BOOL_VAR (tessedit_small_match, FALSE, "Use small matrix matcher");
 EXTERN BOOL_VAR (tessedit_print_text, FALSE, "Write text to stdout");
 EXTERN BOOL_VAR (tessedit_draw_words, FALSE, "Draw source words");
 EXTERN BOOL_VAR (tessedit_draw_outwords, FALSE, "Draw output words");
 EXTERN BOOL_VAR (tessedit_training_wiseowl, FALSE, "Call WO to learn blobs");
 EXTERN BOOL_VAR (tessedit_training_tess, FALSE, "Call Tess to learn blobs");
 EXTERN BOOL_VAR (tessedit_matcher_is_wiseowl, FALSE, "Call WO to classify");
 EXTERN BOOL_VAR (tessedit_dump_choices, FALSE, "Dump char choices");
 EXTERN BOOL_VAR (tessedit_fix_fuzzy_spaces, TRUE,
 "Try to improve fuzzy spaces");
 EXTERN BOOL_VAR (tessedit_unrej_any_wd, FALSE,
 "Dont bother with word plausibility");
 EXTERN BOOL_VAR (tessedit_fix_hyphens, TRUE, "Crunch double hyphens?");

 // Full-stop rejection switches; recog_all_words consults them in pass 3.
 // tessedit_reject_fullstops takes precedence when both are set.
 EXTERN BOOL_VAR (tessedit_reject_fullstops, FALSE, "Reject all fullstops");
 EXTERN BOOL_VAR (tessedit_reject_suspect_fullstops, FALSE,
 "Reject suspect fullstops");
 // x-height re-estimation and related controls (several are read by
 // classify_word_pass2).
 EXTERN BOOL_VAR (tessedit_redo_xheight, TRUE, "Check/Correct x-height");
 EXTERN BOOL_VAR (tessedit_cluster_adaption_on, TRUE,
 "Do our own adaption - ems only");
 EXTERN BOOL_VAR (tessedit_enable_doc_dict, TRUE,
 "Add words to the document dictionary");
 EXTERN BOOL_VAR (word_occ_first, FALSE, "Do word occ before re-est xht");
 EXTERN BOOL_VAR (tessedit_debug_fonts, FALSE, "Output font info per char");
 EXTERN BOOL_VAR (tessedit_xht_fiddles_on_done_wds, TRUE,
 "Apply xht fix up even if done");
 EXTERN BOOL_VAR (tessedit_xht_fiddles_on_no_rej_wds, TRUE,
 "Apply xht fix up even in no rejects");
 EXTERN INT_VAR (x_ht_check_word_occ, 2, "Check Char Block occupancy");
 EXTERN INT_VAR (x_ht_stringency, 1, "How many confirmed a/n to accept?");
 EXTERN BOOL_VAR (x_ht_quality_check, TRUE, "Dont allow worse quality");
 EXTERN BOOL_VAR (tessedit_debug_block_rejection, FALSE,
 "Block and Row stats");
 EXTERN INT_VAR (debug_x_ht_level, 0, "Reestimate debug");
 EXTERN BOOL_VAR (rej_use_xht, TRUE, "Individual rejection control");
 EXTERN BOOL_VAR (debug_acceptable_wds, FALSE, "Dump word pass/fail chk");

 // Punctuation character sets.
 EXTERN STRING_VAR (chs_leading_punct, "('`\"", "Leading punctuation");
 EXTERN
 STRING_VAR (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
 EXTERN STRING_VAR (chs_trailing_punct2, ")'`\"",
 "2nd Trailing punctuation");

 // Document quality thresholds. recog_all_words compares the four *_pc values
 // against per-character ratios (count / page char_count) to decide whether
 // the page is a "good quality" document.
 EXTERN double_VAR (quality_rej_pc, 0.08,
 "good_quality_doc lte rejection limit");
 EXTERN double_VAR (quality_blob_pc, 0.0,
 "good_quality_doc gte good blobs limit");
 EXTERN double_VAR (quality_outline_pc, 1.0,
 "good_quality_doc lte outline error limit");
 EXTERN double_VAR (quality_char_pc, 0.95,
 "good_quality_doc gte good char limit");
 EXTERN INT_VAR (quality_min_initial_alphas_reqd, 2,
 "alphas in a good word");
 // Adaption controls.
 // tessedit_tess_adaption_mode is temporarily set to 0 by recog_all_words
 // while tessedit_cluster_adapt_before_pass1 is in effect, and
 // tessedit_tess_adapt_to_rejmap is forced TRUE in that case.
 EXTERN BOOL_VAR (tessedit_tess_adapt_to_rejmap, FALSE,
 "Use reject map to control Tesseract adaption");
 EXTERN INT_VAR (tessedit_tess_adaption_mode, 0x27,
 "Adaptation decision algorithm for tess");
 EXTERN INT_VAR (tessedit_em_adaption_mode, 0,
 "Adaptation decision algorithm for ems matrix matcher");
 // One switch per point at which cluster adaption may run. The help strings
 // for pass 2 and pass 3 previously both read "after pass 1" (copy-paste).
 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass1, FALSE,
 "Adapt using clusterer after pass 1");
 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass2, FALSE,
 "Adapt using clusterer after pass 2");
 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass3, FALSE,
 "Adapt using clusterer after pass 3");
 EXTERN BOOL_VAR (tessedit_cluster_adapt_before_pass1, FALSE,
 "Adapt using clusterer before Tess adaping during pass 1");
 EXTERN INT_VAR (tessedit_cluster_adaption_mode, 0,
 "Adaptation decision algorithm for matrix matcher");
 EXTERN BOOL_VAR (tessedit_adaption_debug, FALSE,
 "Generate and print debug information for adaption");
 // When set, recog_all_words turns on tessedit_test_adaption and
 // tessedit_minimal_rejection at entry.
 EXTERN BOOL_VAR (tessedit_minimal_rej_pass1, FALSE,
 "Do minimal rejection on pass 1 output");
 // When set, recog_all_words skips passes 2 and 3 and the rejection pass.
 EXTERN BOOL_VAR (tessedit_test_adaption, FALSE,
 "Test adaption criteria");
 EXTERN BOOL_VAR (tessedit_global_adaption, FALSE,
 "Adapt to all docs over time");
 EXTERN BOOL_VAR (tessedit_matcher_log, FALSE, "Log matcher activity");
 EXTERN INT_VAR (tessedit_test_adaption_mode, 3,
 "Adaptation decision algorithm for tess");
 // When set, classify_word_pass1 heap-allocates the per-blob choice lists and
 // hands them to the word's best_choice instead of clearing them on exit.
 EXTERN BOOL_VAR(save_best_choices, FALSE,
                 "Save the results of the recognition step"
 " (blob_choices) within the corresponding WERD_CHOICE");

 // Debug test point.
 // NOTE(review): presumably consumed by check_debug_pt(), which is not
 // visible in this part of the file - confirm.
 EXTERN BOOL_VAR (test_pt, FALSE, "Test for point");
 EXTERN double_VAR (test_pt_x, 99999.99, "xcoord");
 EXTERN double_VAR (test_pt_y, 99999.99, "ycoord");

 // Defined in other translation units.
 extern int display_ratings;
 extern int number_debug;
 FILE *choice_file = NULL;        //Choice file ptr

 // List macros instantiated for PBLOB.
 // NOTE(review): presumably CLISTIZEH declares and CLISTIZE defines the
 // list class for PBLOB - confirm against the CLIST headers.
 CLISTIZEH (PBLOB) CLISTIZE (PBLOB)
 /* DEBUGGING */
 // Debugging aid: returns the length of the blob list of word w.
 // w is dereferenced unconditionally, so it must not be NULL.
 inT16 blob_count(WERD *w) {
   return w->blob_list ()->length ();
 }


 /**********************************************************************
  * recog_pseudo_word
  *
  * Make a word from the selected blobs and run Tess on them.
  **********************************************************************/
 namespace tesseract {
 // Builds a temporary word from the blobs selected by selection_box (via
 // make_pseudo_word) and runs interactive recognition on it.
 //   block_list     blocks to search for the selected blobs
 //   selection_box  box that selects the blobs
 // Nothing happens when make_pseudo_word yields no word; otherwise the
 // temporary word is deleted once it has been recognized.
 void Tesseract::recog_pseudo_word(BLOCK_LIST *block_list,
                                   TBOX &selection_box) {
   // Passed uninitialised to make_pseudo_word; presumably output arguments
   // that receive the row and block the word belongs to.
   ROW *word_row;
   BLOCK *word_block;

   WERD *pseudo_word = make_pseudo_word (block_list, selection_box,
                                         word_block, word_row);
   if (pseudo_word == NULL)
     return;

   recog_interactive(word_block, word_row, pseudo_word);
   delete pseudo_word;
 }


 /**********************************************************************
  * recog_interactive
  *
  * Recognize a single word in interactive mode.
  **********************************************************************/
 // Recognizes one word in interactive mode by running the pass 2 classifier
 // on a temporary WERD_RES wrapped around it.
 //   block  block containing the word
 //   row    row containing the word
 //   word   word to recognize
 // Unless SECURE_NAMES is defined, the word's quality metrics are printed
 // when tessedit_debug_quality_metrics is set. Always returns TRUE.
 BOOL8 Tesseract::recog_interactive(BLOCK *block, ROW *row, WERD *word) {
   WERD_RES word_res(word);

   classify_word_pass2(&word_res, block, row);
 #ifndef SECURE_NAMES
   if (tessedit_debug_quality_metrics) {
     inT16 all_chars_quality;
     inT16 accepted_chars_quality;

     word_char_quality(&word_res, row,
                       &all_chars_quality, &accepted_chars_quality);
     tprintf("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
             word_res.reject_map.length (),
             word_blob_quality (&word_res, row),
             word_outline_errs (&word_res),
             all_chars_quality,
             accepted_chars_quality);
   }
 #endif
   return TRUE;
 }


 /**********************************************************************
  * recog_all_words()
  *
  * Walk the current block list applying the specified word processor function
  * to all words
  **********************************************************************/

 void Tesseract::recog_all_words(                            //process words
                                 PAGE_RES *page_res,         //page structure
                                                             //progress monitor
                                 volatile ETEXT_DESC *monitor,
                                         // specifies just to extract a rectangle
                                 TBOX *target_word_box,
                                 //0 - all, 1 just pass 1, 2 passes 2 and higher
                                 inT16 dopasses
                                ) {
                                  //reset page iterator
   static PAGE_RES_IT page_res_it;
   inT16 chars_in_word;
   inT16 rejects_in_word;
   static CHAR_SAMPLES_LIST em_clusters;
   static CHAR_SAMPLE_LIST ems_waiting;
   static CHAR_SAMPLES_LIST char_clusters;
   static CHAR_SAMPLE_LIST chars_waiting;
   inT16 blob_quality = 0;
   inT16 outline_errs = 0;
   static inT16 doc_blob_quality = 0;
   static inT16 doc_outline_errs = 0;
   static inT16 doc_char_quality = 0;
   inT16 all_char_quality;
   inT16 accepted_all_char_quality;
   static inT16 good_char_count = 0;
   static inT16 doc_good_char_quality = 0;
   int i;


   inT32 tess_adapt_mode = 0;
   static inT32 word_count;              //count of words in doc
   inT32 word_index;              //current word
   static int dict_words;

   if (tessedit_minimal_rej_pass1) {
     tessedit_test_adaption.set_value (TRUE);
     tessedit_minimal_rejection.set_value (TRUE);
   }

   if (tessedit_cluster_adapt_before_pass1) {
     tess_adapt_mode = tessedit_tess_adaption_mode;
     tessedit_tess_adaption_mode.set_value (0);
     tessedit_tess_adapt_to_rejmap.set_value (TRUE);
   }


 if (dopasses==0 || dopasses==1)
 {
 	page_res_it.page_res=page_res;
 	page_res_it.restart_page();

   /* Pass 1 */
   word_count = 0;
   if (monitor != NULL) {
     monitor->ocr_alive = TRUE;
     while (page_res_it.word () != NULL) {
       word_count++;
       page_res_it.forward ();
     }
     page_res_it.restart_page ();
   }
   else
     word_count = 1;

   word_index = 0;

 	em_clusters.clear();
     ems_waiting.clear();
     char_clusters.clear();
     chars_waiting.clear();
     dict_words = 0;
 	doc_blob_quality = 0;
 	doc_outline_errs = 0;
 	doc_char_quality = 0;
 	good_char_count = 0;
 	doc_good_char_quality = 0;

   while (page_res_it.word () != NULL) {
     set_global_loc_code(LOC_PASS1);
     word_index++;
     if (monitor != NULL) {
       monitor->ocr_alive = TRUE;
       monitor->progress = 30 + 50 * word_index / word_count;
       if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
                                                          dict_words)))
         return;
     }
     classify_word_pass1(page_res_it.word(), page_res_it.row()->row,
                         page_res_it.block()->block, FALSE, NULL, NULL);
     if (tessedit_dump_choices) {
 #ifndef GRAPHICS_DISABLED
       word_dumper(NULL, page_res_it.row()->row, page_res_it.word()->word);
 #endif
       tprintf("Pass1: %s [%s]\n",
               page_res_it.word()->best_choice->unichar_string().string(),
               page_res_it.word()->best_choice->
                 debug_string(unicharset).string());
     }

     if (tessedit_test_adaption && !tessedit_minimal_rejection) {
       if (!word_adaptable (page_res_it.word (),
         tessedit_test_adaption_mode)) {
         page_res_it.word ()->reject_map.rej_word_tess_failure ();
       //FAKE PERM REJ
       } else {
         // Override rejection mechanisms for this word.
         UNICHAR_ID space = unicharset.unichar_to_id(" ");
         for (i = 0; i < page_res_it.word()->best_choice->length(); i++) {
           if ((page_res_it.word()->best_choice->unichar_id(i) != space) &&
               page_res_it.word()->reject_map[i].rejected())
             page_res_it.word ()->reject_map[i].setrej_minimal_rej_accept();
         }
       }
     }

     if ((tessedit_cluster_adapt_after_pass1
       || tessedit_cluster_adapt_after_pass3
       || tessedit_cluster_adapt_before_pass1)
     && tessedit_cluster_adaption_mode != 0) {
       collect_characters_for_adaption (page_res_it.word (),
         &char_clusters, &chars_waiting);
     }
     // Count dict words.
     if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
       ++dict_words;
     page_res_it.forward ();
   }

   if (tessedit_cluster_adapt_before_pass1)
     tessedit_tess_adaption_mode.set_value (tess_adapt_mode);

   page_res_it.restart_page ();
   while ((tessedit_cluster_adapt_after_pass1
     || tessedit_cluster_adapt_before_pass1)
   && page_res_it.word () != NULL) {
     if (monitor != NULL)
       monitor->ocr_alive = TRUE;
     if (tessedit_cluster_adapt_after_pass1)
       adapt_to_good_samples (page_res_it.word (),
         &char_clusters, &chars_waiting);
     else
       classify_word_pass1 (page_res_it.word (),
         page_res_it.row ()->row,
                           page_res_it.block()->block,
         TRUE, &char_clusters, &chars_waiting);

     page_res_it.forward ();
   }

   //


  }

 if (dopasses==1) return;

   /* Pass 2 */
   page_res_it.restart_page ();
   word_index = 0;
   while (!tessedit_test_adaption && page_res_it.word () != NULL) {
     set_global_loc_code(LOC_PASS2);
     word_index++;
     if (monitor != NULL) {
       monitor->ocr_alive = TRUE;
       monitor->progress = 80 + 10 * word_index / word_count;
       if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
                                                          dict_words)))
         return;
     }
 //changed by jetsoft
 //specific to its needs to extract one word when need

 	if (target_word_box)
 	{

 		TBOX current_word_box=page_res_it.word ()->word->bounding_box();
 		FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
 		if (!target_word_box->contains(center_pt))
 		{
 			page_res_it.forward ();
 			continue;
 		}

 	}
 //end jetsoft

     classify_word_pass2(page_res_it.word(), page_res_it.block()->block,
                         page_res_it.row()->row);
     if (tessedit_dump_choices) {
 #ifndef GRAPHICS_DISABLED
       word_dumper(NULL, page_res_it.row()->row, page_res_it.word()->word);
 #endif
       tprintf("Pass2: %s [%s]\n",
               page_res_it.word()->best_choice->unichar_string().string(),
               page_res_it.word()->best_choice->
                 debug_string(unicharset).string());
     }

     if (tessedit_em_adaption_mode > 0)
       collect_ems_for_adaption (page_res_it.word (),
         &em_clusters, &ems_waiting);

     if (tessedit_cluster_adapt_after_pass2
       && tessedit_cluster_adaption_mode != 0)
       collect_characters_for_adaption (page_res_it.word (),
         &char_clusters, &chars_waiting);
     page_res_it.forward ();
   }

   /* Another pass */
   set_global_loc_code(LOC_FUZZY_SPACE);

   if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
     && !tessedit_word_for_word)
     fix_fuzzy_spaces(monitor, word_count, page_res);

   if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0)
                                  // Initially ems only
     print_em_stats(&em_clusters, &ems_waiting);

   /* Pass 3 - used for checking confusion sets */
   page_res_it.restart_page ();
   word_index = 0;
   while (!tessedit_test_adaption && page_res_it.word () != NULL) {
     set_global_loc_code(LOC_MM_ADAPT);
     word_index++;
     if (monitor != NULL) {
       monitor->ocr_alive = TRUE;
       monitor->progress = 95 + 5 * word_index / word_count;
     }
     check_debug_pt (page_res_it.word (), 70);
     /* Use good matches to sort out confusions */


 //changed by jetsoft
 //specific to its needs to extract one word when need

 	if (target_word_box)
 	{

 		TBOX current_word_box=page_res_it.word ()->word->bounding_box();
 		FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
 		if (!target_word_box->contains(center_pt))
 		{
 			page_res_it.forward ();
 			continue;
 		}

 	}
 // end jetsoft

     if (tessedit_em_adaption_mode != 0)
       adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting);

     if (tessedit_cluster_adapt_after_pass2
       && tessedit_cluster_adaption_mode != 0)
       adapt_to_good_samples (page_res_it.word (),
         &char_clusters, &chars_waiting);

     UNICHAR_ID dot = unicharset.unichar_to_id(".");
     if (tessedit_reject_fullstops &&
         page_res_it.word()->best_choice->contains_unichar_id(dot)) {
       reject_all_fullstops (page_res_it.word ());
     } else if (tessedit_reject_suspect_fullstops &&
              page_res_it.word()->best_choice->contains_unichar_id(dot)) {
       reject_suspect_fullstops (page_res_it.word ());
     }

     page_res_it.rej_stat_word ();
     chars_in_word = page_res_it.word ()->reject_map.length ();
     rejects_in_word = page_res_it.word ()->reject_map.reject_count ();

     blob_quality = word_blob_quality (page_res_it.word (),
       page_res_it.row ()->row);
     doc_blob_quality += blob_quality;
     outline_errs = word_outline_errs (page_res_it.word ());
     doc_outline_errs += outline_errs;
     word_char_quality (page_res_it.word (),
       page_res_it.row ()->row,
       &all_char_quality, &accepted_all_char_quality);
     doc_char_quality += all_char_quality;
     uinT8 permuter_type = page_res_it.word ()->best_choice->permuter ();
     if ((permuter_type == SYSTEM_DAWG_PERM) ||
       (permuter_type == FREQ_DAWG_PERM) ||
     (permuter_type == USER_DAWG_PERM)) {
       good_char_count += chars_in_word - rejects_in_word;
       doc_good_char_quality += accepted_all_char_quality;
     }
     check_debug_pt (page_res_it.word (), 80);
     if (tessedit_reject_bad_qual_wds &&
       (blob_quality == 0) && (outline_errs >= chars_in_word))
       page_res_it.word ()->reject_map.rej_word_bad_quality ();
     check_debug_pt (page_res_it.word (), 90);
     page_res_it.forward ();
   }

   page_res_it.restart_page ();
   while (!tessedit_test_adaption
   && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) {
     if (monitor != NULL)
       monitor->ocr_alive = TRUE;

 //changed by jetsoft
 //specific to its needs to extract one word when need

 	if (target_word_box)
 	{

 		TBOX current_word_box=page_res_it.word ()->word->bounding_box();
 		FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
 		if (!target_word_box->contains(center_pt))
 		{
 			page_res_it.forward ();
 			continue;
 		}

 	}

 //end jetsoft
     if (tessedit_cluster_adaption_mode != 0)
       adapt_to_good_samples (page_res_it.word (),
         &char_clusters, &chars_waiting);
     page_res_it.forward ();
   }

   #ifndef SECURE_NAMES
   if (tessedit_debug_quality_metrics) {
     tprintf
       ("QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
       page_res->char_count, page_res->rej_count,
       page_res->rej_count / (float) page_res->char_count, doc_blob_quality,
       doc_blob_quality / (float) page_res->char_count, doc_outline_errs,
       doc_outline_errs / (float) page_res->char_count, doc_char_quality,
       doc_char_quality / (float) page_res->char_count,
       doc_good_char_quality,
       good_char_count >
       0 ? doc_good_char_quality / (float) good_char_count : 0.0);
   }
   #endif
   BOOL8 good_quality_doc =
     (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc)
     &&
     (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) &&
     (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) &&
     (doc_char_quality / (float) page_res->char_count >= quality_char_pc);

   /* Do whole document or whole block rejection pass*/

   if (!tessedit_test_adaption) {
     set_global_loc_code(LOC_DOC_BLK_REJ);
     quality_based_rejection(page_res_it, good_quality_doc);
   }
   font_recognition_pass(page_res_it);

   /* Write results pass */
   set_global_loc_code(LOC_WRITE_RESULTS);
   // This is now redundant, but retained commented so show how to obtain
   // bounding boxes and style information.

   // changed by jetsoft
   // needed for dll to output memory structure
   if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
     output_pass(page_res_it, ocr_char_space() > 0, target_word_box);
   // end jetsoft
 }


 /**********************************************************************
  * classify_word_pass1
  *
  * Baseline normalize the word and pass it to Tess.
  **********************************************************************/

 // Pass 1 recognition of a single word.
 //   word           word to recognize; its results are stored back into it
 //   row, block     row and block containing the word
 //   cluster_adapt  when TRUE, adapt_to_good_samples is run on the word
 //   char_clusters, chars_waiting
 //                  sample lists passed to adapt_to_good_samples; only used
 //                  when cluster_adapt is TRUE
 // When matcher_fp is set, one line is read from correct_fp and, if not
 // empty, becomes the word's text and word_answer; a failed read or a
 // missing correct_fp is treated like an empty line.
 // A zero-length or all-space best choice marks the word as a Tess failure
 // to be retried in pass 2.
 void Tesseract::classify_word_pass1(                 //recog one word
                                     WERD_RES *word,  //word to do
                                     ROW *row,
                                     BLOCK* block,
                                     BOOL8 cluster_adapt,
                                     CHAR_SAMPLES_LIST *char_clusters,
                                     CHAR_SAMPLE_LIST *chars_waiting) {
   WERD *bln_word;                //baseline norm copy
                                  //detailed results
   BLOB_CHOICE_LIST_CLIST local_blob_choices;
   BLOB_CHOICE_LIST_CLIST *blob_choices;
   BOOL8 adapt_ok;
   const char *rejmap;
   inT16 index;
   STRING mapstr = "";
   char *match_string;
   char word_string[1024];

   // Heap-allocate the choice lists only when they are to outlive this call.
   if (save_best_choices)
     blob_choices = new BLOB_CHOICE_LIST_CLIST();
   else
     blob_choices = &local_blob_choices;

   if (matcher_fp != NULL) {
     // NOTE(review): the guard tests matcher_fp but the line is read from
     // correct_fp - confirm that this is intended.
     // fgets leaves the buffer unusable on EOF or error, so fall back to an
     // empty line instead of scanning uninitialised memory.
     if (correct_fp == NULL ||
         fgets (word_string, sizeof (word_string), correct_fp) == NULL)
       word_string[0] = '\0';
     // Strip the line ending, whichever style it is.
     if ((match_string = strchr (word_string, '\r')) != NULL)
       *match_string = '\0';
     if ((match_string = strchr (word_string, '\n')) != NULL)
       *match_string = '\0';
     if (word_string[0] != '\0') {
       word->word->set_text (word_string);
       word_answer = (char *) word->word->text ();
     }
     else
       word_answer = NULL;
   }

   check_debug_pt (word, 0);
   bln_word = make_bln_copy(word->word, row, block, word->x_height,
                            &word->denorm);

   word->best_choice = tess_segment_pass1 (bln_word, &word->denorm,
                                           &Tesseract::tess_default_matcher,
                                           word->raw_choice, blob_choices,
                                           word->outword);
   /*
      Test for TESS screw up on word. Recog_word has already ensured that the
      choice list, outword blob lists and best_choice string are the same
      length. A TESS screw up is indicated by a blank filled or 0 length string.
    */
   if ((word->best_choice->length() == 0) ||
       (strspn (word->best_choice->unichar_string().string(), " ") ==
        word->best_choice->length())) {
     word->done = FALSE;          // Try again on pass2 - adaption may help.
     word->tess_failed = TRUE;
     word->reject_map.initialise(word->best_choice->length());
     word->reject_map.rej_word_tess_failure ();
   } else {
     word->tess_failed = FALSE;
     // Report the mismatching lengths before the assertions below fire.
     if ((word->best_choice->length() !=
          word->outword->blob_list()->length()) ||
         (word->best_choice->length() != blob_choices->length())) {
       tprintf
         ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
         word->best_choice->debug_string(unicharset).string(),
         word->best_choice->length(),
         word->outword->blob_list()->length(),
         blob_choices->length());
     }
     ASSERT_HOST (word->best_choice->length() ==
                  word->outword->blob_list()->length());
     ASSERT_HOST (word->best_choice->length() == blob_choices->length ());

     /*
        The adaption step used to be here. It has been moved to after
        make_reject_map so that we know whether the word will be accepted in the
        first pass or not.   This move will PREVENT adaption to words containing
        double quotes because the word will not be identical to what tess thinks
        its best choice is. (See CurrentBestChoiceIs in
        danj/microfeatures/stopper.c which is used by AdaptableWord in
        danj/microfeatures/adaptmatch.c)
      */

     if (word->word->flag (W_REP_CHAR)) {
       fix_rep_char(word);
     } else {
       // TODO(daria) delete these hacks when replaced by more generic code.
       // Convert '' (double single) to " (single double).
       fix_quotes(word->best_choice, word->outword, blob_choices);
       if (tessedit_fix_hyphens)  // turn -- to -
         fix_hyphens (word->best_choice, word->outword, blob_choices);
       record_certainty (word->best_choice->certainty (), 1);
       // accounting.

       word->tess_accepted = tess_acceptable_word (word->best_choice,
                                                   word->raw_choice);

       word->tess_would_adapt = tess_adaptable_word (word->outword,
                                                     word->best_choice,
                                                     word->raw_choice);
                                  // Also sets word->done flag
       make_reject_map (word, blob_choices, row, 1);

       adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode);

       if (cluster_adapt)
         adapt_to_good_samples(word, char_clusters, chars_waiting);

       if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
         if (!tessedit_tess_adapt_to_rejmap) {
           rejmap = NULL;
         } else {
           ASSERT_HOST(word->reject_map.length() ==
                       word->best_choice->length());

           // One flag character per blob: '1' where adaption is allowed,
           // '0' where the character was rejected and the word as a whole
           // is not adaptable.
           for (index = 0; index < word->reject_map.length (); index++) {
             if (adapt_ok || word->reject_map[index].accepted ())
               mapstr += '1';
             else
               mapstr += '0';
           }
           rejmap = mapstr.string ();
         }

                                  // adapt to it.
         tess_adapter (word->outword, &word->denorm,
                       *word->best_choice,
                       *word->raw_choice, rejmap);
       }

       if (tessedit_enable_doc_dict)
         tess_add_doc_word (word->best_choice);
       set_word_fonts(word, blob_choices);
     }
   }
   delete bln_word;

   // Save best choices in the WERD_CHOICE if needed
   if (blob_choices != &local_blob_choices) {
     word->best_choice->set_blob_choices(blob_choices);
   } else {
     blob_choices->deep_clear();
   }
 }

 /**********************************************************************
  * classify_word_pass2
  *
  * Control what to do with the word in pass 2
  **********************************************************************/

 void Tesseract::classify_word_pass2(WERD_RES *word, BLOCK* block, ROW *row) {
   BOOL8 done_this_pass = FALSE;
   WERD_RES new_x_ht_word (word->word);
   float new_x_ht = 0.0;
   inT16 old_xht_reject_count;
   inT16 new_xht_reject_count;
   inT16 old_xht_accept_count;
   inT16 new_xht_accept_count;
   BOOL8 accept_new_x_ht = FALSE;
   inT16 old_chs_in_wd;
   inT16 new_chs_in_wd;
   inT16 old_word_quality;
   inT16 new_word_quality;
   inT16 dummy;

   set_global_subloc_code(SUBLOC_NORM);
   check_debug_pt (word, 30);
   if (!word->done ||
     tessedit_training_tess ||
   tessedit_training_wiseowl) {
     word->caps_height = 0.0;
     if (word->x_height == 0.0f)
       word->x_height = row->x_height();
     if (word->outword != NULL) {
       delete word->outword;      //get rid of junk
       delete word->best_choice;
       delete word->raw_choice;
     }
     match_word_pass2 (word, row, block, word->x_height);
     done_this_pass = TRUE;
     check_debug_pt (word, 40);
   }

   if (!word->tess_failed && !word->word->flag (W_REP_CHAR)) {
     set_global_subloc_code(SUBLOC_FIX_XHT);
     if ((tessedit_xht_fiddles_on_done_wds || !word->done) &&
       (tessedit_xht_fiddles_on_no_rej_wds ||
     (word->reject_map.reject_count () > 0))) {
       if ((x_ht_check_word_occ >= 2) && word_occ_first)
         check_block_occ(word);

       if (tessedit_redo_xheight)
         re_estimate_x_ht(word, &new_x_ht);

       if (((x_ht_check_word_occ >= 2) && !word_occ_first) ||
         ((x_ht_check_word_occ >= 1) && (new_x_ht > 0)))
         check_block_occ(word);
     }
     if (new_x_ht > 0) {
       old_chs_in_wd = word->reject_map.length ();

       /* Re-estimated x_ht error suggests a rematch is worthwhile. */
       new_x_ht_word.x_height = new_x_ht;
       new_x_ht_word.caps_height = 0.0;
       match_word_pass2(&new_x_ht_word, row, block, new_x_ht_word.x_height);
       if (!new_x_ht_word.tess_failed) {
         if ((x_ht_check_word_occ >= 1) && word_occ_first)
           check_block_occ(&new_x_ht_word);

         re_estimate_x_ht(&new_x_ht_word, &new_x_ht);

         if ((x_ht_check_word_occ >= 1) && !word_occ_first)
           check_block_occ(&new_x_ht_word);

         old_xht_reject_count = word->reject_map.reject_count ();
         old_xht_accept_count = old_chs_in_wd - old_xht_reject_count;
         new_xht_reject_count = new_x_ht_word.reject_map.reject_count ();
         new_chs_in_wd = new_x_ht_word.reject_map.length ();
         new_xht_accept_count = new_chs_in_wd - new_xht_reject_count;
         accept_new_x_ht =
           ((new_xht_accept_count > old_xht_accept_count) ||
           ((new_xht_accept_count == old_xht_accept_count) &&
           (new_xht_accept_count > 0))) &&
           (!new_x_ht_word.guessed_x_ht ||
           !new_x_ht_word.guessed_caps_ht);

         if (accept_new_x_ht && x_ht_quality_check) {
           word_char_quality(word, row, &old_word_quality, &dummy);
           word_char_quality(&new_x_ht_word, row, &new_word_quality, &dummy);
           if (old_word_quality > new_word_quality)
             accept_new_x_ht = FALSE;
         }

         if (accept_new_x_ht && (x_ht_stringency > 0)) {
           accept_new_x_ht =
             (count_alphanums (&new_x_ht_word) > x_ht_stringency);
           if (!accept_new_x_ht && rej_use_xht) {
             if (debug_x_ht_level >= 1)
               tprintf
                 ("Failed stringency test so reject original word\n");
             word->reject_map.rej_word_xht_fixup ();
           }
         }

         #ifndef SECURE_NAMES
         if (debug_x_ht_level >= 1) {
           tprintf ("New XHT Match:: %s ",
                    word->best_choice->debug_string(unicharset).string());
           word->reject_map.print (debug_fp);
           tprintf (" -> %s ",
                    new_x_ht_word.best_choice->debug_string(
                        unicharset).string());
           new_x_ht_word.reject_map.print (debug_fp);
           tprintf (" %s->%s %s %s\n",
             word->guessed_x_ht ? "GUESS" : "CERT",
             new_x_ht_word.guessed_x_ht ? "GUESS" : "CERT",
             new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
             accept_new_x_ht ? "ACCEPTED" : "");
         }
         #endif
       }
       if (accept_new_x_ht) {
         /*
            The new x_ht is deemed superior so put the final results in the real
            word and destroy the old results
          */
         delete word->outword;    //get rid of junk
         word->outword = new_x_ht_word.outword;
         word->denorm = new_x_ht_word.denorm;
         delete word->best_choice;
         word->best_choice = new_x_ht_word.best_choice;
         delete word->raw_choice;
         word->raw_choice = new_x_ht_word.raw_choice;
         word->reject_map = new_x_ht_word.reject_map;
         word->done = new_x_ht_word.done;
         done_this_pass = TRUE;
       }
       else {
       /*
          The new x_ht is no better, so destroy the copy word and put any
          uncertain x or cap ht estimate back to default. (I.e. don't blame
          me if it's bad!) Conditionally, use any amended block occ chars.
        */
                                  //get rid of junk
         delete new_x_ht_word.outword;
         delete new_x_ht_word.best_choice;
         delete new_x_ht_word.raw_choice;
       }
                                  //to keep new destructor happy
       new_x_ht_word.outword = NULL;
                                  //to keep new destructor happy
       new_x_ht_word.best_choice = NULL;
                                  //to keep new destructor happy
       new_x_ht_word.raw_choice = NULL;

       if (rej_mostly_reject_mode == 2) {
         reject_mostly_rejects(word);
         tprintf("Rejecting mostly rejects on %s ",
                 word->best_choice->debug_string(unicharset).string());
       }
     }

     set_global_subloc_code(SUBLOC_NORM);

     if (done_this_pass && !word->done && tessedit_save_stats) {
       STRING word_str;
       word->best_choice->string_and_lengths(unicharset, &word_str, NULL);
       SaveBadWord(word_str.string(), word->best_choice->certainty());
     }
     record_certainty (word->best_choice->certainty(), 2);
     //accounting
   }
 #ifndef GRAPHICS_DISABLED
   if (tessedit_draw_outwords) {
     if (fx_win == NULL)
       create_fx_win();
     clear_fx_win();
     word->outword->plot (fx_win);
     TBOX wbox = word->outword->bounding_box();
     fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
                             wbox.right(), wbox.bottom());
     //make_picture_current(fx_win);
     ScrollView::Update();
   }
 #endif

   set_global_subloc_code(SUBLOC_NORM);
 #if 0
   if (tessedit_print_text) {
     write_cooked_text (word->outword, word->best_choice->string (),
       word->done, done_this_pass, stdout);
   }
 #endif
   check_debug_pt (word, 50);
 }


 /**********************************************************************
  * match_word_pass2
  *
  * Baseline normalize the word and pass it to Tess.
  **********************************************************************/

 // Second-pass recognition of one word: build a baseline-normalised copy,
 // hand it to the Tess segmenter, then validate and post-process the result
 // (repeated-char voting or quote/hyphen fixing plus reject map creation).
 //   word     - result record to fill in (best_choice, reject_map, flags)
 //   row      - row containing the word
 //   block    - block passed through to make_bln_copy
 //   x_height - x-height used for the baseline normalisation
 void Tesseract::match_word_pass2(WERD_RES *word, ROW *row, BLOCK* block,
                                  float x_height) {
   // The per-blob choices live on the stack unless they are to be attached
   // to the best choice, in which case they are heap allocated.
   BLOB_CHOICE_LIST_CLIST local_blob_choices;
   BLOB_CHOICE_LIST_CLIST *blob_choices =
       save_best_choices ? new BLOB_CHOICE_LIST_CLIST() : &local_blob_choices;

   set_global_subsubloc_code(SUBSUBLOC_OTHER);
   if (matcher_fp != NULL) {
     // An empty truth string is treated the same as having no truth at all.
     char *answer = (char *) word->word->text ();
     word_answer = (answer != NULL && answer[0] == '\0') ? NULL : answer;
   }

   // Baseline normalised copy of the word; deleted at the end of this call.
   WERD *bln_word =
       make_bln_copy (word->word, row, block, x_height, &word->denorm);

   set_global_subsubloc_code(SUBSUBLOC_TESS);
   if (tessedit_training_tess) {
     word->best_choice = correct_segment_pass2 (bln_word,
                                                &word->denorm,
                                                &Tesseract::tess_default_matcher,
                                                tess_training_tester,
                                                word->raw_choice,
                                                blob_choices, word->outword);
   } else {
     word->best_choice = tess_segment_pass2 (bln_word, &word->denorm,
                                             &Tesseract::tess_default_matcher,
                                             word->raw_choice, blob_choices,
                                             word->outword);
   }
   set_global_subsubloc_code(SUBSUBLOC_OTHER);

   // A Tess failure shows up as a zero length result or a result made up
   // entirely of blanks.
   const bool tess_gave_nothing =
       (word->best_choice->length() == 0) ||
       (strspn (word->best_choice->unichar_string().string (), " ") ==
        word->best_choice->length());

   if (tess_gave_nothing) {
     word->tess_failed = TRUE;
     word->reject_map.initialise (word->best_choice->length());
     word->reject_map.rej_word_tess_failure ();
   } else {
     // The choice string, the outword blobs and the blob choices must all
     // have the same length; report the details before asserting.
     if ((word->best_choice->length() !=
          word->outword->blob_list()->length ()) ||
         (word->best_choice->length() != blob_choices->length())) {
       tprintf
         ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
         word->best_choice->debug_string(unicharset).string(),
         word->best_choice->length(),
         word->outword->blob_list()->length(), blob_choices->length());
     }
     ASSERT_HOST (word->best_choice->length() ==
                  word->outword->blob_list()->length());
     ASSERT_HOST (word->best_choice->length() == blob_choices->length());

     word->tess_failed = FALSE;
     if (word->word->flag (W_REP_CHAR)) {
       fix_rep_char(word);
     } else {
       fix_quotes (word->best_choice, word->outword, blob_choices);
       if (tessedit_fix_hyphens)
         fix_hyphens (word->best_choice, word->outword, blob_choices);

       // The fixers edit all three sequences, so re-verify the lengths.
       if ((word->best_choice->length() !=
            word->outword->blob_list()->length()) ||
           (word->best_choice->length() != blob_choices->length())) {
         #ifndef SECURE_NAMES
         tprintf
           ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
            word->best_choice->debug_string(unicharset).string(),
            word->best_choice->length(),
            word->outword->blob_list()->length(), blob_choices->length());
         #endif
       }
       ASSERT_HOST (word->best_choice->length() ==
                    word->outword->blob_list()->length());
       ASSERT_HOST (word->best_choice->length() == blob_choices->length());

       word->tess_accepted = tess_acceptable_word(word->best_choice,
                                                  word->raw_choice);

       make_reject_map (word, blob_choices, row, 2);
     }
   }

   // Stack-held choices are emptied here; heap-held ones are handed to the
   // best choice.
   if (blob_choices == &local_blob_choices)
     blob_choices->deep_clear();
   else
     word->best_choice->set_blob_choices(blob_choices);

   delete bln_word;
   assert (word->raw_choice != NULL);
 }
 }  // namespace tesseract


 /*************************************************************************
  * fix_rep_char()
  * The word is a repeated char. Find the most frequent (voted) character. Make
  * a reject string which rejects any char other than the voted char. Set the
  * word to done to stop rematching it.
  *
  *************************************************************************/
 namespace tesseract {
 // Votes for the most frequent non-space unichar in the word's best choice,
 // rebuilds the reject map so that every other position is flagged as a bad
 // repetition, and marks the word as done.
 void Tesseract::fix_rep_char(WERD_RES *word_res) {
   // One tally entry per distinct unichar seen in the word.
   struct REP_CH {
     UNICHAR_ID unichar_id;
     int count;
   };
   const WERD_CHOICE &choice = *(word_res->best_choice);
   const UNICHAR_ID space_id = unicharset.unichar_to_id(" ");

   // In the worst case every character is distinct, so size for all of them.
   REP_CH *tally = new REP_CH[choice.length()];
   int num_distinct = 0;
   for (int pos = 0; pos < choice.length(); ++pos) {
     const UNICHAR_ID id = choice.unichar_id(pos);
     int slot = 0;
     while (slot < num_distinct && tally[slot].unichar_id != id)
       ++slot;
     if (slot == num_distinct) {
       // First sighting of this unichar.
       tally[slot].unichar_id = id;
       tally[slot].count = 1;
       ++num_distinct;
     } else {
       ++tally[slot].count;
     }
   }

   // Pick the winner; spaces never win and ties go to the earliest entry.
   int best_count = 0;
   UNICHAR_ID voted_id = INVALID_UNICHAR_ID;
   for (int slot = 0; slot < num_distinct; ++slot) {
     if (tally[slot].unichar_id == space_id)
       continue;
     if (tally[slot].count > best_count) {
       best_count = tally[slot].count;
       voted_id = tally[slot].unichar_id;
     }
   }
   delete[] tally;

   // Reject every position that disagrees with the vote.
   word_res->reject_map.initialise(choice.length());
   for (int pos = 0; pos < choice.length(); ++pos) {
     if (choice.unichar_id(pos) != voted_id)
       word_res->reject_map[pos].setrej_bad_repetition();
   }
   word_res->done = TRUE;
 }

 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
 // training data.

 // Utility function for fix_quotes
 // Return true if the next character in the string (given the UTF8 length in
 // bytes) is a quote character.
 // Returns non-zero when the character starting at signed_str, which is
 // `length` bytes long, is a single quote: ASCII ' or `, or the UTF-8
 // encoding E2 80 98 / E2 80 99 of a curved single quote.
 static int is_simple_quote(const char* signed_str, int length) {
   // Compare as unsigned so that bytes >= 0x80 match the hex constants.
   const unsigned char* bytes =
     reinterpret_cast<const unsigned char*>(signed_str);
   if (length == 1) {
     // Standard 1 byte quotes.
     return bytes[0] == '\'' || bytes[0] == '`';
   }
   if (length == 3) {
     // 3 byte curved quotes share the E2 80 prefix.
     return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
         (bytes[2] == 0x98 || bytes[2] == 0x99);
   }
   return 0;
 }

 /**********************************************************************
  * fix_quotes
  *
  * Change pairs of quotes to double quotes.
  **********************************************************************/
 // Collapses each pair of adjacent single quotes in the choice into one
 // double quote, merging the two blobs in `word` and dropping one of the two
 // corresponding blob choice lists so all three sequences stay in step.
 //   choice       - word choice to edit in place
 //   word         - word whose blobs are merged
 //   blob_choices - per-blob choice lists, one entry per character
 void Tesseract::fix_quotes(WERD_CHOICE *choice,
                            WERD *word,
                            BLOB_CHOICE_LIST_CLIST *blob_choices) {
   // Don't create a double quote if it is disallowed.
   if (!unicharset.contains_unichar("\"") ||
       !unicharset.get_enabled(unicharset.unichar_to_id("\"")))
     return;

   PBLOB_IT blob_it = word->blob_list();
   BLOB_CHOICE_LIST_C_IT choices_it = blob_choices;
   BLOB_CHOICE_IT first_it;   // choices for the first quote
   BLOB_CHOICE_IT second_it;  // choices for the second quote
   int modified = false;

   for (int pos = 0; pos < choice->length()-1;
        ++pos, blob_it.forward(), choices_it.forward()) {
     const char *cur = unicharset.id_to_unichar(choice->unichar_id(pos));
     const char *next = unicharset.id_to_unichar(choice->unichar_id(pos+1));
     if (!is_simple_quote(cur, strlen(cur)) ||
         !is_simple_quote(next, strlen(next)))
       continue;

     // Replace the pair in the choice by a single double quote.
     choice->set_unichar_id(unicharset.unichar_to_id("\""), pos);
     choice->remove_unichar_id(pos+1);
     modified = true;

     // Fold the second blob's outlines into the first and drop the second.
     merge_blobs(blob_it.data(), blob_it.data_relative(1));
     blob_it.forward();
     delete blob_it.extract();

     // Drop one of the two choice lists, selected by comparing the
     // certainty of each list's current entry.
     first_it.set_to_list(choices_it.data());
     second_it.set_to_list(choices_it.data_relative(1));
     const bool drop_second =
         first_it.data()->certainty() < second_it.data()->certainty();
     if (drop_second) {
       choices_it.forward();
       delete choices_it.extract();
     } else {
       delete choices_it.extract();
       choices_it.forward();
     }
   }

   if (modified)
     choice->populate_unichars(unicharset);
 }


 /**********************************************************************
  * fix_hyphens
  *
  * Change pairs of hyphens to a single hyphen if the bounding boxes touch
  * Typically a long dash which has been segmented.
  **********************************************************************/
 // Collapses each pair of adjacent '-' / '~' characters whose blob bounding
 // boxes touch or overlap horizontally into a single '-', merging the blobs
 // and dropping one of the two corresponding blob choice lists.
 //   choice       - word choice to edit in place
 //   word         - word whose blobs are merged
 //   blob_choices - per-blob choice lists, one entry per character
 void Tesseract::fix_hyphens(WERD_CHOICE *choice,
                             WERD *word,
                             BLOB_CHOICE_LIST_CLIST *blob_choices) {
   // Don't create a hyphen if it is disallowed.
   if (!unicharset.contains_unichar("-") ||
       !unicharset.get_enabled(unicharset.unichar_to_id("-")))
     return;

   PBLOB_IT blob_it = word->blob_list();
   BLOB_CHOICE_LIST_C_IT choices_it = blob_choices;
   BLOB_CHOICE_IT first_it;   // choices for the first dash
   BLOB_CHOICE_IT second_it;  // choices for the second dash
   bool modified = false;

   for (int pos = 0; pos+1 < choice->length();
        ++pos, blob_it.forward (), choices_it.forward ()) {
     const char *cur = unicharset.id_to_unichar(choice->unichar_id(pos));
     const char *next = unicharset.id_to_unichar(choice->unichar_id(pos+1));

     // Both characters must be single-byte dashes and the first blob must
     // reach at least as far right as the second blob's left edge.
     if (strlen(cur) == 1 && strlen(next) == 1 &&
         (*cur == '-' || *cur == '~') &&
         (*next == '-' || *next == '~') &&
         (blob_it.data()->bounding_box().right() >=
          blob_it.data_relative(1)->bounding_box().left ())) {
       choice->set_unichar_id(unicharset.unichar_to_id("-"), pos);
       choice->remove_unichar_id(pos+1);
       modified = true;

       // Fold the second blob's outlines into the first and drop the second.
       merge_blobs(blob_it.data(), blob_it.data_relative(1));
       blob_it.forward();
       delete blob_it.extract();

       // Drop one of the two choice lists, selected by comparing the
       // certainty of each list's current entry.
       first_it.set_to_list(choices_it.data());
       second_it.set_to_list(choices_it.data_relative(1));
       const bool drop_second =
           first_it.data()->certainty() < second_it.data()->certainty();
       if (drop_second) {
         choices_it.forward();
         delete choices_it.extract();
       } else {
         delete choices_it.extract();
         choices_it.forward();
       }
     }
   }

   if (modified)
     choice->populate_unichars(unicharset);
 }
 }  // namespace tesseract


 /**********************************************************************
  * merge_blobs
  *
  * Add the outlines from blob2 to blob1. Blob2 is emptied but not deleted.
  **********************************************************************/

 // Appends blob2's outline list to the end of blob1's outline list.
 //   blob1 - destination blob, receives the outlines
 //   blob2 - source blob, whose outline list is handed over
 void merge_blobs(PBLOB *blob1, PBLOB *blob2) {
   OUTLINE_IT dest_it = blob1->out_list ();

   // Position on the last outline so the new ones are added after it.
   dest_it.move_to_last ();
   dest_it.add_list_after (blob2->out_list ());
 }


 /**********************************************************************
  * choice_dump_tester
  *
  * Matcher tester function which generates .chc file entries.
  * Called via test_segment_pass2 for every blob tested by tess in a word.
  * (But only for words for which a correct segmentation could be found.)
  **********************************************************************/
 /* DEADCODE
 void choice_dump_tester(                           //dump chars in word
                         PBLOB *,                   //blob
                         DENORM *,                  //de-normaliser
                         BOOL8 correct,             //ly segmented
                         char *text,                //correct text
                         inT32 count,               //chars in text
                         BLOB_CHOICE_LIST *ratings  //list of results
                        ) {
   STRING choice_file_name;
   BLOB_CHOICE *blob_choice;
   BLOB_CHOICE_IT it;
   char source_chars[20];
   char correct_char[3];

   if (choice_file == NULL) {
     choice_file_name = imagebasename + ".chc";
     if (!(choice_file = fopen (choice_file_name.string (), "w"))) {
       CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d",
         choice_file_name.string (), errno);
     }
   }

   if ((count == 0) || (text == NULL) || (text[0] == '\0')) {
     strcpy (source_chars, "$$");
     strcpy (correct_char, "$$");
   }
   else {
     strncpy(source_chars, text, count);
     source_chars[count] = '\0';
     if (correct) {
       correct_char[0] = text[0];
       correct_char[1] = '\0';
     }
     else {
       strcpy (correct_char, "$$");
     }
   }
   fprintf (choice_file, "%s\t%s", source_chars, correct_char);

   it.set_to_list (ratings);
   for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
     blob_choice = it.data ();
     fprintf (choice_file, "\t%s\t%f\t%f",
              blob_choice->unichar (),
              blob_choice->rating (), blob_choice->certainty ());
   }
   fprintf (choice_file, "\n");
 }
 */

 /*************************************************************************
  * make_bln_copy()
  *
  * Generate a baseline normalised copy of the source word. The copy is done so
  * that whatever format the original word is in, a polygonal bln version is
  * generated as output.
  *************************************************************************/

 // Returns a new polygonal, baseline-normalised copy of src_word.
 //   src_word - word to copy; copied with the row's own x-height
 //   row      - row the word belongs to
 //   block    - if not NULL, recorded in denorm via set_block
 //   x_height - x-height passed to the normalisation
 //   denorm   - passed to the normalisation and given the block
 WERD *make_bln_copy(WERD *src_word, ROW *row, BLOCK* block,
                     float x_height, DENORM *denorm) {
   WERD *bln_word = src_word->poly_copy(row->x_height());
   bln_word->baseline_normalise_x (row, x_height, denorm);

   if (block == NULL)
     return bln_word;

   denorm->set_block(block);
   return bln_word;
 }


 namespace tesseract {
 // Classifies the text of a word by its case and punctuation pattern.
 //   s       - the word text
 //   lengths - NUL terminated array parallel to the characters of s; entry i
 //             is added to the byte offset into s to step over character i,
 //             i.e. it is used as that character's length in bytes
 // Returns AC_UPPER_CASE, AC_INITIAL_CAP or AC_LOWER_CASE for a word-like
 // string, AC_UC_ABBREV or AC_LC_ABBREV for a run of "<letter>." pairs, and
 // AC_UNACCEPTABLE otherwise (including any string of more than 20
 // characters).
 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s,
                                                        const char *lengths) {
   int i = 0;                     // index of the current character
   int offset = 0;                // byte offset of the current character in s
   int leading_punct_count;       // 0 or 1 leading punctuation chars consumed
   int upper_count = 0;           // length of the initial upper case run
   int hyphen_pos = -1;           // character index of the hyphen, if any
   ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;

   // Words of more than 20 characters are never acceptable.
   if (strlen (lengths) > 20)
     return word_type;

   /* Single Leading punctuation char*/

   if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset])))
     offset += lengths[i++];
   leading_punct_count = i;

   /* Initial cap */
   while ((s[offset] != '\0') &&
          unicharset.get_isupper(s + offset, lengths[i])) {
     offset += lengths[i++];
     upper_count++;
   }
   // Two or more leading capitals: treat as an upper case word and go
   // straight to the trailing punctuation check.
   if (upper_count > 1)
     word_type = AC_UPPER_CASE;
   else {
     /* Lower case word, possibly with an initial cap */
     while ((s[offset] != '\0') &&
            unicharset.get_islower (s + offset, lengths[i])) {
       offset += lengths[i++];
     }
     // Too few letters consumed so far: fall through to the abbreviation
     // test with word_type still AC_UNACCEPTABLE.
     if (i - leading_punct_count < quality_min_initial_alphas_reqd)
       goto not_a_word;
     /*
     Allow a single hyphen in a lower case word
     - don't trust upper case - I've seen several cases of "H" -> "I-I"
     */
     if (lengths[i] == 1 && s[offset] == '-') {
       hyphen_pos = i;
       offset += lengths[i++];
       // A hyphen at the very end is accepted as is; otherwise at least two
       // lower case characters must follow it.
       if (s[offset] != '\0') {
         while ((s[offset] != '\0') &&
                unicharset.get_islower(s + offset, lengths[i])) {
           offset += lengths[i++];
         }
         if (i < hyphen_pos + 3)
           goto not_a_word;
       }
     }
     else {
       /* Allow "'s" in NON hyphenated lower case words */
       if (lengths[i] == 1 && (s[offset] == '\'') &&
           lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
         offset += lengths[i++];
         offset += lengths[i++];
       }
     }
     if (upper_count > 0)
       word_type = AC_INITIAL_CAP;
     else
       word_type = AC_LOWER_CASE;
   }

   /* Up to two different, constrained trailing punctuation chars */
   if (lengths[i] == 1 && (s[offset] != '\0') &&
       (STRING (chs_trailing_punct1).contains (s[offset])))
     offset += lengths[i++];
   // The second trailing char must differ from the character just before it.
   if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 &&
     (s[offset - lengths[i - 1]] != s[offset]) &&
       (STRING (chs_trailing_punct2).contains (s[offset])))
     offset += lengths[i++];

   // Anything left over means the string did not fit the word pattern.
   if (s[offset] != '\0')
     word_type = AC_UNACCEPTABLE;

   not_a_word:

   if (word_type == AC_UNACCEPTABLE) {
     /* Look for abbreviation string */
     // Restart from the beginning and consume "<letter>." pairs, all upper
     // case or all lower case according to the first character.
     i = 0;
     offset = 0;
     if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) {
       word_type = AC_UC_ABBREV;
       while ((s[offset] != '\0') &&
              unicharset.get_isupper(s + offset, lengths[i]) &&
              (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
         offset += lengths[i++];
         offset += lengths[i++];
       }
     }
     else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) {
       word_type = AC_LC_ABBREV;
       while ((s[offset] != '\0') &&
              unicharset.get_islower(s + offset, lengths[i]) &&
              (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
         offset += lengths[i++];
         offset += lengths[i++];
       }
     }
     // The pairs must account for the whole string.
     if (s[offset] != '\0')
       word_type = AC_UNACCEPTABLE;
   }

   return word_type;
 }

 }  // namespace tesseract

 /* DEBUGGING ROUTINE */

 // Debug aid: when a test point is set and lies inside the word's bounding
 // box, switches the rejection/x-height/cluster/NN debug flags on and prints
 // the word's current state; otherwise switches those flags off.
 //   word     - word being processed
 //   location - code for the call site, selecting the message printed; a
 //              negative value only reports whether the word is the test
 //              word, without printing or enabling the flags
 // Returns TRUE if the word contains the test point, FALSE otherwise (always
 // FALSE when SECURE_NAMES is defined or no test point is set).
 BOOL8 check_debug_pt(WERD_RES *word, int location) {
   BOOL8 show_map_detail = FALSE;
   inT16 i;

   #ifndef SECURE_NAMES
   if (!test_pt)
     return FALSE;

   // Default to quiet; the flags are re-enabled below for the test word only.
   tessedit_rejection_debug.set_value (FALSE);
   debug_x_ht_level.set_value (0);
   tessedit_cluster_debug.set_value (FALSE);
   nn_debug.set_value (FALSE);
   nn_reject_debug.set_value (FALSE);

   if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
     if (location < 0)
       return TRUE;               //For breakpoint use
     tessedit_rejection_debug.set_value (TRUE);
     debug_x_ht_level.set_value (20);
     tessedit_cluster_debug.set_value (TRUE);
     nn_debug.set_value (TRUE);
     nn_reject_debug.set_value (TRUE);
     tprintf ("\n\nTESTWD::");
     switch (location) {
       case 0:
         tprintf ("classify_word_pass1 start\n");
         word->word->print (debug_fp);
         break;
       case 10:
         tprintf ("make_reject_map: initial map");
         break;
       case 20:
         tprintf ("make_reject_map: after NN");
         break;
       case 30:
         tprintf ("classify_word_pass2 - START");
         break;
       case 40:
         tprintf ("classify_word_pass2 - Pre Xht");
         break;
       case 50:
         tprintf ("classify_word_pass2 - END");
         show_map_detail = TRUE;
         break;
       case 60:
         tprintf ("fixspace");
         break;
       case 70:
         tprintf ("MM pass START");
         break;
       case 80:
         tprintf ("MM pass END");
         break;
       case 90:
         tprintf ("After Poor quality rejection");
         break;
       case 100:
         tprintf ("unrej_good_quality_words - START");
         break;
       case 110:
         tprintf ("unrej_good_quality_words - END");
         break;
       case 120:
         tprintf ("Write results pass");
         show_map_detail = TRUE;
         break;
     }
     tprintf(" \"%s\" ",
             word->best_choice->unichar_string().string());
     word->reject_map.print (debug_fp);
     tprintf ("\n");
     if (show_map_detail) {
       tprintf ("\"%s\"\n", word->best_choice->unichar_string().string());
       // The string is walked byte by byte, but the reject map holds one
       // entry per character, so a word with multi-byte characters has more
       // bytes than map entries. Stop at the end of the map rather than
       // indexing past it.
       // NOTE(review): for such words the printed byte and the map entry are
       // not aligned; a per-character walk needs the character lengths.
       for (i = 0;
            i < word->reject_map.length () &&
            word->best_choice->unichar_string()[i] != '\0'; i++) {
         tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
         word->reject_map[i].full_print(debug_fp);
       }
     }

     tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
     tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
     return TRUE;
   }
   else
   #endif
     return FALSE;
 }


 /**********************************************************************
  * set_word_fonts
  *
  * Get the fonts for the word.
  **********************************************************************/
 namespace tesseract {
 // Works out the two most common fonts of a word and its italic/bold totals.
 // For each character, the blob choice matching the best choice's unichar
 // supplies a config, which is mapped through the class's font set to a font
 // info id. The ids are counted and the two modal ones are stored in
 // word->font1/font2. If a modal font was found, italic and bold are then
 // recomputed from the characters that voted for font1 only.
 // Does nothing if either the font info or the font set table is empty.
 //   word         - word whose font fields are set
 //   blob_choices - per-character choice lists, parallel to best_choice
 void Tesseract::set_word_fonts(
     WERD_RES *word,  // word to adapt to
     BLOB_CHOICE_LIST_CLIST *blob_choices  // detailed results
     ) {
   inT32 index;                   // char id index
   UNICHAR_ID choice_char_id;     // char id from word
   inT8 config;                   // font of char
                                  // character iterator
   BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
   BLOB_CHOICE_IT choice_it;      // choice iterator
   int fontinfo_size = get_fontinfo_table().size();
   int fontset_size = get_fontset_table().size();
   if (fontinfo_size == 0 || fontset_size == 0)
     return;
   STATS fonts(0, fontinfo_size);  // font counters

   // First pass: count the font of every character over the whole word.
   word->italic = 0;
   word->bold = 0;
   for (char_it.mark_cycle_pt(), index = 0;
        !char_it.cycled_list(); ++index, char_it.forward()) {
     choice_char_id = word->best_choice->unichar_id(index);
     choice_it.set_to_list(char_it.data());
     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
          choice_it.forward()) {
       if (choice_it.data()->unichar_id() == choice_char_id) {
         config = choice_it.data()->config();
         int class_id = choice_it.data()->unichar_id();
         int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
         if (font_set_id >= 0 && config >= 0 && font_set_id < fontset_size) {
           FontSet font_set = get_fontset_table().get(font_set_id);
           if (tessedit_debug_fonts) {
             tprintf("%s(%d=%d%c%c)", unicharset.id_to_unichar(choice_char_id),
                     config, (config & 31) >> 2,
                     config & 2 ? 'N' : 'B', config & 1 ? 'N' : 'I');
             const char* fontname;
             if (config >= font_set.size) {
               fontname = "Unknown";
             } else {
               fontname = get_fontinfo_table().get(
                 font_set.configs[config]).name;
             }
             tprintf("%s(%d,%d=%s)\n",
                     unicharset.id_to_unichar(choice_it.data()->unichar_id()),
                     font_set_id, config, fontname);
           }
           if (config < font_set.size) {
             int fontinfo_id = font_set.configs[config];
             if (fontinfo_id < fontinfo_size) {
               FontInfo fi = get_fontinfo_table().get(fontinfo_id);
               word->italic += fi.is_italic();
               word->bold += fi.is_bold();
               fonts.add(fontinfo_id, 1);
             }
           }
         }
         break;  // only the choice matching the best choice is considered
       }
     }
   }
   find_modal_font(&fonts, &word->font1, &word->font1_count);
   find_modal_font(&fonts, &word->font2, &word->font2_count);
   if (tessedit_debug_fonts)
     tprintf("\n");

   // Second pass: recount italic/bold using only the characters whose font
   // is the modal font.
   if (word->font1_count > 0) {
     word->italic = word->bold = 0;
     for (char_it.mark_cycle_pt(), index = 0;
          !char_it.cycled_list();  char_it.forward(), ++index) {
       choice_char_id = word->best_choice->unichar_id(index);
       choice_it.set_to_list(char_it.data());
       for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
            choice_it.forward()) {
         if (choice_it.data()->unichar_id() == choice_char_id) {
           config = choice_it.data()->config();
           int class_id = choice_it.data()->unichar_id();
           int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
           if (font_set_id >= 0 && config >= 0 && font_set_id < fontset_size) {
             FontSet font_set = get_fontset_table().get(font_set_id);
             // config indexes font_set.configs, so it needs the same range
             // check as in the first pass before it is used.
             if (config < font_set.size) {
               int fontinfo_id = font_set.configs[config];
               if (fontinfo_id == word->font1 && fontinfo_id < fontinfo_size) {
                 FontInfo fi = fontinfo_table_.get(fontinfo_id);
                 word->italic += fi.is_italic();
                 word->bold += fi.is_bold();
               }
             }
           }
           break;
         }
       }
     }
   }
 }


 /**********************************************************************
  * font_recognition_pass
  *
  * Smooth the fonts for the document.
  **********************************************************************/

 // Smooths the font attributes over the page in two sweeps.
 // Sweep 1 accumulates each word's italic/bold totals and font counts into
 // its row and into document-wide totals, then derives each row's two modal
 // fonts and the document's modal font.
 // Sweep 2 overrides the italic, bold and font1 of every word whose own
 // evidence is weak (not unanimous, and not at least 3/4 of a word longer
 // than 3 characters) with the document-level values.
 //   page_res_it - page iterator; restarted at the beginning of each sweep
 void Tesseract::font_recognition_pass(PAGE_RES_IT &page_res_it) {
   // One bucket per font; use 32 buckets when the font table is empty.
   int font_buckets = get_fontinfo_table().size();
   if (font_buckets == 0)
     font_buckets = 32;
   STATS fonts (0, font_buckets);      // font counters for the current row
   STATS doc_fonts (0, font_buckets);  // font counters for the document

   inT32 doc_italic = 0;          // total italics
   inT32 doc_bold = 0;            // total bolds
   inT8 doc_font;                 // modal font of the document
   inT8 doc_font_count;           // count for the modal font
   ROW_RES *row = NULL;           // current row
   WERD_RES *word;                // current word

   // Sweep 1: gather row and document statistics.
   for (page_res_it.restart_page (); page_res_it.word () != NULL;
        page_res_it.forward ()) {
     if (row != page_res_it.row ()) {
       // Row changed: close off the previous row, then reset for the new one.
       if (row != NULL) {
         find_modal_font (&fonts, &row->font1, &row->font1_count);
         find_modal_font (&fonts, &row->font2, &row->font2_count);
       }
       row = page_res_it.row ();
       fonts.clear ();
       row->italic = 0;
       row->bold = 0;
     }
     word = page_res_it.word ();

     row->italic += word->italic;
     row->bold += word->bold;
     fonts.add (word->font1, word->font1_count);
     fonts.add (word->font2, word->font2_count);

     doc_italic += word->italic;
     doc_bold += word->bold;
     doc_fonts.add (word->font1, word->font1_count);
     doc_fonts.add (word->font2, word->font2_count);
   }
   // Close off the last row, which the loop above never reaches.
   if (row != NULL) {
     find_modal_font (&fonts, &row->font1, &row->font1_count);
     find_modal_font (&fonts, &row->font2, &row->font2_count);
   }
   find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
   /*
     row=NULL;
     page_res_it.restart_page();
     while (page_res_it.word() != NULL)
     {
       if (row!=page_res_it.row())
       {
         row2=row;
         row=page_res_it.row();
         if (row->font1_count<MIN_FONT_ROW_COUNT)
         {
           fonts.clear();
           italic=0;
           bold=0;
           add_in_one_row(row,&fonts,&italic,&bold);
           if (row2!=NULL)
           {
             hdiff=row->row->x_height()-row2->row->x_height();
             if (hdiff<0)
               hdiff=-hdiff;
             if (hdiff<MAX_XHEIGHT_DIFF)
               add_in_one_row(row2,&fonts,&italic,&bold);
           }
           do
             page_res_it.forward();
           while (page_res_it.row()==row);
           row2=page_res_it.row();
           if (row2!=NULL)
           {
             hdiff=row->row->x_height()-row2->row->x_height();
             if (hdiff<0)
               hdiff=-hdiff;
             if (hdiff<MAX_XHEIGHT_DIFF)
               add_in_one_row(row2,&fonts,&italic,&bold);
           }
           row->italic=italic;
           row->bold=bold;
           find_modal_font(&fonts,&row->font1,&row->font1_count);
           find_modal_font(&fonts,&row->font2,&row->font2_count);
         }
         else
           page_res_it.forward();
       }
       else
         page_res_it.forward();
     }*/

   // Sweep 2: replace weak per-word evidence with the document values.
   for (page_res_it.restart_page (); page_res_it.word () != NULL;
        page_res_it.forward ()) {
     row = page_res_it.row ();
     word = page_res_it.word ();
     const inT32 length = word->best_choice->length();

     // Magnitude of the italic total.
     inT32 votes = word->italic;
     if (votes < 0)
       votes = -votes;
     if (votes != length && (length <= 3 || votes < length * 3 / 4))
       word->italic = doc_italic > 0 ? 1 : -1;

     // Magnitude of the bold total.
     votes = word->bold;
     if (votes < 0)
       votes = -votes;
     if (votes != length && (length <= 3 || votes < length * 3 / 4))
       word->bold = doc_bold > 0 ? 1 : -1;

     // The font count is used as is, without taking a magnitude.
     votes = word->font1_count;
     if (votes != length && (length <= 3 || votes < length * 3 / 4)) {
       word->font1 = doc_font;
       word->font1_count = doc_font_count;
     }
   }
 }
 }  // namespace tesseract


 /**********************************************************************
  * add_in_one_row
  *
  * Add into the stats for one row.
  **********************************************************************/

 // Adds the font evidence of every word in a row to running totals.
 //   row    - row whose word results are visited
 //   fonts  - receives each word's font1/font2 with their counts; a font
 //            with a zero or negative count is not added
 //   italic - running italic total, incremented by each word's italic
 //   bold   - running bold total, incremented by each word's bold
 void add_in_one_row(ROW_RES *row, STATS *fonts, inT8 *italic, inT8 *bold) {
   WERD_RES_IT word_it = &row->word_res_list;

   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
     WERD_RES *current = word_it.data ();

     *italic += current->italic;
     *bold += current->bold;

     if (current->font1_count > 0)
       fonts->add (current->font1, current->font1_count);
     if (current->font2_count > 0)
       fonts->add (current->font2, current->font2_count);
   }
 }


 /**********************************************************************
  * find_modal_font
  *
  * Find the modal font and remove from the stats.
  **********************************************************************/

 // Finds the modal (most frequent) font in the stats, reports it and its
 // count, and removes that font's pile from the stats so that a further call
 // yields the next most frequent font.
 //   fonts      - font stats; modified by removing the modal pile
 //   font_out   - receives the modal font, or -1 if the stats are empty
 //   font_count - receives the pile count clamped to MAX_INT8, or 0 if the
 //                stats are empty
 void find_modal_font(                  //good chars in word
                      STATS *fonts,     //font stats
                      inT8 *font_out,   //output font
                      inT8 *font_count  //output count
                     ) {
   if (fonts->get_total () > 0) {
     // Keep the mode at full width for the stats calls; narrowing it first
     // would make pile_count/add address a different pile whenever the font
     // index does not fit in an inT8.
     inT32 font = (inT32) fonts->mode ();
     inT32 count = fonts->pile_count (font);  // pile count

     *font_out = (inT8) font;
     *font_count = count < MAX_INT8 ? count : MAX_INT8;
     // Remove the whole pile, not just the clamped amount, so a pile larger
     // than MAX_INT8 cannot come back as the next modal font.
     fonts->add (font, -count);
   }
   else {
     *font_out = -1;
     *font_count = 0;
   }
 }