// ccmain/output.cpp - platform/external/tesseract - Git at Google

 /******************************************************************
  * File:        output.cpp  (Formerly output.c)
  * Description: Output pass
  * Author:					Phil Cheatle
  * Created:					Thu Aug  4 10:56:08 BST 1994
  *
  * (C) Copyright 1994, Hewlett-Packard Ltd.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #include "mfcpch.h"
 #include          "ocrshell.h"
 #include          <string.h>
 #include          <ctype.h>
 #ifdef __UNIX__
 #include          <assert.h>
 #include          <unistd.h>
 #include                    <errno.h>
 #endif
 #include          "mainblk.h"
 #include          "tfacep.h"
 #include          "tessvars.h"
 #include          "control.h"
 #include          "secname.h"
 #include          "reject.h"
 #include          "docqual.h"
 #include          "output.h"
 #include "bestfirst.h"
 #include "globals.h"
 #include "tesseractclass.h"

 // EXTERN is defined empty here, so the EXTERN-prefixed variable declarations
 // below expand without any extra prefix in this file.
 #define EXTERN

 // NOTE(review): presumably the extension of the epaper output file; it is not
 // referenced in the code visible here.
 #define EPAPER_EXT      ".ep"
 // NOTE(review): presumably a page height in pixels -- confirm; it is not
 // referenced in the code visible here.
 #define PAGE_YSIZE      3508
 // Control characters. CTRL_INSET starts the reject escape sequence built in
 // write_results() and make_reject(); CTRL_NEWLINE and CTRL_HARDLINE are the
 // values returned by determine_newline_type().
 #define CTRL_INSET      '\024'   //dc4=text inset
 #define CTRL_FONT       '\016'   //so=font change
 #define CTRL_DEFAULT      '\017' //si=default font
 #define CTRL_SHIFT      '\022'   //dc2=x shift
 #define CTRL_TAB        '\011'   //tab
 #define CTRL_NEWLINE      '\012' //newline
 #define CTRL_HARDLINE   '\015'   //cr

 // Output and rejection tunables. Each entry lists the variable name, its
 // default value and a description string; the *_VAR / *_EVAR macros that
 // declare them come from the project headers.
 EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
 "Write block separators in output");
 EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
 "Write raw stuff to name.raw");
 EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
 EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
 "Return ratings in IPEOCRAPI data");
 EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
 "Write .txt to .etx map file");
 EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
 "Write repetition char code");
 EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
 EXTERN STRING_EVAR (unrecognised_char, "|",
 "Output char for unidentified blobs");
 EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
 EXTERN INT_VAR (suspect_space_level, 100,
 "Min suspect level for rejecting spaces");
 EXTERN INT_VAR (suspect_short_words, 2,
 "Dont Suspect dict wds longer than this");
 EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
 "UNLV keep 1Il chars rejected");
 EXTERN double_VAR (suspect_rating_per_ch, 999.9,
 "Dont touch bad rating limit");
 EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");

 // Switches that relax or disable rejection, plus word/rep-char output options.
 EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
 "Only reject tess failures");
 EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
 EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
 "Make output have exactly one word per WERD");
 EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
 "Dont reject ANYTHING AT ALL");
 EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
 "Force all rep chars the same");

 FILE *txt_mapfile = NULL;        //.map stream; opened and closed in output_pass
 FILE *unlv_file = NULL;          //presumably the .unlv stream; unused in the code visible here

 /**********************************************************************
  * pixels_to_pts
  *
  * Convert an integer number of pixels to the nearest integer
  * number of points.
  **********************************************************************/

 // pixels:  length in pixels.
 // pix_res: resolution in pixels per inch (72 points per inch is assumed).
 // Returns the length in points; the scaled value is held in a float, then
 // 0.5 is added and the result is truncated to an integer.
 inT32 pixels_to_pts(inT32 pixels, inT32 pix_res) {
   const float points = static_cast<float>(pixels * 72.0 / pix_res);
   return static_cast<inT32>(points + 0.5);
 }

 namespace tesseract {
 /* Emit every word of the page through write_results().
  *
  * page_res_it      iterator over the page results; restarted before use.
  * write_to_shm     forwarded to write_results(); when set, ocr_send_text(FALSE)
  *                  is also called once all words have been processed.
  * target_word_box  optional; when non-NULL only words whose bounding-box
  *                  centre lies inside it are output.
  *
  * The .map stream (txt_mapfile) is opened here when tessedit_write_txt_map is
  * set and is closed again before returning. Block separators are written
  * only to streams that are actually open.
  */
 void Tesseract::output_pass(  //Tess output pass //send to api
                             PAGE_RES_IT &page_res_it,
                             BOOL8 write_to_shm,
                             TBOX *target_word_box) {
   BLOCK_RES *block_of_last_word = NULL;
   inT16 block_id;
   BOOL8 force_eol;               //During output
   BLOCK *nextblock;              //block of next word
   WERD *nextword;                //next word

   if (tessedit_write_txt_map)
     txt_mapfile = open_outfile (".map");

   page_res_it.restart_page ();
   while (page_res_it.word () != NULL) {
     check_debug_pt (page_res_it.word (), 120);

     if (target_word_box) {
       // Skip words whose centre point falls outside the requested box.
       TBOX current_word_box = page_res_it.word ()->word->bounding_box ();
       FCOORD center_pt ((current_word_box.right () + current_word_box.left ()) / 2,
                         (current_word_box.bottom () + current_word_box.top ()) / 2);
       if (!target_word_box->contains (center_pt)) {
         page_res_it.forward ();
         continue;
       }
     }
     if (tessedit_write_block_separators &&
         block_of_last_word != page_res_it.block ()) {
       // First word of a new block: write the block header.
       block_of_last_word = page_res_it.block ();
       block_id = block_of_last_word->block->index ();
       if (!wordrec_no_block && textfile != NULL)
         fprintf (textfile, "|^~tr%d\n", block_id);
       // The map stream only exists when tessedit_write_txt_map is set, so
       // it must be checked before writing the separator to it.
       if (txt_mapfile != NULL)
         fprintf (txt_mapfile, "|^~tr%d\n", block_id);
     }

     // Force a line end at a block change (when separators are written) and
     // after the very last word.
     force_eol = (tessedit_write_block_separators &&
       (page_res_it.block () != page_res_it.next_block ())) ||
       (page_res_it.next_word () == NULL);

     if (page_res_it.next_word () != NULL)
       nextword = page_res_it.next_word ()->word;
     else
       nextword = NULL;
     if (page_res_it.next_block () != NULL)
       nextblock = page_res_it.next_block ()->block;
     else
       nextblock = NULL;
                                  //regardless of tilde crunching
     write_results (page_res_it,
                    determine_newline_type (page_res_it.word ()->word,
                                            page_res_it.block ()->block,
                                            nextword, nextblock),
                    force_eol, write_to_shm);
     page_res_it.forward ();
   }
   if (write_to_shm)
     ocr_send_text(FALSE);
   if (tessedit_write_block_separators) {
     // Closing separator, again only to streams that are open.
     if (!wordrec_no_block && textfile != NULL)
       fprintf (textfile, "|^~tr\n");
     if (txt_mapfile != NULL)
       fprintf (txt_mapfile, "|^~tr\n");
   }
   if (tessedit_write_txt_map && txt_mapfile != NULL) {
     fprintf (txt_mapfile, "\n"); //because txt gets one
     #ifdef __UNIX__
     // fsync() works on the descriptor, so push the stdio buffer out first.
     fflush (txt_mapfile);
     fsync (fileno (txt_mapfile));
     #endif
     fclose(txt_mapfile);
     txt_mapfile = NULL;          //do not leave a closed stream behind
   }
 }


 /*************************************************************************
  * write_results()
  *
  * All recognition and rejection has now been done. Generate the following:
  *   .txt file     - giving the final best choices with NO highlighting
  *   .raw file     - giving the tesseract top choice output for each word
  *   .map file     - showing how the .txt file has been rejected in the .ep file
  *   epchoice list - a list of one element per word, containing the text for the
  *                   epaper. Reject strings are inserted.
  *   inset list    - a list of bounding boxes of reject insets - indexed by the
  *                   reject strings in the epchoice text.
  *************************************************************************/

 /* Output one word.
  *
  * page_res_it   iterator positioned on the word to output.
  * newline_type  0 when the word does not end a line, otherwise the newline
  *               control character to emit after it.
  * force_eol     force a line end after this word even when it is crunched.
  * write_to_shm  when set, the word is also passed to write_shm_text().
  *
  * The function keeps static state between calls (whether a reject marker
  * or a newline was the last thing written, and whether the current block is
  * still empty), so words must be passed in reading order.
  */
 void Tesseract::write_results(                        //output a word
                                                       //full info
                               PAGE_RES_IT &page_res_it,
                               char newline_type,      //type of newline
                                                       //override tilde crunch?
                               BOOL8 force_eol,
                               BOOL8 write_to_shm      //send to api
                   ) {
                                  //word to do
   WERD_RES *word = page_res_it.word ();
 //   WERD_CHOICE *ep_choice;        //ep format
   STRING repetition_code;
   STRING no_text;                //empty fallback target for wordstr
                                  //never left pointing at garbage
   const STRING *wordstr = &no_text;
   STRING wordstr_lengths;
   int i;
   char unrecognised = STRING (unrecognised_char)[0];
   char ep_chars[32];             //Only for unlv_tilde_crunch
   int ep_chars_index = 0;
   char txt_chs[32];              //Only for unlv_tilde_crunch
   char map_chs[32];              //Only for unlv_tilde_crunch
   int txt_index = 0;
   static BOOL8 tilde_crunch_written = FALSE;
   static BOOL8 last_char_was_newline = TRUE;
   static BOOL8 last_char_was_tilde = FALSE;
   static BOOL8 empty_block = TRUE;
   BOOL8 need_reject = FALSE;
   PBLOB_IT blob_it;              //blobs
   UNICHAR_ID space = unicharset.unichar_to_id(" ");

   /* CRUNCHED WORDS: write at most a separating space, one reject marker and
      a newline, then return without touching the word itself. */
   if (word->unlv_crunch_mode != CR_NONE
   && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
     if ((word->unlv_crunch_mode != CR_DELETE) &&
       (!tilde_crunch_written ||
       ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
       (word->word->space () > 0) &&
       !word->word->flag (W_FUZZY_NON) &&
     !word->word->flag (W_FUZZY_SP)))) {
       if (!word->word->flag (W_BOL) &&
         (word->word->space () > 0) &&
         !word->word->flag (W_FUZZY_NON) &&
       !word->word->flag (W_FUZZY_SP)) {
         /* Write a space to separate from preceeding good text */
         txt_chs[txt_index] = ' ';
         map_chs[txt_index++] = '1';
         ep_chars[ep_chars_index++] = ' ';
         last_char_was_tilde = FALSE;
       }
       need_reject = TRUE;
     }
     if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {
       /* Write a reject char - mark as rejected unless zero_rejection mode */
       last_char_was_tilde = TRUE;
       txt_chs[txt_index] = unrecognised;
       if (tessedit_zero_rejection || (suspect_level == 0)) {
         map_chs[txt_index++] = '1';
         ep_chars[ep_chars_index++] = unrecognised;
       }
       else {
         map_chs[txt_index++] = '0';
         /*
            The ep_choice string is a faked reject to allow newdiff to sync the
            .etx with the .txt and .map files.
          */
         ep_chars[ep_chars_index++] = CTRL_INSET;
         //escape code
                                  //dummy reject
         ep_chars[ep_chars_index++] = 1;
                                  //dummy reject
         ep_chars[ep_chars_index++] = 1;
                                  //type
         ep_chars[ep_chars_index++] = 2;
                                  //dummy reject
         ep_chars[ep_chars_index++] = 1;
                                  //dummy reject
         ep_chars[ep_chars_index++] = 1;
       }
       tilde_crunch_written = TRUE;
       last_char_was_newline = FALSE;
       empty_block = FALSE;
     }

     if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {
       /* Add a new line output */
       txt_chs[txt_index] = '\n';
       map_chs[txt_index++] = '\n';
                                  //end line
       ep_chars[ep_chars_index++] = newline_type;

                                  //Cos of the real newline
       tilde_crunch_written = FALSE;
       last_char_was_newline = TRUE;
       last_char_was_tilde = FALSE;
     }
     txt_chs[txt_index] = '\0';
     map_chs[txt_index] = '\0';
                                  //xiaofan
     if (tessedit_write_output && !wordrec_no_block)
       fprintf (textfile, "%s", txt_chs);

     if (tessedit_write_txt_map)
       fprintf (txt_mapfile, "%s", map_chs);

                                  //terminate string
     ep_chars[ep_chars_index] = '\0';
     word->ep_choice = new WERD_CHOICE(ep_chars, unicharset);

     if (force_eol)
       empty_block = TRUE;
     return;
   }

   /* NORMAL PROCESSING of non tilde crunched words */

   tilde_crunch_written = FALSE;
   if (newline_type)
     last_char_was_newline = TRUE;
   else
     last_char_was_newline = FALSE;
   empty_block = force_eol;       //About to write a real word

   if (unlv_tilde_crunching &&
       last_char_was_tilde &&
       (word->word->space() == 0) &&
       !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
       (word->best_choice->unichar_id(0) == space)) {
     /* Prevent adjacent tilde across words - we know that adjacent tildes within
        words have been removed */
     word->best_choice->remove_unichar_id(0);
     word->best_choice->populate_unichars(getDict().getUnicharset());
     word->reject_map.remove_pos (0);
     blob_it = word->outword->blob_list ();
     delete blob_it.extract ();   //get rid of reject blob
   }
   if (newline_type ||
     (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
     last_char_was_tilde = FALSE;
   else {
     if (word->reject_map.length () > 0) {
       if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
         last_char_was_tilde = TRUE;
       else
         last_char_was_tilde = FALSE;
     }
     else if (word->word->space () > 0)
       last_char_was_tilde = FALSE;
     /* else it is unchanged as there are no output chars */
   }

   ASSERT_HOST (word->best_choice->length() == word->reject_map.length());

   if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
     ensure_rep_chars_are_consistent(word);

   set_unlv_suspects(word);
   check_debug_pt (word, 120);
   if (tessedit_rejection_debug) {
     tprintf ("Dict word: \"%s\": %d\n",
              word->best_choice->debug_string(unicharset).string(),
              dict_word(*(word->best_choice)));
   }

 #if 0
   if (tessedit_write_unlv) {
     write_unlv_text(word);
   }
 #endif

   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
     /* Repetition word: "|^~R" (four one-byte chars) plus the repeated char */
     repetition_code = "|^~R";
     wordstr_lengths = "\001\001\001\001";
     repetition_code += unicharset.id_to_unichar(get_rep_char (word));
     wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
     wordstr = &repetition_code;
   }
   else {
     /* NOTE(review): this path fills in neither wordstr nor wordstr_lengths,
        so write_shm_text() below is handed two empty strings. wordstr now
        points at no_text instead of being left uninitialised. */
     if (tessedit_zero_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
       for (i = 0; i < word->best_choice->length(); ++i) {
         if (word->reject_map[i].rejected())
           word->reject_map[i].setrej_minimal_rej_accept();
       }
     }
     if (tessedit_minimal_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
       for (i = 0; i < word->best_choice->length(); ++i) {
         if ((word->best_choice->unichar_id(i) != space) &&
             word->reject_map[i].rejected())
           word->reject_map[i].setrej_minimal_rej_accept();
       }
     }
   }

   if (write_to_shm)
     write_shm_text (word, page_res_it.block ()->block,
       page_res_it.row (), *wordstr, wordstr_lengths);

 #if 0
   if (tessedit_write_output)
     write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);

   if (tessedit_write_raw_output)
     write_cooked_text (word->word, word->raw_choice->string (),
       TRUE, FALSE, rawfile);

   if (tessedit_write_txt_map)
     write_map(txt_mapfile, word);

   ep_choice = make_epaper_choice (word, newline_type);
   word->ep_choice = ep_choice;
 #endif

   character_count += word->best_choice->length();
   word_count++;
 }
 }  // namespace tesseract

 /**********************************************************************
  * make_epaper_choice
  *
  * Construct the epaper text string for a word, using the reject map to
  * determine whether each blob should be rejected.
  *
  * The function is currently compiled out by the surrounding #if 0.
  *
  * word:         word whose best_choice, reject_map and outword blobs are read.
  * newline_type: appended to the string when non-zero.
  * Returns a newly allocated WERD_CHOICE holding the string; the caller
  * receives the raw pointer.
  **********************************************************************/

 #if 0
 WERD_CHOICE *make_epaper_choice(                   //convert one word
                                 WERD_RES *word,    //word to do
                                 char newline_type  //type of newline
                                ) {
   inT16 index = 0;               //to string
   inT16 blobindex;               //to word
   inT16 prevright = 0;           //right of previous blob
   inT16 nextleft;                //left of next blob
   PBLOB *blob;
   TBOX inset_box;                 //bounding box
   PBLOB_IT blob_it;              //blob iterator
   // NOTE(review): nothing below checks index against MAX_PATH.
   char word_string[MAX_PATH];    //converted string
   BOOL8 force_total_reject;
   char unrecognised = STRING (unrecognised_char)[0];

   blob_it.set_to_list (word->outword->blob_list ());

   // One reject-map entry is expected per character of the best choice.
   ASSERT_HOST (word->reject_map.length () ==
     word->best_choice->string ().length ());
   /*
   tprintf( "\"%s\" -> length: %d;  blobcount: %d (%d)\n",
       word->best_choice->string().string(),
         word->best_choice->string().length(),
       blob_it.length(),
         blob_count( word->outword ) );
   */

   // An empty best choice turns every blob of the word into one reject;
   // otherwise there must be exactly one blob per character.
   if (word->best_choice->string ().length () == 0)
     force_total_reject = TRUE;
   else {
     force_total_reject = FALSE;
     ASSERT_HOST (blob_it.length () ==
       word->best_choice->string ().length ());
   }
   if (!blob_it.empty ()) {
     for (index = 0; index < word->word->space (); index++)
       word_string[index] = ' ';  //leading blanks
   }
   /* Why does this generate leading blanks regardless of whether the
   word_choice string is empty, when write_cooked_text ony generates leading
   blanks when the string is NOT empty???. */

   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
     // Repetition word: "|^~R" followed by the repeated character.
     strcpy (word_string + index, "|^~R");
     index += 4;
     strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
     index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
   }
   else {
     if (!blob_it.empty ())
       prevright = blob_it.data ()->bounding_box ().left ();
     //actually first left
     for (blobindex = 0, blob_it.mark_cycle_pt ();
     !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
       blob = blob_it.data ();
       if (word->reject_map[blobindex].accepted ()) {
         // Accepted blob: copy its character; an accepted space is shown as
         // the unrecognised character.
         if (word->best_choice->string ()[blobindex] == ' ')
                                  //but not rejected!!
           word_string[index++] = unrecognised;
         else
           word_string[index++] =
             word->best_choice->string ()[blobindex];
       }
       else {                     // start reject
         inset_box = blob->bounding_box ();
         /* Extend reject box to include rejected neighbours */
         while (!blob_it.at_last () &&
           (force_total_reject ||
         (word->reject_map[blobindex + 1].rejected ()))) {
           blobindex++;
           blob = blob_it.forward ();
                                  //get total box
           inset_box += blob->bounding_box ();
         }
         // The next blob's left edge, or the inset's own right edge when the
         // reject run reaches the end of the word.
         if (blob_it.at_last ())
           nextleft = inset_box.right ();
         else
           nextleft = blob_it.data_relative (1)->bounding_box ().left ();

         //       tprintf("Making reject from (%d,%d)->(%d,%d)\n",
         //          inset_box.left(),inset_box.bottom(),
         //          inset_box.right(),inset_box.top());

         // make_reject writes the escape sequence and returns its length.
         index += make_reject (&inset_box, prevright, nextleft,
           &word->denorm, &word_string[index]);
       }
       prevright = blob->bounding_box ().right ();
     }
   }
   if (newline_type)
                                  //end line
     word_string[index++] = newline_type;
   word_string[index] = '\0';     //terminate string
   // A length mismatch means a zero byte was written inside the string.
   if (strlen (word_string) != index) {
     tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
       word_string, index, strlen (word_string));
   }
                                  //don't pass any zeros
   ASSERT_HOST (strlen (word_string) == index);
   return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
 }
 #endif

 /**********************************************************************
  * make_reject
  *
  * Add the escape code to the string for the reject.
  **********************************************************************/

 // Writes a six-byte reject escape sequence into word_string:
 //   CTRL_INSET, min chars + 1, max chars + 1, 2, and two index bytes.
 // Every byte is non-zero so the sequence can live inside a C string.
 // Returns the number of bytes written. No terminator is added.
 inT16
 make_reject (                    //make reject code
 TBOX * inset_box,                 //bounding box
 inT16 prevright,                 //previous char
 inT16 nextleft,                  //next char
 DENORM * denorm,                 //de-normalizer
 char word_string[]               //output string
 ) {
   const inT16 inset_index = 0;   //number of inset, always zero here
   inT16 out = 0;                 //write position in word_string

                                  //centre of box
   const float mid_x = (inset_box->left () + inset_box->right ()) / 2.0;

   // NOTE(review): the two gaps, the height and the baseline shift are worked
   // out from the denormalised box but never encoded in the output below.
   const inT16 gap_before =       //shift from prev char
     (inT16) (denorm->x (inset_box->left ()) - denorm->x (prevright));
   const inT16 gap_after =        //shift to next char
     (inT16) (denorm->x (nextleft) - denorm->x (inset_box->right ()));
   const inT16 x_start = (inT16) floor (denorm->x (inset_box->left ()));
   const inT16 box_width = (inT16) ceil (denorm->x (inset_box->right ())) - x_start;
   const inT16 y_start = (inT16) floor (denorm->y (inset_box->bottom (), mid_x));
   const inT16 box_height = (inT16) ceil (denorm->y (inset_box->top (), mid_x)) - y_start;
   const inT16 baseline_shift =
     y_start - (inT16) denorm->y (bln_baseline_offset, mid_x);

                                  //escape code
   word_string[out++] = CTRL_INSET;

   // Estimate how few and how many characters could fit in the inset width,
   // judged against the row x-height.
   inT16 fewest = (inT16) ceil (0.27 * box_width / denorm->row ()->x_height ());
   inT16 most = (inT16) floor (1.8 * box_width / denorm->row ()->x_height ());

   // Keep both estimates within 0..254 so that adding 1 never yields a zero
   // byte and never exceeds what fits in a byte.
   if (fewest < 0)
     fewest = 0;
   else if (fewest > 254)
     fewest = 254;
   if (most < fewest)
     most = fewest;
   if (most > 254)
     most = 254;

   word_string[out++] = fewest + 1;               //min chars
   word_string[out++] = most + 1;                 //max chars
   word_string[out++] = 2;                        //type?
   word_string[out++] = inset_index / 255 + 1;    //store index, high part
   word_string[out++] = inset_index % 255 + 1;    //store index, low part
   return out;                                    //size of string
 }


 /**********************************************************************
  * determine_newline_type
  *
  * Find whether we have a wrapping or hard newline.
  * Return FALSE if not at end of line.
  **********************************************************************/

 // Returns FALSE when word does not carry W_EOL, otherwise CTRL_NEWLINE or
 // CTRL_HARDLINE depending on the checks below.
 char determine_newline_type(                   //test line ends
                             WERD *word,        //word to do
                             BLOCK *block,      //current block
                             WERD *next_word,   //next word
                             BLOCK *next_block  //block of next word
                            ) {
   // A word that is not flagged as ending a line gets no newline at all.
   if (!word->flag (W_EOL))
     return FALSE;

   // Last word of the page or of its block: plain newline.
   const bool ends_block =
     next_word == NULL || next_block == NULL || block != next_block;
   if (ends_block)
     return CTRL_NEWLINE;

   // The next line starts with leading space: treat as a hard line end.
   if (next_word->space () > 0)
     return CTRL_HARDLINE;        //it is tabbed

   TBOX this_box = word->bounding_box ();
   TBOX following_box = next_word->bounding_box ();
   TBOX enclosing_box = block->bounding_box ();

   // Room left between this word and the right edge of the block, less the
   // block's space value.
   inT16 room_left = enclosing_box.right () - this_box.right ();
   room_left -= (inT32) block->space ();

   inT16 next_width = following_box.right () - following_box.left ();

   // If the next word would have fitted in the remaining room, the break
   // counts as hard; otherwise it is an ordinary newline.
   if (room_left > next_width)
     return CTRL_HARDLINE;
   return CTRL_NEWLINE;
 }

 /**********************************************************************
  * write_shm_text
  *
  * Write the cooked text to the shared memory for the api.
  **********************************************************************/

 /* Send one word to the api, character by character, via ocr_append_char().
  *
  * word          word whose blobs, reject map and font data are used.
  * block         not used in this function.
  * row           row of the word; supplies the point size and fallback
  *               font/bold/italic data.
  * text          bytes of the word.
  * text_lengths  one entry per character: the number of bytes that character
  *               occupies in text.
  *
  * Every byte of a character is sent with that character's blob box. When
  * text_lengths is empty, a single unrecognised character covering the whole
  * word is sent, but only if tessedit_word_for_word is set.
  */
 void write_shm_text(                    //write output
                     WERD_RES *word,     //word to do
                     BLOCK *block,       //block it is from
                     ROW_RES *row,       //row it is from
                     const STRING &text, //text to write
                     const STRING &text_lengths
                    ) {
   inT32 index;                   //char counter
   inT32 index2;                  //char counter
   inT32 length;                  //chars in word
   inT32 ptsize;                  //font size
   inT8 blanks;                   //blanks in word
   uinT8 enhancement;             //bold etc
   uinT8 font;                    //font index
   char unrecognised = STRING (unrecognised_char)[0];
   PBLOB *blob;
   TBOX blob_box;                  //bounding box
   PBLOB_IT blob_it;              //blob iterator
   WERD copy_outword;             // copy to denorm
   uinT32 rating;                 //of char
   BOOL8 lineend;                 //end of line
   int offset;                    //byte offset of char index in text
   int offset2;                   //byte offset of char index2 in text

                                  //point size
   ptsize = pixels_to_pts ((inT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
   if (word->word->flag (W_BOL) && ocr_char_space () < 128
     && ocr_send_text (TRUE) != OKAY)
     return;                      //release failed
   // Work on a denormalised copy so the blob boxes are in image coordinates.
   copy_outword = *(word->outword);
   copy_outword.baseline_denormalise (&word->denorm);
   blob_it.set_to_list (copy_outword.blob_list ());
   length = text_lengths.length ();

   if (length > 0) {
     blanks = word->word->space ();
     if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
       blanks = 1;
     // index counts characters and offset counts bytes; they advance together.
     for (index = 0, offset = 0; index < length;
          offset += text_lengths[index++], blob_it.forward ()) {
       blob = blob_it.data ();
       blob_box = blob->bounding_box ();

       enhancement = 0;
       if (word->italic > 0 || (word->italic == 0 && row->italic > 0))
         enhancement |= EUC_ITALIC;
       if (word->bold > 0 || (word->bold == 0 && row->bold > 0))
         enhancement |= EUC_BOLD;
       if (tessedit_write_ratings)
         rating = (uinT32) (-word->best_choice->certainty () / 0.035);
       else if (tessedit_zero_rejection)
         rating = text[offset] == ' ' ? 100 : 0;
       else
         rating = word->reject_map[index].accepted ()? 0 : 100;
       if (rating > 255)
         rating = 255;
       if (word->font1_count > 2)
         font = word->font1;
       else if (row->font1_count > 8)
         font = row->font1;
       else
                                  //font index
         font = word->word->flag (W_DONT_CHOP) ? 0 : 1;

       lineend = word->word->flag (W_EOL) && index == length - 1;
       // In zero-rejection mode trailing spaces are not sent, so the line end
       // must be attached to the last character that is followed only by
       // spaces. The next character is looked up by byte offset, not by
       // character index, so multi-byte characters are handled correctly.
       if (word->word->flag (W_EOL) && tessedit_zero_rejection
       && index < length - 1 && text[offset + text_lengths[index]] == ' ') {
         for (index2 = index + 1, offset2 = offset + text_lengths[index];
              index2 < length && text[offset2] == ' ';
              offset2 += text_lengths[index2++]);
         if (index2 == length)
           lineend = TRUE;
       }

       if (!tessedit_zero_rejection || text[offset] != ' '
       || tessedit_word_for_word) {
                                  //confidence
         if (text[offset] == ' ') {
         // A space stands for a blob with no result: send the marker char.
         ocr_append_char (unrecognised,
                          blob_box.left (), blob_box.right (),
                          page_image.get_ysize () - 1 - blob_box.top (),
                          page_image.get_ysize () - 1 - blob_box.bottom (),
                          font, (uinT8) rating,
                          ptsize,                //point size
                          blanks, enhancement,   //enhancement
                          OCR_CDIR_LEFT_RIGHT,
                          OCR_LDIR_DOWN_RIGHT,
                          lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
         } else {
           // Send each byte of the character with the same box and attributes.
           for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
             ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
                              blob_box.left (), blob_box.right (),
                              page_image.get_ysize () - 1 - blob_box.top (),
                              page_image.get_ysize () - 1 - blob_box.bottom (),
                              font, (uinT8) rating,
                              ptsize,                //point size
                              blanks, enhancement,   //enhancement
                              OCR_CDIR_LEFT_RIGHT,
                              OCR_LDIR_DOWN_RIGHT,
                              lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
         }
         blanks = 0;              //leading blanks go with the first char only
       }

     }
   }
   else if (tessedit_word_for_word) {
     // No text at all: send one unrecognised character for the whole word so
     // the output still has one entry per WERD.
     blanks = word->word->space ();
     if (blanks == 0 && !word->word->flag (W_BOL))
       blanks = 1;
     blob_box = word->word->bounding_box ();

     enhancement = 0;
     if (word->italic > 0)
       enhancement |= EUC_ITALIC;
     if (word->bold > 0)
       enhancement |= EUC_BOLD;
     rating = 100;
     if (word->font1_count > 2)
       font = word->font1;
     else if (row->font1_count > 8)
       font = row->font1;
     else
                                  //font index
       font = word->word->flag (W_DONT_CHOP) ? 0 : 1;

     lineend = word->word->flag (W_EOL);

                                  //font index
     ocr_append_char (unrecognised,
                      blob_box.left (), blob_box.right (),
                      page_image.get_ysize () - 1 - blob_box.top (),
                      page_image.get_ysize () - 1 - blob_box.bottom (),
                      font,
                      rating,                    //confidence
                      ptsize,                    //point size
                      blanks, enhancement,       //enhancement
                      OCR_CDIR_LEFT_RIGHT,
                      OCR_LDIR_DOWN_RIGHT,
                      lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
   }
 }


 /**********************************************************************
  * write_map
  *
  * Write a map file of 0's and 1's which associates characters from the .txt
  * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char
  * is kept.  Note that there may be reject regions in the .etx file WITHOUT
  * .txt chars being rejected.  The map file should be the same length, and
  * the same number of lines as the .txt file
  *
  * The parameterised input is because I thought I might be able to generate
  * multiple map files in a single run.  However, it didn't work because
  * newdiff needs etx files!
  **********************************************************************/

 #if 0
 // Writes the map entries for one word to mapfile: one character per leading
 // space, one per character of the word, and a newline when the word carries
 // W_EOL. Any failed write is reported through WRITEFAILED.error with EXIT.
 // The function is currently compiled out by the surrounding #if 0.
 void write_map(                //output a map file
                FILE *mapfile,  //mapfile to write to
                WERD_RES *word) {
   inT16 index;
   int status;                  //result of the latest stdio call
   STRING mapstr = "";          //map characters for the word itself

   // Words with an empty best choice contribute nothing except a newline.
   if (word->best_choice->string ().length () > 0) {
     for (index = 0; index < word->word->space (); index++) {
       if (word->reject_spaces &&
         (suspect_level >= suspect_space_level) &&
         !tessedit_minimal_rejection && !tessedit_zero_rejection)
         /* Write rejected spaces to .map file ONLY. Newdiff converts these back to
         accepted spaces AFTER generating basic space stats but BEFORE using .etx */
         status = fprintf (mapfile, "0");
       else
         status = fprintf (mapfile, "1");
       if (status < 0)
         WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno);
     }

     if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) {
       // A repetition code is mapped as five accepted characters.
       for (index = 0; index < 5; index++)
         mapstr += '1';
     }
     else {
       ASSERT_HOST (word->reject_map.length () ==
         word->best_choice->string ().length ());

       // '1' for each accepted character, '0' for each rejected one.
       for (index = 0; index < word->reject_map.length (); index++) {
         if (word->reject_map[index].accepted ())
           mapstr += '1';
         else
           mapstr += '0';
       }
     }
     status = fprintf (mapfile, "%s", mapstr.string ());
     if (status < 0)
       WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno);
   }
   if (word->word->flag (W_EOL)) {
     status = fprintf (mapfile, "\n");
     if (status < 0)
       WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno);
   }
   // Flush after every word and report a failed flush as well.
   status = fflush (mapfile);
   if (status != 0)
     WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
 }
 #endif


 /*************************************************************************
  * open_outfile()
  *************************************************************************/

 namespace tesseract {
 FILE *Tesseract::open_outfile(  //open .map & .unlv file
                    const char *extension) {
   STRING file_name;
   FILE *outfile;

   file_name = imagebasename + extension;
   if (!(outfile = fopen (file_name.string (), "w"))) {
     CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
       file_name.string (), errno);
   }
   return outfile;
 }
 }  // namespace tesseract


 #if 0
 // NOTE: this function is compiled out by the enclosing "#if 0".
 //
 // Writes one word to unlv_file:
 //  - ' ', '~', '^' and '|' in the best choice string are replaced by the
 //    first character of unrecognised_char;
 //  - every other character is copied, preceded by '^' (suspect marker) when
 //    its reject_map entry is rejected;
 //  - if the word is non-empty it is preceded by word->word->space () spaces,
 //    each written as "^ " when the space is suspect and " " otherwise;
 //  - a newline follows if the word carries the W_EOL flag.
 // The stream is flushed after every word.  Any fprintf/fflush failure is
 // reported through WRITEFAILED together with the current errno.
 void write_unlv_text(WERD_RES *word) {
   const char *wordstr;

   // Growable output string.  The previous fixed char[512] buffer could be
   // overrun, because each input character may emit two output characters
   // and nothing bounded the length of the word.
   STRING buff = "";              //string to output
   int i = 0;
   char unrecognised = STRING (unrecognised_char)[0];
   int status;
   char space_str[3];             //"^ " + terminator at most

   wordstr = word->best_choice->string ().string ();

   /* Don't need to do anything special for repeated char words - at this stage
   the repetition char has been identified and any other chars have been
   rejected.
   */

   for (; wordstr[i] != '\0'; i++) {
     if ((wordstr[i] == ' ') ||
       (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|'))
       buff += unrecognised;
     else {
       if (word->reject_map[i].rejected ())
         buff += '^';             //Add suspect marker
       buff += wordstr[i];
     }
   }

   if (strlen (wordstr) > 0) {
     if (word->reject_spaces &&
       (suspect_level >= suspect_space_level) &&
       !tessedit_minimal_rejection && !tessedit_zero_rejection)
       strcpy (space_str, "^ ");  //Suspect space
     else
       strcpy (space_str, " ");   //Certain space

     for (i = 0; i < word->word->space (); i++) {
       status = fprintf (unlv_file, "%s", space_str);
       if (status < 0)
         WRITEFAILED.error ("write_unlv_text", EXIT,
           "Space Errno: %d", errno);
     }

     status = fprintf (unlv_file, "%s", buff.string ());
     if (status < 0)
       WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
   }
   if (word->word->flag (W_EOL)) {
     status = fprintf (unlv_file, "\n");
     if (status < 0)
       WRITEFAILED.error ("write_unlv_text", EXIT,
         "Newline Errno: %d", errno);
   }
   //Flush after every word; fflush returns non-zero on failure.
   status = fflush (unlv_file);
   if (status != 0)
     WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
 }
 #endif


 /*************************************************************************
  * get_rep_char()
  * Return the first accepted character from the repetition string. This is the
  * character which is repeated - as determined earlier by fix_rep_char()
  *************************************************************************/
 namespace tesseract {
 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) {  // what char is repeated?
   // Walk past the leading run of rejected entries in the reject map.
   int pos = 0;
   while ((pos < word->reject_map.length()) &&
          (word->reject_map[pos].rejected())) {
     ++pos;
   }

   // Every entry was rejected (or the map is empty): fall back to the id of
   // the "unrecognised" character.
   if (pos >= word->reject_map.length())
     return unicharset.unichar_to_id(unrecognised_char.string());

   // Otherwise the first non-rejected position holds the repeated char.
   return word->best_choice->unichar_id(pos);
 }
 }  // namespace tesseract

 // Currently a no-op: both implementations in the body are compiled out with
 // "#if 0", so word is left untouched.
 void ensure_rep_chars_are_consistent(WERD_RES *word) {
 #if 0
   // Disabled variant 1: overwrites, in place, every char of the best choice
   // string that differs from the char returned by get_rep_char().
   char rep_char = get_rep_char (word);
   char *ptr;

   ptr = (char *) word->best_choice->string ().string ();
   for (; *ptr != '\0'; ptr++) {
     if (*ptr != rep_char)
       *ptr = rep_char;
   }
 #endif

 #if 0
   // Disabled variant 2: rebuilds the best choice string and its lengths
   // string, emitting the repeated unichar once per step of the lengths
   // array.
   UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
   int i;
   char *ptr;
   STRING consistent_string;
   STRING consistent_string_lengths;

   ptr = (char *) word->best_choice->string ().string ();
   for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
     consistent_string += unicharset.id_to_unichar(rep_char);
     consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
   }
   word->best_choice->string() = consistent_string;
   word->best_choice->lengths() = consistent_string_lengths;
 #endif
 }

 /*************************************************************************
  * SUSPECT LEVELS
  *
  * 0 - dont reject ANYTHING
  * 1,2 - partial rejection
  * 3 - BEST
  *
  * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
  * tessedit_minimal_rejection.
  *************************************************************************/

 namespace tesseract {
 /**
  * Relaxes the reject map of word_res according to suspect_level by marking
  * selected rejected characters with setrej_minimal_rej_accept():
  *   0     - every rejected character is marked;
  *   >= 3  - the reject map is left as it is;
  *   2     - only the dictionary-word, rating and doc/block/row steps run;
  *   other - (i.e. level 1) the 1Il/MM and acceptable-word steps run as well.
  * A word whose reject map is empty is returned untouched.
  */
 void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
   const int len = word_res->reject_map.length();
   int i;

   // Nothing can be un-rejected in an empty map.  Returning here also keeps
   // the rating-per-character division below from dividing by zero.
   if (len == 0)
     return;

   const WERD_CHOICE &word = *(word_res->best_choice);

   if (suspect_level == 0) {
     for (i = 0; i < len; i++) {
       if (word_res->reject_map[i].rejected())
         word_res->reject_map[i].setrej_minimal_rej_accept();
     }
     return;
   }

   if (suspect_level >= 3)
     return;                      //Use defaults

   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/

   if (safe_dict_word(word) &&
       (count_alphas(word) > suspect_short_words)) {
     /* Unreject alphas in dictionary words */
     for (i = 0; i < len; ++i) {
       if (word_res->reject_map[i].rejected() &&
           unicharset.get_isalpha(word.unichar_id(i)))
         word_res->reject_map[i].setrej_minimal_rej_accept();
     }
   }

   const float rating_per_ch = word.rating() / len;

   if (rating_per_ch >= suspect_rating_per_ch)
     return;                      //Don't touch bad ratings

   if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
     for (i = 0; i < len; ++i) {
       if (word_res->reject_map[i].rejected() &&
           (!unicharset.eq(word.unichar_id(i), " ")))
         word_res->reject_map[i].setrej_minimal_rej_accept();
     }
   }

   /* Unreject chars that were only rejected at doc, block or row level */
   for (i = 0; i < len; i++) {
     if (word_res->reject_map[i].rejected() &&
         (word_res->reject_map[i].flag(R_DOC_REJ) ||
          word_res->reject_map[i].flag(R_BLOCK_REJ) ||
          word_res->reject_map[i].flag(R_ROW_REJ)))
       word_res->reject_map[i].setrej_minimal_rej_accept();
   }

   if (suspect_level == 2)
     return;

   /* 1Il conflicts: always when unconstrained, else only in short words */
   if (!suspect_constrain_1Il || (len <= suspect_short_words)) {
     for (i = 0; i < len; i++) {
       if (word_res->reject_map[i].rejected()) {
         if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
           word_res->reject_map[i].flag(R_POSTNN_1IL)))
           word_res->reject_map[i].setrej_minimal_rej_accept();

         if (!suspect_constrain_1Il &&
           word_res->reject_map[i].flag(R_MM_REJECT))
           word_res->reject_map[i].setrej_minimal_rej_accept();
       }
     }
   }

   /* Longer words that read as an acceptable word or number: unreject
      everything that is not permanently rejected, plus 1Il / MM rejects */
   if ((acceptable_word_string(word.unichar_string().string(),
                               word.unichar_lengths().string()) !=
        AC_UNACCEPTABLE) ||
       acceptable_number_string(word.unichar_string().string(),
                                word.unichar_lengths().string())) {
     if (len > suspect_short_words) {
       for (i = 0; i < len; i++) {
         if (word_res->reject_map[i].rejected() &&
           (!word_res->reject_map[i].perm_rejected() ||
            word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
            word_res->reject_map[i].flag (R_POSTNN_1IL) ||
            word_res->reject_map[i].flag (R_MM_REJECT))) {
           word_res->reject_map[i].setrej_minimal_rej_accept();
         }
       }
     }
   }
 }

 // Returns how many unichars of word the unicharset classifies as alphabetic.
 inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
   int alpha_total = 0;
   int pos = 0;
   while (pos < word.length()) {
     if (unicharset.get_isalpha(word.unichar_id(pos)))
       ++alpha_total;
     ++pos;
   }
   return alpha_total;
 }


 // Returns how many unichars of word the unicharset classifies as either
 // alphabetic or a digit.
 inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) {
   int alnum_total = 0;
   int pos = 0;
   while (pos < word.length()) {
     if (unicharset.get_isalpha(word.unichar_id(pos)) ||
         unicharset.get_isdigit(word.unichar_id(pos))) {
       ++alnum_total;
     }
     ++pos;
   }
   return alnum_total;
 }


 /**
  * Returns TRUE if s has the shape of a number.
  *
  * s       - the bytes of the word's unichars, NUL terminated.
  * lengths - parallel NUL-terminated array; *lengths is the number of bytes
  *           of s taken by the current unichar.
  *
  * Accepted shape: an optional '(', then an optional one of '$' '.' '+' '-',
  * then digits in which a single '.', ',' or '-' may follow a digit.  A '%',
  * a ')' or "%)" directly after a digit is accepted only at the very end of
  * the string.  Anything else gives FALSE.  An empty string gives TRUE.
  */
 BOOL8 Tesseract::acceptable_number_string(const char *s,
                                           const char *lengths) {
   BOOL8 prev_digit = FALSE;

   // Optional opening bracket.  lengths has to be stepped together with s;
   // otherwise every later *lengths would describe the wrong unichar and
   // multi-byte characters would be split or misread.
   if (*lengths == 1 && *s == '(') {
     s++;
     lengths++;
   }

   // Optional single-byte currency / sign / leading-point prefix.
   if (*lengths == 1 &&
       ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
     s++;
     lengths++;
   }

   for (; *s != '\0'; s += *(lengths++)) {
     if (unicharset.get_isdigit (s, *lengths))
       prev_digit = TRUE;
     else if (prev_digit &&
              (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
       prev_digit = FALSE;        // separator: the next one needs a new digit
     else if (prev_digit && *lengths == 1 &&
              (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
       return TRUE;               // trailing '%' or ')'
     else if (prev_digit &&
              *lengths == 1 && (*s == '%') &&
              (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
              (*(s + *lengths + *(lengths + 1)) == '\0'))
       return TRUE;               // trailing "%)"
     else
       return FALSE;
   }
   return TRUE;
 }
 }  // namespace tesseract