// ccmain/docqual.cpp - platform/external/tesseract - Git at Google

 /******************************************************************
  * File:        docqual.cpp  (Formerly docqual.c)
  * Description: Document Quality Metrics
  * Author:		Phil Cheatle
  * Created:		Mon May  9 11:27:28 BST 1994
  *
  * (C) Copyright 1994, Hewlett-Packard Ltd.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #include "mfcpch.h"
 #include          <ctype.h>
 #include          "docqual.h"
 #include          "tstruct.h"
 #include          "tfacep.h"
 #include          "reject.h"
 #include          "tessvars.h"
 #include          "genblob.h"
 #include          "secname.h"
 #include          "globals.h"
 #include          "tesseractclass.h"

 // EXTERN is defined as empty here, so it contributes no token to the
 // declarations below in this translation unit.
 // NOTE(review): the *_VAR macros are declared elsewhere; presumably each one
 // defines a named, runtime-tunable variable with (name, default, help text)
 // - confirm against the macro definitions.
 #define EXTERN

 // Characters whose outline count is not checked / is expected to be 2.
 // Both strings are consulted by count_outline_errs().
 EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
 EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
 "Non standard number of outlines");
 // When set, unrej_good_chs() skips its outline-count check.
 EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
 "Allow outline errs in unrejection?");
 // Gates the unrej_good_quality_words() pass in quality_based_rejection().
 EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
 "Reduce rejection on good docs");
 // Allows doc_and_block_rejection() to set reject_spaces between two
 // consecutive rejected words.
 EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
 // Percentage thresholds compared against (rej_count * 100.0 / char_count) at
 // page, block and row level in doc_and_block_rejection().
 EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
 "%rej allowed before rej whole doc");
 EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
 "%rej allowed before rej whole block");
 EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
 "%rej allowed before rej whole row");
 // Row rejection only happens while the percentage of the row's rejects that
 // sit in whole-word rejects is BELOW this value.
 EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
 "%of row rejects in whole word rejects which prevents whole row rejection");
 // When set, block/row rejection only rejects words that already contain a
 // reject or are shorter than tessedit_preserve_min_wd_len.
 EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
 "Only rej partially rejected words in block rejection");
 EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
 "Only rej partially rejected words in row rejection");
 // When set, a candidate word is re-tested with word_char_quality() before
 // being block/row rejected.
 EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
 "Use word segmentation quality metric");
 EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
 "Use word segmentation quality metric");
 // NOTE: the code tests reject_map.length() < this value, i.e. words of
 // exactly this length are still preserved.
 EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
 "Only preserve wds longer than this");
 // When FALSE and the document is of good quality, row rejection falls back
 // to the per-word fraction test using tessedit_good_doc_still_rowrej_wd.
 EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
 "Apply row rejection to good docs");
 EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
 "rej good doc wd if more than this fraction rejected");
 EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
 "Reject all bad quality wds");
 // Enables the page-level tprintf tracing in doc_and_block_rejection().
 EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
 EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
 "Output data to debug file");
 // When set, unrej_good_quality_words() accepts every potential reject
 // without running any of the word/row checks.
 EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no chekcs");
 // Upper limit on a row's rej_count / char_count for the row's words to be
 // considered by unrej_good_quality_words().
 EXTERN double_VAR (quality_rowrej_pc, 1.1,
 "good_quality_doc gte good char limit");

 // Gates the tilde_crunch()/tilde_delete() passes at the end of
 // quality_based_rejection().
 EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
 "Mark v.bad words for tilde crunch");
 // Per-word preprocessing switches read at the top of the tilde_crunch() loop.
 EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
 EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
 "Take out ~^ early?");

 // Thresholds used by terrible_word_crunch().
 // NOTE(review): the help texts for the two *rating*/*rate* values say "lt
 // this", but the code crunches when rating_per_ch is GREATER than them.
 EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
 EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
 EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
 "crunch garbage cert lt this");
 EXTERN double_VAR (crunch_poor_garbage_rate, 60,
 "crunch garbage rating lt this");

 // Thresholds used by Tesseract::potential_word_crunch().
 // NOTE(review): as above, the code tests rating_per_ch > crunch_pot_poor_rate
 // although the help text says "lt this".
 EXTERN double_VAR (crunch_pot_poor_rate, 40,
 "POTENTIAL crunch rating lt this");
 EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
 "POTENTIAL crunch cert lt this");
 EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");

 // NOTE(review): the crunch_del_* values and crunch_small_outlines_size are
 // presumably consumed by the word-deletion pass further down the file
 // (tilde_delete / word_deletable) - confirm against that code.
 EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
 EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
 EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
 EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
 EXTERN double_VAR (crunch_del_min_width, 3.0,
 "Del if word width lt xht x this");
 EXTERN double_VAR (crunch_del_high_word, 1.5,
 "Del if word gt xht x this above bl");
 EXTERN double_VAR (crunch_del_low_word, 0.5,
 "Del if word gt xht x this below bl");
 EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");

 // Cap on the word length used as divisor when terrible_word_crunch()
 // computes the rating per character.
 EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
 // Minimum number of poor indicators for potential_word_crunch() to return
 // TRUE.
 EXTERN INT_VAR (crunch_pot_indicators, 1,
 "How many potential indicators needed");

 // Switches that protect plausible-looking strings from being crunched.
 // crunch_leave_accept_strings is read by Tesseract::potential_word_crunch().
 EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
 "Dont touch sensible strings");
 EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
 EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
 "Dont pot crunch sensible strings");
 EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
 EXTERN INT_VAR (crunch_leave_lc_strings, 4,
 "Dont crunch words with long lower case strings");
 // The help text used to repeat the lower-case wording of the variable above;
 // it now describes the upper-case variant this variable is named for.
 EXTERN INT_VAR (crunch_leave_uc_strings, 4,
 "Dont crunch words with long upper case strings");
 EXTERN INT_VAR (crunch_long_repetitions, 3,
 "Crunch words with long repetitions");

 // Verbosity of the tprintf tracing in the crunch routines; the visible code
 // tests it against 0, 1 and 2.
 EXTERN INT_VAR (crunch_debug, 0, "As it says");

 /*************************************************************************
  * word_blob_quality()
  * Counts the blobs of word->outword for which crude_match_blobs() finds a
  * partner among the blobs of a freshly baseline-normalised copy of
  * word->word.
  * ASSUMES both blob lists are ordered by ascending left hand blob edge.
  * Returns 0 when the source word has no blobs or the conversion to an
  * editor word fails.
  *************************************************************************/
 inT16 word_blob_quality(  //Blob seg changes
                         WERD_RES *word,
                         ROW *row) {
   PBLOB_IT out_it;               //over outword blobs
   PBLOB_IT src_it;               //over re-normalised source blobs
   TBOX out_box;
   DENORM denorm;
   inT16 matches = 0;

   if (word->word->gblob_list ()->empty ())
     return 0;

   //xht used for blnorm
   float bln_xht = bln_x_height / word->denorm.scale ();
   WERD *bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);

   // Round-trip through the tess word format so that the source word gets the
   // same float -> int coordinate rounding as the outword before the two are
   // compared.
   TWERD *tessword = make_tess_word(bln_word, NULL);
   WERD *src_word = make_ed_word (tessword, bln_word);
   delete bln_word;
   delete_word(tessword);
   if (src_word == NULL)
     return 0;                    //conversion failed

   src_it.set_to_list (src_word->blob_list ());
   inT16 src_remaining = src_it.length ();
   out_it.set_to_list (word->outword->blob_list ());

   out_it.mark_cycle_pt ();
   while (!out_it.cycled_list ()) {
     out_box = out_it.data ()->bounding_box ();

     // Advance past source blobs that start LEFT of the current outword blob.
     while (!src_it.at_last () &&
            (src_it.data ()->bounding_box ().left () < out_box.left ())) {
       src_it.forward ();
       src_remaining--;
     }

     // Try the source blobs sharing this left coord (normally one, possibly
     // more, in unknown order) until one matches or the candidates run out.
     inT16 offset = 0;
     for (;;) {
       PBLOB *candidate = src_it.data_relative (offset++);
       if (crude_match_blobs (candidate, out_it.data ())) {
         matches++;
         break;
       }
       if ((src_remaining - offset <= 0) ||
           (offset >= 129) ||
           src_it.at_last () ||
           candidate->bounding_box ().left () != out_box.left ())
         break;
     }
     out_it.forward ();
   }
   delete src_word;
   return matches;
 }


 /*************************************************************************
  * crude_match_blobs()
  * Returns TRUE when the bounding boxes of the two blobs each contain the
  * other AND both blobs have the same number of outlines, FALSE otherwise.
  *************************************************************************/
 BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) {
   TBOX box1 = blob1->bounding_box ();
   TBOX box2 = blob2->bounding_box ();

   // The outline counts must be taken from BOTH blobs: comparing blob1 with
   // itself is always true and silently disables the outline check.
   if (box1.contains (box2) &&
     box2.contains (box1) &&
     (blob1->out_list ()->length () == blob2->out_list ()->length ()))
     return TRUE;
   else
     return FALSE;
 }


 // Sums count_outline_errs() over the blobs of word->outword, pairing the
 // n-th blob with element n of the best choice's unichar_string().
 inT16 word_outline_errs(WERD_RES *word) {
   PBLOB_IT blob_it;
   inT16 ch_index = 0;
   inT16 total_errs = 0;

   blob_it.set_to_list (word->outword->blob_list ());
   blob_it.mark_cycle_pt ();
   while (!blob_it.cycled_list ()) {
     total_errs += count_outline_errs (
         word->best_choice->unichar_string()[ch_index],
         blob_it.data()->out_list()->length());
     ch_index++;
     blob_it.forward ();
   }
   return total_errs;
 }


 /*************************************************************************
  * word_char_quality()
  * Combines the blob test and the outline test: a char is "good" when its
  * outword blob matches a blob of the re-normalised source word AND
  * count_outline_errs() reports 0 for it.
  *   *match_count          - receives the number of good chars.
  *   *accepted_match_count - receives how many of those are also accepted in
  *                           the word's reject map.
  * Both outputs are zeroed first and stay 0 when the source word has no
  * blobs or the conversion to an editor word fails.
  *************************************************************************/
 void word_char_quality(WERD_RES *word,
                        ROW *row,
                        inT16 *match_count,
                        inT16 *accepted_match_count) {
   PBLOB_IT out_it;               //over outword blobs
   PBLOB_IT src_it;               //over re-normalised source blobs
   TBOX out_box;
   DENORM denorm;
   inT16 ch_index = 0;            //position of the current outword blob

   *match_count = 0;
   *accepted_match_count = 0;
   if (word->word->gblob_list ()->empty ())
     return;

   //xht used for blnorm
   float bln_xht = bln_x_height / word->denorm.scale ();
   WERD *bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);

   // Round-trip through the tess word format so that the source word gets the
   // same float -> int coordinate rounding as the outword before the two are
   // compared.
   TWERD *tessword = make_tess_word(bln_word, NULL);
   WERD *src_word = make_ed_word (tessword, bln_word);
   delete bln_word;
   delete_word(tessword);
   if (src_word == NULL)
     return;

   src_it.set_to_list (src_word->blob_list ());
   inT16 src_remaining = src_it.length ();
   out_it.set_to_list (word->outword->blob_list ());

   out_it.mark_cycle_pt ();
   while (!out_it.cycled_list ()) {
     out_box = out_it.data ()->bounding_box ();

     // Advance past source blobs that start LEFT of the current outword blob.
     while (!src_it.at_last () &&
            (src_it.data ()->bounding_box ().left () < out_box.left ())) {
       src_it.forward ();
       src_remaining--;
     }

     // Try the source blobs sharing this left coord (normally one, possibly
     // more, in unknown order) until one matches or the candidates run out.
     inT16 offset = 0;
     for (;;) {
       PBLOB *candidate = src_it.data_relative (offset++);
       if (crude_match_blobs (candidate, out_it.data ())) {
         // Blob test passed; the char only counts if the outline test does too.
         if (count_outline_errs (word->best_choice->unichar_string()[ch_index],
                                 out_it.data ()->out_list ()->length ()) == 0) {
           (*match_count)++;
           if (word->reject_map[ch_index].accepted ())
             (*accepted_match_count)++;
         }
         break;
       }
       if ((src_remaining - offset <= 0) ||
           (offset >= 129) ||
           src_it.at_last () ||
           candidate->bounding_box ().left () != out_box.left ())
         break;
     }
     ch_index++;
     out_it.forward ();
   }
   delete src_word;
 }


 /*************************************************************************
  * unrej_good_chs()
  * Walks the outword blobs and, for each blob that matches a blob of the
  * re-normalised source word, marks the corresponding reject map entry as
  * quality-accepted provided that entry is a POTENTIAL reject
  * (accept_if_good_quality) and either docqual_excuse_outline_errs is set or
  * the char has the expected number of outlines.
  * Does nothing when the source word has no blobs or the conversion to an
  * editor word fails.
  *************************************************************************/
 void unrej_good_chs(WERD_RES *word, ROW *row) {
   PBLOB_IT out_it;               //over outword blobs
   PBLOB_IT src_it;               //over re-normalised source blobs
   TBOX out_box;
   DENORM denorm;
   inT16 ch_index = 0;            //position of the current outword blob

   if (word->word->gblob_list ()->empty ())
     return;

   //xht used for blnorm
   float bln_xht = bln_x_height / word->denorm.scale ();
   WERD *bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);

   // Round-trip through the tess word format so that the source word gets the
   // same float -> int coordinate rounding as the outword before the two are
   // compared.
   TWERD *tessword = make_tess_word(bln_word, NULL);
   WERD *src_word = make_ed_word (tessword, bln_word);
   delete bln_word;
   delete_word(tessword);
   if (src_word == NULL)
     return;

   src_it.set_to_list (src_word->blob_list ());
   inT16 src_remaining = src_it.length ();
   out_it.set_to_list (word->outword->blob_list ());

   out_it.mark_cycle_pt ();
   while (!out_it.cycled_list ()) {
     out_box = out_it.data ()->bounding_box ();

     // Advance past source blobs that start LEFT of the current outword blob.
     while (!src_it.at_last () &&
            (src_it.data ()->bounding_box ().left () < out_box.left ())) {
       src_it.forward ();
       src_remaining--;
     }

     // Try the source blobs sharing this left coord (normally one, possibly
     // more, in unknown order) until one matches or the candidates run out.
     inT16 offset = 0;
     for (;;) {
       PBLOB *candidate = src_it.data_relative (offset++);
       if (crude_match_blobs (candidate, out_it.data ())) {
         if (word->reject_map[ch_index].accept_if_good_quality () &&
             (docqual_excuse_outline_errs ||
              (count_outline_errs (word->best_choice->unichar_string()[ch_index],
                                   out_it.data ()->out_list ()->length ()) == 0)))
           word->reject_map[ch_index].setrej_quality_accept ();
         break;
       }
       if ((src_remaining - offset <= 0) ||
           (offset >= 129) ||
           src_it.at_last () ||
           candidate->bounding_box ().left () != out_box.left ())
         break;
     }
     ch_index++;
     out_it.forward ();
   }
   delete src_word;
 }


 // Debug aid: calls print() on the bounding box of every blob in the word,
 // in list order.
 void print_boxes(WERD *word) {
   PBLOB_IT blob_it;
   TBOX blob_box;

   blob_it.set_to_list (word->blob_list ());
   blob_it.mark_cycle_pt ();
   while (!blob_it.cycled_list ()) {
     blob_box = blob_it.data ()->bounding_box ();
     blob_box.print ();
     blob_it.forward ();
   }
 }


 // Returns how far outline_count is from the number of outlines expected for
 // character c: 2 for chars listed in outlines_2, 1 otherwise. Chars listed
 // in outlines_odd are not checked and always score 0.
 inT16 count_outline_errs(char c, inT16 outline_count) {
   if (STRING (outlines_odd).contains (c))
     return 0;                    //Dont use this char

   int expected_outlines = 1;
   if (STRING (outlines_2).contains (c))
     expected_outlines = 2;
   return abs (outline_count - expected_outlines);
 }


 namespace tesseract {
 // Runs the quality driven rejection passes over the page, in this order:
 //   1. unrej_good_quality_words() - only when tessedit_good_quality_unrej is
 //      set and the document is of good quality;
 //   2. doc_and_block_rejection();
 //   3. insert_rej_cblobs() on every word of the page;
 //   4. tilde_crunch() then tilde_delete() - only when unlv_tilde_crunching
 //      is set.
 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
                                         BOOL8 good_quality_doc) {
   if (tessedit_good_quality_unrej) {
     if (good_quality_doc)
       unrej_good_quality_words(page_res_it);
   }
   doc_and_block_rejection(page_res_it, good_quality_doc);

   for (page_res_it.restart_page ();
        page_res_it.word () != NULL;
        page_res_it.forward())
     insert_rej_cblobs(page_res_it.word());

   if (!unlv_tilde_crunching)
     return;
   tilde_crunch(page_res_it);
   tilde_delete(page_res_it);
 }


 /*************************************************************************
  * unrej_good_quality_words()
  * Accept potential rejects in words which pass the following checks:
  *    - The row's reject fraction is no more than quality_rowrej_pc
  *    - Word contains a quality-recoverable reject
  *    - Word looks like an acceptable string (or tessedit_unrej_any_wd is set)
  *    - Per char blob/outline checks done by unrej_good_chs()
  * With bland_unrej set, every potential reject is accepted unchecked.
  * NOTE - the page, block and row rejection counts are recalculated in a
  *        second pass after unrejection: simple but inefficient.
  *************************************************************************/
 void Tesseract::unrej_good_quality_words(  //unreject potential
                                          PAGE_RES_IT &page_res_it) {
   WERD_RES *word;
   ROW_RES *row_seen;
   BLOCK_RES *block_seen;
   int ch;

   /* Pass 1: unreject */
   page_res_it.restart_page ();
   while (page_res_it.word () != NULL) {
     check_debug_pt (page_res_it.word (), 100);
     if (bland_unrej) {
       // No checks at all: accept every potential reject in the word.
       word = page_res_it.word ();
       for (ch = 0; ch < word->reject_map.length (); ch++) {
         if (word->reject_map[ch].accept_if_good_quality ())
           word->reject_map[ch].setrej_quality_accept ();
       }
       page_res_it.forward ();
     }
     else if ((page_res_it.row ()->char_count > 0) &&
              ((page_res_it.row ()->rej_count /
                (float) page_res_it.row ()->char_count) <= quality_rowrej_pc)) {
       word = page_res_it.word ();
       if (word->reject_map.quality_recoverable_rejects () &&
           (tessedit_unrej_any_wd ||
            acceptable_word_string (
                word->best_choice->unichar_string().string(),
                word->best_choice->unichar_lengths().string())
            != AC_UNACCEPTABLE)) {
         unrej_good_chs (word, page_res_it.row ()->row);
       }
       page_res_it.forward ();
     }
     else {
       /* Skip to end of dodgy row */
       row_seen = page_res_it.row ();
       while ((page_res_it.word () != NULL) &&
              (page_res_it.row () == row_seen))
         page_res_it.forward ();
     }
     check_debug_pt (page_res_it.word (), 110);
   }

   /* Pass 2: zero the counters and let rej_stat_word() rebuild them */
   page_res_it.restart_page ();
   page_res_it.page_res->char_count = 0;
   page_res_it.page_res->rej_count = 0;
   block_seen = NULL;
   row_seen = NULL;
   for (; page_res_it.word () != NULL; page_res_it.forward ()) {
     if (block_seen != page_res_it.block ()) {
       // First word of a new block.
       block_seen = page_res_it.block ();
       block_seen->char_count = 0;
       block_seen->rej_count = 0;
     }
     if (row_seen != page_res_it.row ()) {
       // First word of a new row.
       row_seen = page_res_it.row ();
       row_seen->char_count = 0;
       row_seen->rej_count = 0;
       row_seen->whole_word_rej_count = 0;
     }
     page_res_it.rej_stat_word ();
   }
 }


 /*************************************************************************
  * doc_and_block_rejection()
  *
  * If the page has too many rejects - reject all of it.
  * If any block has too many rejects - reject all words in the block
  * Otherwise test each row of the block and reject the rows which have too
  * many rejects.
  *
  * page_res_it      - iterator over the page results; it is restarted and
  *                    advanced here, and the reject maps of the words it
  *                    visits are updated in place.
  * good_quality_doc - only consulted in the row rejection branch, together
  *                    with tessedit_row_rej_good_docs.
  *************************************************************************/

 void Tesseract::doc_and_block_rejection(  //reject big chunks
                                         PAGE_RES_IT &page_res_it,
                                         BOOL8 good_quality_doc) {
   // block_no / row_no are only used in the debug tprintf calls below.
   inT16 block_no = 0;
   inT16 row_no = 0;
   BLOCK_RES *current_block;
   ROW_RES *current_row;

   BOOL8 rej_word;
   BOOL8 prev_word_rejected;
   inT16 char_quality;
   // Filled in by word_char_quality() but never read in this function.
   inT16 accepted_char_quality;

   // Page level test.
   // NOTE(review): unlike the block and row tests below there is no
   // char_count > 0 guard here; the division is done in floating point
   // because of the 100.0 factor.
   if ((page_res_it.page_res->rej_count * 100.0 /
   page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
     reject_whole_page(page_res_it);
     #ifndef SECURE_NAMES
     if (tessedit_debug_doc_rejection) {
       tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
         page_res_it.page_res->char_count,
         page_res_it.page_res->rej_count);
     }
     #endif
   }
   else {
     #ifndef SECURE_NAMES
     if (tessedit_debug_doc_rejection)
       tprintf ("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
         page_res_it.page_res->char_count,
         page_res_it.page_res->rej_count);
     #endif

     /* Walk blocks testing for block rejection */

     page_res_it.restart_page ();
     while (page_res_it.word () != NULL) {
       current_block = page_res_it.block ();
       block_no = current_block->block->index();
       // Block level test: percentage of rejected chars in the block.
       if ((page_res_it.block ()->char_count > 0) &&
         ((page_res_it.block ()->rej_count * 100.0 /
         page_res_it.block ()->char_count) >
       tessedit_reject_block_percent)) {
         #ifndef SECURE_NAMES
         if (tessedit_debug_block_rejection)
           tprintf ("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
             block_no,
             page_res_it.block ()->char_count,
             page_res_it.block ()->rej_count);
         #endif
         prev_word_rejected = FALSE;
         // Visit every word of the rejected block and decide its fate.
         while ((page_res_it.word () != NULL) &&
         (page_res_it.block () == current_block)) {
           if (tessedit_preserve_blk_rej_perfect_wds) {
             // Candidate for rejection if it already has a reject or is
             // shorter than tessedit_preserve_min_wd_len.
             rej_word =
               (page_res_it.word ()->reject_map.reject_count () > 0)
               || (page_res_it.word ()->reject_map.length () <
               tessedit_preserve_min_wd_len);
             // Optional second chance: a long enough, acceptable looking word
             // is kept when every char passes word_char_quality().
             if (rej_word && tessedit_dont_blkrej_good_wds
               && !(page_res_it.word ()->reject_map.length () <
               tessedit_preserve_min_wd_len)
               &&
               (acceptable_word_string
                (page_res_it.word()->best_choice->unichar_string().string(),
                page_res_it.word ()->best_choice->unichar_lengths().string()) !=
                AC_UNACCEPTABLE)) {
               word_char_quality (page_res_it.word (),
                 page_res_it.row ()->row,
                 &char_quality,
                 &accepted_char_quality);
               rej_word = char_quality !=
                 page_res_it.word ()->reject_map.length ();
             }
           }
           else
             rej_word = TRUE;
           if (rej_word) {
             /*
               Reject spacing if both current and prev words are rejected.
               NOTE - this is NOT restricted to FUZZY spaces. - When tried this
               generated more space errors.
             */
             if (tessedit_use_reject_spaces &&
               prev_word_rejected &&
               (page_res_it.prev_row () == page_res_it.row ()) &&
               (page_res_it.word ()->word->space () == 1))
               page_res_it.word ()->reject_spaces = TRUE;
             page_res_it.word ()->reject_map.rej_word_block_rej ();
           }
           prev_word_rejected = rej_word;
           page_res_it.forward ();
         }
       }
       else {
         #ifndef SECURE_NAMES
         if (tessedit_debug_block_rejection)
           tprintf
             ("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
             block_no, page_res_it.block ()->char_count,
             page_res_it.block ()->rej_count);
         #endif

         /* Walk rows in block testing for row rejection */
         // row_no restarts at 0 for each block and is incremented per row.
         row_no = 0;
         while ((page_res_it.word () != NULL) &&
         (page_res_it.block () == current_block)) {
           current_row = page_res_it.row ();
           row_no++;
           /* Reject whole row if:
             fraction of chars on row which are rejected exceed a limit AND
             fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
             limit
           */
           if ((page_res_it.row ()->char_count > 0) &&
             ((page_res_it.row ()->rej_count * 100.0 /
             page_res_it.row ()->char_count) >
             tessedit_reject_row_percent) &&
             ((page_res_it.row ()->whole_word_rej_count * 100.0 /
             page_res_it.row ()->rej_count) <
           tessedit_whole_wd_rej_row_percent)) {
             #ifndef SECURE_NAMES
             if (tessedit_debug_block_rejection)
               tprintf
                 ("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
                 row_no, page_res_it.row ()->char_count,
                 page_res_it.row ()->rej_count);
             #endif
             prev_word_rejected = FALSE;
             // Visit every word of the rejected row and decide its fate.
             while ((page_res_it.word () != NULL) &&
             (page_res_it.row () == current_row)) {
               /* Preserve words on good docs unless they are mostly rejected*/
               // The word is rejected only if its rejected fraction exceeds
               // tessedit_good_doc_still_rowrej_wd (default 1.1).
               if (!tessedit_row_rej_good_docs && good_quality_doc) {
                 rej_word =
                   page_res_it.word ()->reject_map.
                   reject_count () /
                   (float) page_res_it.word ()->reject_map.
                   length () > tessedit_good_doc_still_rowrej_wd;
               }

               /* Preserve perfect words anyway */
               // Same candidate / second chance logic as in block rejection,
               // gated by the row specific switches.
               else if (tessedit_preserve_row_rej_perfect_wds) {
                 rej_word =
                   (page_res_it.word ()->reject_map.
                   reject_count () > 0)
                   || (page_res_it.word ()->reject_map.
                   length () < tessedit_preserve_min_wd_len);
                 if (rej_word && tessedit_dont_rowrej_good_wds
                   && !(page_res_it.word ()->reject_map.
                   length () <
                   tessedit_preserve_min_wd_len)
                   &&
                   (acceptable_word_string
                    (page_res_it.word ()->best_choice->
                     unichar_string().string(),
                     page_res_it.word ()->best_choice->
                     unichar_lengths().string()) != AC_UNACCEPTABLE)) {
                   word_char_quality (page_res_it.word (),
                     page_res_it.row ()->row,
                     &char_quality,
                     &accepted_char_quality);
                   rej_word = char_quality !=
                     page_res_it.word ()->reject_map.length ();
                 }
               }
               else
                 rej_word = TRUE;
               if (rej_word) {
                 /*
                   Reject spacing if both current and prev words are rejected.
                   NOTE - this is NOT restricted to FUZZY spaces. - When tried
                   this generated more space errors.
                 */
                 if (tessedit_use_reject_spaces &&
                   prev_word_rejected &&
                   (page_res_it.prev_row () ==
                   page_res_it.row ())
                   && (page_res_it.word ()->word->space () ==
                   1))
                   page_res_it.word ()->reject_spaces = TRUE;
                 page_res_it.word ()->reject_map.
                   rej_word_row_rej();
               }
               prev_word_rejected = rej_word;
               page_res_it.forward ();
             }
           }
           else {
             #ifndef SECURE_NAMES
             if (tessedit_debug_block_rejection)
               tprintf
                 ("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
                 row_no, page_res_it.row ()->char_count,
                 page_res_it.row ()->rej_count);
             #endif
             // Row is kept: step over all of its words untouched.
             while ((page_res_it.word () != NULL) &&
               (page_res_it.row () == current_row))
               page_res_it.forward ();
           }
         }
       }
     }
   }
 }
 }  // namespace tesseract


 /*************************************************************************
  * reject_whole_page()
  * Dont believe any of it - calls rej_word_doc_rej() on the reject map of
  * every word on the page and flags the page result as rejected.
  *************************************************************************/

 void reject_whole_page(PAGE_RES_IT &page_res_it) {
   for (page_res_it.restart_page ();
        page_res_it.word () != NULL;
        page_res_it.forward ())
     page_res_it.word ()->reject_map.rej_word_doc_rej ();

   //whole page is rejected
   page_res_it.page_res->rejected = TRUE;
 }

 namespace tesseract {
 // Walks every word of the page and sets unlv_crunch_mode = CR_KEEP_SPACE on
 // the words chosen for crunching:
 //   - a word with at least one accepted char is never crunched and clears
 //     all pending state;
 //   - a "terrible" word is crunched, together with the run of potential
 //     words that immediately precedes it;
 //   - a "potential" word is crunched straight away if the run was started by
 //     a terrible word, otherwise the start of the run is remembered;
 //   - any other word clears all pending state.
 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
   WERD_RES *word;
   GARBAGE_LEVEL garbage_level;
   PAGE_RES_IT run_start_it;      //first word of a pending potential run
   BOOL8 run_pending = FALSE;     //run_start_it marks a potential run
   BOOL8 after_terrible = FALSE;  //current run began with a terrible word
   BOOL8 ok_dict_word;

   for (page_res_it.restart_page ();
        page_res_it.word () != NULL;
        page_res_it.forward ()) {
     word = page_res_it.word ();

     if (crunch_early_convert_bad_unlv_chs)
       convert_bad_unlv_chs(word);

     if (crunch_early_merge_tess_fails)
       merge_tess_fails(word);

     if (word->reject_map.accept_count () != 0) {
       //Forget earlier potential crunches
       after_terrible = FALSE;
       run_pending = FALSE;
       continue;
     }

     ok_dict_word = safe_dict_word(*(word->best_choice));
     garbage_level = garbage_word (word, ok_dict_word);
     const BOOL8 crunchable = (garbage_level != G_NEVER_CRUNCH);

     if (crunchable && terrible_word_crunch (word, garbage_level)) {
       if (crunch_debug > 0) {
         tprintf ("T CRUNCHING: \"%s\"\n",
                  word->best_choice->unichar_string().string());
       }
       word->unlv_crunch_mode = CR_KEEP_SPACE;
       if (run_pending) {
         // Crunch the potential words between the mark and this word.
         while (run_start_it.word () != word) {
           if (crunch_debug > 0) {
             tprintf ("P1 CRUNCHING: \"%s\"\n",
                      run_start_it.word()->best_choice->unichar_string().string());
           }
           run_start_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
           run_start_it.forward ();
         }
         run_pending = FALSE;
       }
       after_terrible = TRUE;
     }
     else if (crunchable &&
              potential_word_crunch (word, garbage_level, ok_dict_word)) {
       if (after_terrible) {
         if (crunch_debug > 0) {
           tprintf ("P2 CRUNCHING: \"%s\"\n",
                    word->best_choice->unichar_string().string());
         }
         word->unlv_crunch_mode = CR_KEEP_SPACE;
       }
       else if (!run_pending) {
         // Remember where the run starts in case a terrible word follows.
         run_start_it = page_res_it;
         run_pending = TRUE;
         if (crunch_debug > 1) {
           tprintf ("P3 CRUNCHING: \"%s\"\n",
                    word->best_choice->unichar_string().string());
         }
       }
     }
     else {
       //Forget earlier potential crunches
       after_terrible = FALSE;
       run_pending = FALSE;
       if (crunch_debug > 2) {
         tprintf ("NO CRUNCH: \"%s\"\n",
                  word->best_choice->unichar_string().string());
       }
     }
   }
 }
 }  // namespace tesseract


 // Returns TRUE when the word qualifies as a "terrible" crunch candidate.
 // The reason code reported in the debug trace is:
 //   1 - best choice string is empty or consists only of spaces
 //   2 - rating per char exceeds crunch_terrible_rating
 //   3 - crunch_terrible_garbage is set and garbage_level is G_TERRIBLE
 //   4 - certainty below crunch_poor_garbage_cert and garbage_level not G_OK
 //   5 - rating per char exceeds crunch_poor_garbage_rate and garbage_level
 //       not G_OK
 // The rating per char divides by the reject map length, capped at
 // crunch_rating_max.
 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
   int reason = 0;                //0 means "not terrible"

   if ((word->best_choice->unichar_string().length () == 0) ||
       (strspn (word->best_choice->unichar_string().string(), " ") ==
        word->best_choice->unichar_string().length ())) {
     reason = 1;
   }
   else {
     int capped_len = word->reject_map.length ();
     if (capped_len > crunch_rating_max)
       capped_len = crunch_rating_max;
     float per_ch_rating = word->best_choice->rating () / capped_len;

     if (per_ch_rating > crunch_terrible_rating)
       reason = 2;
     else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
       reason = 3;
     else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
              (garbage_level != G_OK))
       reason = 4;
     else if ((per_ch_rating > crunch_poor_garbage_rate) &&
              (garbage_level != G_OK))
       reason = 5;
   }

   if (reason <= 0)
     return FALSE;

   if (crunch_debug > 2) {
     tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
              reason, word->best_choice->unichar_string().string());
   }
   return TRUE;
 }

 namespace tesseract {
 // Returns TRUE when the word shows at least crunch_pot_indicators "poor"
 // indicators, making it a POTENTIAL crunch candidate. The indicators are:
 //   - rating per char above crunch_pot_poor_rate;
 //   - certainty below crunch_pot_poor_cert, counted only for words that are
 //     "crunchable" (see word_crunchable below);
 //   - garbage_level other than G_OK.
 // word          - word under test; its best choice and reject map are read.
 // garbage_level - garbage classification of the word supplied by the caller.
 // ok_dict_word  - caller's verdict on whether the word is a safe dict word.
 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word,
                                        GARBAGE_LEVEL garbage_level,
                                        BOOL8 ok_dict_word) {
   float rating_per_ch;
   int adjusted_len;
   const char *str = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   BOOL8 word_crunchable;
   int poor_indicator_count = 0;

   // A word is exempt from the certainty indicator only when
   // crunch_leave_accept_strings is set, it has at least 3 chars, and it is
   // either an acceptable string or a safe dictionary word.
   word_crunchable =
     !crunch_leave_accept_strings ||
     (word->reject_map.length () < 3) ||
     ((acceptable_word_string (str, lengths) == AC_UNACCEPTABLE) &&
      !ok_dict_word);

   // Cap the divisor with the same tunable that terrible_word_crunch() uses,
   // instead of a hard-coded 10, so both routines compute the rating per char
   // consistently when crunch_rating_max is changed from its default of 10.
   adjusted_len = word->reject_map.length ();
   if (adjusted_len > crunch_rating_max)
     adjusted_len = crunch_rating_max;
   rating_per_ch = word->best_choice->rating () / adjusted_len;

   if (rating_per_ch > crunch_pot_poor_rate) {
     if (crunch_debug > 2) {
       tprintf ("Potential poor rating on \"%s\"\n",
         word->best_choice->unichar_string().string());
     }
     poor_indicator_count++;
   }

   if (word_crunchable &&
   (word->best_choice->certainty () < crunch_pot_poor_cert)) {
     if (crunch_debug > 2) {
       tprintf ("Potential poor cert on \"%s\"\n",
         word->best_choice->unichar_string().string());
     }
     poor_indicator_count++;
   }

   if (garbage_level != G_OK) {
     if (crunch_debug > 2) {
       tprintf ("Potential garbage on \"%s\"\n",
         word->best_choice->unichar_string().string());
     }
     poor_indicator_count++;
   }
   return (poor_indicator_count >= crunch_pot_indicators);
 }
 }  // namespace tesseract


 namespace tesseract {
 // Walks every word of the page and sets unlv_crunch_mode on words that
 // word_deletable() reports as deletable, but only in runs that touch the
 // start or the end of a line:
 //   - a deletable word flagged W_BOL, or one directly continuing such a
 //     run, is marked immediately;
 //   - a deletable word flagged W_EOL is marked, together with the pending
 //     run of deletable words leading up to it (if one was remembered);
 //   - any other deletable word only causes the start of a pending run to
 //     be remembered;
 //   - a non-deletable word cancels both kinds of run.
 // Afterwards merge_tess_fails() is applied to the word unless
 // crunch_early_merge_tess_fails says it was already done.
 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
   PAGE_RES_IT pending_it;          // start of the pending mid-line run
   BOOL8 in_bol_run = FALSE;
   BOOL8 have_pending_run = FALSE;
   inT16 reason;                    // debug code for the current word
   inT16 pending_reason;            // debug code for a pending word

   for (page_res_it.restart_page (); page_res_it.word () != NULL;
        page_res_it.forward ()) {
     WERD_RES *cur = page_res_it.word ();
     const CRUNCH_MODE mode = word_deletable (cur, reason);

     if (mode == CR_NONE) {
       // Not deletable: forget any run in progress.
       in_bol_run = FALSE;
       have_pending_run = FALSE;
     }
     else if (cur->word->flag (W_BOL) || in_bol_run) {
       if (crunch_debug > 0) {
         tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
           reason,
           cur->best_choice->unichar_string().string());
       }
       cur->unlv_crunch_mode = mode;
       in_bol_run = TRUE;
     }
     else if (cur->word->flag (W_EOL)) {
       if (have_pending_run) {
         // Flush the remembered run that leads up to this end-of-line word.
         for (; pending_it.word () != cur; pending_it.forward ()) {
           WERD_RES *pending = pending_it.word ();
           const CRUNCH_MODE pending_mode =
             word_deletable (pending, pending_reason);
           if (crunch_debug > 0) {
             tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
               pending_reason,
               pending->best_choice->unichar_string().string());
           }
           pending->unlv_crunch_mode = pending_mode;
         }
       }
       if (crunch_debug > 0) {
         tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
           reason,
           cur->best_choice->unichar_string().string());
       }
       cur->unlv_crunch_mode = mode;
       in_bol_run = FALSE;
       have_pending_run = FALSE;
     }
     else if (!have_pending_run) {
       // Mid-line deletable word: remember where the run started.
       pending_it = page_res_it;
       have_pending_run = TRUE;
     }

     // Deferred until here because the tess fails are used to decide whether
     // the word is deletable.
     if (!crunch_early_merge_tess_fails)
       merge_tess_fails(cur);
   }
 }


 // Replaces every '~' in the best choice with '-' and every '^' with ' '.
 // A replaced position that was still accepted in the reject map is flagged
 // with setrej_unlv_rej().  If anything changed, the best choice's strings
 // are rebuilt with populate_unichars().
 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
   const UNICHAR_ID dash_id = unicharset.unichar_to_id("-");
   const UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
   const UNICHAR_ID tilde_id = unicharset.unichar_to_id("~");
   const UNICHAR_ID caret_id = unicharset.unichar_to_id("^");
   WERD_CHOICE *choice = word_res->best_choice;
   const int num_positions = word_res->reject_map.length();
   bool changed = false;

   for (int pos = 0; pos < num_positions; ++pos) {
     // '~' -> '-'
     if (choice->unichar_id(pos) == tilde_id) {
       choice->set_unichar_id(dash_id, pos);
       changed = true;
       if (word_res->reject_map[pos].accepted ())
         word_res->reject_map[pos].setrej_unlv_rej ();
     }
     // '^' -> ' '
     if (choice->unichar_id(pos) == caret_id) {
       choice->set_unichar_id(space_id, pos);
       changed = true;
       if (word_res->reject_map[pos].accepted ())
         word_res->reject_map[pos].setrej_unlv_rej ();
     }
   }

   if (changed)
     choice->populate_unichars(unicharset);
 }

 // Collapses adjacent pairs of space unichars (tess failures) in the best
 // choice into a single one.  For each pair the first unichar, its reject map
 // entry and its blob are removed; the blob is merged into its successor
 // before being deleted.  The best choice, reject map and outword blob list
 // must have equal lengths on entry and still do on exit.
 void Tesseract::merge_tess_fails(WERD_RES *word_res) {
   WERD_CHOICE *choice = word_res->best_choice;
   PBLOB_IT cursor;               // blob matching position 'pos'
   int count = choice->length();
   bool changed = false;

   ASSERT_HOST (word_res->reject_map.length () == count);
   ASSERT_HOST (word_res->outword->blob_list ()->length () == count);

   const UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
   cursor = word_res->outword->blob_list ();
   int pos = 0;
   while (pos < choice->length() - 1) {
     const bool double_space =
       (choice->unichar_id(pos) == space_id) &&
       (choice->unichar_id(pos + 1) == space_id);
     if (!double_space) {
       ++pos;
     } else {
       // Drop the first of the pair; pos stays put so that a longer run of
       // spaces keeps collapsing.
       changed = true;
       choice->remove_unichar_id(pos);
       word_res->reject_map.remove_pos (pos);
       merge_blobs (cursor.data_relative (1), cursor.data ());
       delete cursor.extract ();  // get rid of spare
     }
     cursor.forward ();
   }

   count = choice->length();
   ASSERT_HOST (word_res->reject_map.length () == count);
   ASSERT_HOST (word_res->outword->blob_list ()->length () == count);
   if (changed)
     choice->populate_unichars(unicharset);
 }

 // Grades how garbage-like the best-choice text of a word is.
 //
 //   word          word whose best_choice and reject_map are examined
 //   ok_dict_word  caller's verdict that the word is a good dictionary word
 //
 // The text is scanned one unichar at a time by a small state machine that
 // counts alphas, digits, isolated single alphas and digits, tess rejects
 // (single ' ' unichars), other non-alphanumeric characters, the longest
 // upper and lower case runs, and the longest repetition of one letter.
 //
 // Returns
 //   G_NEVER_CRUNCH  for mostly alphabetic words of 4+ characters that are an
 //                   acceptable string or contain a long enough case run,
 //   G_OK            for dictionary/number/acceptable words with no rejects,
 //                   and for words with few enough dodgy characters,
 //   G_TERRIBLE      when tess rejects dominate the word,
 //   G_DODGY         otherwise, when too many characters are dodgy.
 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
   enum STATES
   {
     JUNK,
     FIRST_UPPER,
     FIRST_LOWER,
     FIRST_NUM,
     SUBSEQUENT_UPPER,
     SUBSEQUENT_LOWER,
     SUBSEQUENT_NUM
   };
   // str and lengths stay anchored at the start of the word.  The scan below
   // walks separate cursors, so that the whole-word checks made after the
   // loop (acceptable_word_string, strpbrk) see the complete text rather than
   // the terminating NUL the scan stops on.
   const char *str = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   const char *ch = str;          // current unichar in str
   const char *ch_len = lengths;  // byte length of the current unichar
   STATES state = JUNK;
   int len = 0;
   int isolated_digits = 0;
   int isolated_alphas = 0;
   int bad_char_count = 0;
   int tess_rejs = 0;
   int dodgy_chars = 0;
   int ok_chars;
   UNICHAR_ID last_char = -1;
   int alpha_repetition_count = 0;
   int longest_alpha_repetition_count = 0;
   int longest_lower_run_len = 0;
   int lower_string_count = 0;
   int longest_upper_run_len = 0;
   int upper_string_count = 0;
   int total_alpha_count = 0;
   int total_digit_count = 0;

   for (; *ch != '\0'; ch += *(ch_len++)) {
     len++;
     if (unicharset.get_isupper (ch, *ch_len)) {
       total_alpha_count++;
       switch (state) {
         case SUBSEQUENT_UPPER:
         case FIRST_UPPER:
           state = SUBSEQUENT_UPPER;
           upper_string_count++;
           if (longest_upper_run_len < upper_string_count)
             longest_upper_run_len = upper_string_count;
           if (last_char == unicharset.unichar_to_id(ch, *ch_len)) {
             alpha_repetition_count++;
             if (longest_alpha_repetition_count < alpha_repetition_count) {
               longest_alpha_repetition_count = alpha_repetition_count;
             }
           }
           else {
             last_char = unicharset.unichar_to_id(ch, *ch_len);
             alpha_repetition_count = 1;
           }
           break;
         case FIRST_NUM:
           isolated_digits++;
           // fall through
         default:
           state = FIRST_UPPER;
           last_char = unicharset.unichar_to_id(ch, *ch_len);
           alpha_repetition_count = 1;
           upper_string_count = 1;
           break;
       }
     }
     else if (unicharset.get_islower (ch, *ch_len)) {
       total_alpha_count++;
       switch (state) {
         case SUBSEQUENT_LOWER:
         case FIRST_LOWER:
           state = SUBSEQUENT_LOWER;
           lower_string_count++;
           if (longest_lower_run_len < lower_string_count)
             longest_lower_run_len = lower_string_count;
           if (last_char == unicharset.unichar_to_id(ch, *ch_len)) {
             alpha_repetition_count++;
             if (longest_alpha_repetition_count < alpha_repetition_count) {
               longest_alpha_repetition_count = alpha_repetition_count;
             }
           }
           else {
             last_char = unicharset.unichar_to_id(ch, *ch_len);
             alpha_repetition_count = 1;
           }
           break;
         case FIRST_NUM:
           isolated_digits++;
           // fall through
         default:
           state = FIRST_LOWER;
           last_char = unicharset.unichar_to_id(ch, *ch_len);
           alpha_repetition_count = 1;
           lower_string_count = 1;
           break;
       }
     }
     else if (unicharset.get_isdigit (ch, *ch_len)) {
       total_digit_count++;
       switch (state) {
         case FIRST_NUM:
           state = SUBSEQUENT_NUM;
           // fall through
         case SUBSEQUENT_NUM:
           break;
         case FIRST_UPPER:
         case FIRST_LOWER:
           isolated_alphas++;
           // fall through
         default:
           state = FIRST_NUM;
           break;
       }
     }
     else {
       // Neither alpha nor digit: a single ' ' is a tess reject, anything
       // else is a bad character.
       if (*ch_len == 1 && *ch == ' ')
         tess_rejs++;
       else
         bad_char_count++;
       switch (state) {
         case FIRST_NUM:
           isolated_digits++;
           break;
         case FIRST_UPPER:
         case FIRST_LOWER:
           isolated_alphas++;
           // fall through
         default:
           break;
       }
       state = JUNK;
     }
   }

   // Account for a lone alpha or digit at the very end of the word.
   switch (state) {
     case FIRST_NUM:
       isolated_digits++;
       break;
     case FIRST_UPPER:
     case FIRST_LOWER:
       isolated_alphas++;
       // fall through
     default:
       break;
   }

   if (crunch_include_numerals) {
     total_alpha_count += total_digit_count - isolated_digits;
   }

   if (crunch_leave_ok_strings &&
     (len >= 4) &&
     (2 * (total_alpha_count - isolated_alphas) > len) &&
   (longest_alpha_repetition_count < crunch_long_repetitions)) {
     if ((crunch_accept_ok &&
       (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE)) ||
       (longest_lower_run_len > crunch_leave_lc_strings) ||
       (longest_upper_run_len > crunch_leave_uc_strings))
       return G_NEVER_CRUNCH;
   }
   // Words without any reject space that come from a dictionary or number
   // permuter, or that form an acceptable string, are never garbage.
   if ((word->reject_map.length () > 1) &&
     (strpbrk (str, " ") == NULL) &&
     ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
     (word->best_choice->permuter () == USER_DAWG_PERM) ||
     (word->best_choice->permuter () == NUMBER_PERM) ||
     (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE) || ok_dict_word))
     return G_OK;

   ok_chars = len - bad_char_count - isolated_digits -
     isolated_alphas - tess_rejs;

   if (crunch_debug > 3) {
     tprintf ("garbage_word: \"%s\"\n",
       word->best_choice->unichar_string().string());
     tprintf ("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
       len,
       bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
   }
   if ((bad_char_count == 0) &&
     (tess_rejs == 0) &&
     ((len > isolated_digits + isolated_alphas) || (len <= 2)))
     return G_OK;

   if ((tess_rejs > ok_chars) ||
     ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
     return G_TERRIBLE;

   if (len > 4) {
     // Longer words: rejects count double, isolated characters count too.
     dodgy_chars = 2 * tess_rejs + bad_char_count +
       isolated_digits + isolated_alphas;
     if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
       return G_DODGY;
     else
       return G_OK;
   }
   else {
     dodgy_chars = 2 * tess_rejs + bad_char_count;
     if (((len == 4) && (dodgy_chars > 2)) ||
       ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
       return G_DODGY;
     else
       return G_OK;
   }
 }
 }  // namespace tesseract


 /*************************************************************************
  * word_deletable()
  *     Decide whether a crunched word should be deleted.
  *     A word whose unlv_crunch_mode is CR_NONE is never deletable.
  *     Otherwise the tests below are applied in the order listed and the
  *     first one that fires decides the result.  delete_mode receives the
  *     number of that test, or 0 when the result is CR_NONE.
  *
  *     Result CR_DELETE:
  *        1   reject map is empty
  *        4   box height < crunch_del_min_ht * bln_x_height
  *        5   noise_outlines() holds for the outword
  *     Result CR_LOOSE_SPACE:
  *        2   failure_count() * 1.5 > number of chars
  *        7   certainty < crunch_del_cert
  *        8   rating / char > crunch_del_rating
  *        9   box top < bln_baseline_offset -
  *                      crunch_del_low_word * bln_x_height
  *        10  box bottom > bln_baseline_offset +
  *                         crunch_del_high_word * bln_x_height
  *        11  box height > crunch_del_max_ht * bln_x_height
  *        3   box width < crunch_del_min_width * bln_x_height
  *************************************************************************/

 CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
   const int num_chars = word->reject_map.length ();

   delete_mode = 0;
   if (word->unlv_crunch_mode == CR_NONE)
     return CR_NONE;

   // Tests that delete the word outright.
   if (num_chars == 0) {
     delete_mode = 1;
     return CR_DELETE;
   }

   TBOX word_box = word->outword->bounding_box ();   //BB of word
   if (word_box.height () < crunch_del_min_ht * bln_x_height)
     delete_mode = 4;
   else if (noise_outlines (word->outword))
     delete_mode = 5;
   if (delete_mode != 0)
     return CR_DELETE;

   // Tests that lose the word but keep a space for it.
   const float per_char_rating = word->best_choice->rating () / num_chars;

   if ((failure_count (word) * 1.5) > num_chars)
     delete_mode = 2;
   else if (word->best_choice->certainty () < crunch_del_cert)
     delete_mode = 7;
   else if (per_char_rating > crunch_del_rating)
     delete_mode = 8;
   else if (word_box.top () <
     bln_baseline_offset - crunch_del_low_word * bln_x_height)
     delete_mode = 9;
   else if (word_box.bottom () >
     bln_baseline_offset + crunch_del_high_word * bln_x_height)
     delete_mode = 10;
   else if (word_box.height () > crunch_del_max_ht * bln_x_height)
     delete_mode = 11;
   else if (word_box.width () < crunch_del_min_width * bln_x_height)
     delete_mode = 3;

   return (delete_mode != 0) ? CR_LOOSE_SPACE : CR_NONE;
 }

 // Returns the number of ' ' bytes in the word's best-choice string.
 inT16 failure_count(WERD_RES *word) {
   int space_total = 0;
   const char *hit =
     strchr (word->best_choice->unichar_string().string(), ' ');

   // Hop from one space to the next until the string is exhausted.
   while (hit != NULL) {
     ++space_total;
     hit = strchr (hit + 1, ' ');
   }
   return space_total;
 }


 // Returns TRUE when every outline of every blob in the word is small, i.e.
 // the longer side of its bounding box is below
 // bln_x_height * crunch_small_outlines_size.  A word with no outlines at
 // all also yields TRUE (0 >= 0).
 BOOL8 noise_outlines(WERD *word) {
   const float size_threshold = bln_x_height * crunch_small_outlines_size;
   inT16 total = 0;               // outlines seen
   inT16 tiny = 0;                // outlines below the threshold
   PBLOB_IT blobs;
   OUTLINE_IT outlines;

   blobs.set_to_list (word->blob_list ());
   for (blobs.mark_cycle_pt (); !blobs.cycled_list (); blobs.forward ()) {
     outlines.set_to_list (blobs.data ()->out_list ());
     for (outlines.mark_cycle_pt ();
          !outlines.cycled_list (); outlines.forward ()) {
       TBOX extent = outlines.data ()->bounding_box ();
       const inT16 longest_side =
         (extent.height () > extent.width ()) ? extent.height ()
                                              : extent.width ();
       ++total;
       if (longest_side < size_threshold)
         ++tiny;
     }
   }
   return (tiny >= total);
 }


 /*************************************************************************
  * insert_rej_cblobs()
  * Put rejected word blobs back into the outword.
  * NOTE!!! AFTER THIS THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER
  * OF ELEMENTS.
  *************************************************************************/
 namespace tesseract {
 // Moves every blob on the outword's reject blob list into its normal blob
 // list and rebuilds the word's text and reject map to match: each inserted
 // blob contributes a single ' ' character whose map entry is flagged with
 // setrej_rej_cblob().  Does nothing if there are no reject blobs, or if the
 // combined length would not fit the fixed 512-entry work buffers.
 // On exit best_choice, reject_map and the blob list all have the same
 // length (old length + number of reject blobs).
 void Tesseract::insert_rej_cblobs(WERD_RES *word) {
   PBLOB_IT blob_it;              //iterator over the normal blobs
   PBLOB_IT rej_blob_it;          //iterator over the rejected blobs
   const STRING *word_str;        //original text
   const STRING *word_lengths;    //byte length of each original unichar
   int old_len;                   //unichars in the original text
   int rej_len;                   //number of rejected blobs
   char new_str[512 * UNICHAR_LEN];
   char new_lengths[512];
   REJMAP new_map;
   int i = 0;                     //new_str index
   int j = 0;                     //old_str index
   int i_offset = 0;              //new_str offset
   int j_offset = 0;              //old_str offset
   int new_len;

   // NOTE(review): presumably this sorts the reject blobs left to right, as
   // the merge below compares left edges - confirm against gblob_sort_list.
   gblob_sort_list (word->outword->rej_blob_list (), TRUE);
   rej_blob_it.set_to_list (word->outword->rej_blob_list ());
   if (rej_blob_it.empty ())
     return;
   rej_len = rej_blob_it.length ();
   blob_it.set_to_list (word->outword->blob_list ());
   word_str = &(word->best_choice->unichar_string());
   word_lengths = &(word->best_choice->unichar_lengths());
   old_len = word->best_choice->length();
   ASSERT_HOST (word->reject_map.length () == old_len);
   ASSERT_HOST (blob_it.length () == old_len);
   // 511 entries plus a terminator is all new_lengths/new_str can hold.
   if ((old_len + rej_len) > 511)
     return;                      //Word is garbage anyway prevent abort
   new_map.initialise (old_len + rej_len);

   // Merge the two blob sequences until the reject list is empty.
   while (!rej_blob_it.empty ()) {
     // Take the reject blob next if the original blobs are used up, or if
     // its left edge is not to the right of the current original blob's.
     if ((j >= old_len) ||
       (rej_blob_it.data ()->bounding_box ().left () <=
     blob_it.data ()->bounding_box ().left ())) {
       /* Insert reject blob */
       if (j >= old_len)
         blob_it.add_to_end (rej_blob_it.extract ());
       else
         blob_it.add_before_stay_put (rej_blob_it.extract ());
       if (!rej_blob_it.empty ())
         rej_blob_it.forward ();
       // The inserted blob is represented by a one byte ' ' in the text.
       new_str[i_offset] = ' ';
       new_lengths[i] = 1;
       new_map[i].setrej_rej_cblob ();
       i_offset += new_lengths[i++];
     }
     else {
       // Copy the next original unichar with its length and map entry.
       strncpy(new_str + i_offset, &(*word_str)[j_offset],
               (*word_lengths)[j]);
       new_lengths[i] = (*word_lengths)[j];
       new_map[i] = word->reject_map[j];
       i_offset += new_lengths[i++];
       j_offset += (*word_lengths)[j++];
       blob_it.forward ();
     }
   }
   /* Add any extra normal blobs to strings */
   while (j < word_lengths->length ()) {
     strncpy(new_str + i_offset, &(*word_str)[j_offset],
             (*word_lengths)[j]);
     new_lengths[i] = (*word_lengths)[j];
     new_map[i] = word->reject_map[j];
     i_offset += new_lengths[i++];
     j_offset += (*word_lengths)[j++];
   }
   // Terminate both work buffers so they can be used as C strings.
   new_str[i_offset] = '\0';
   new_lengths[i] = 0;
   /*
     tprintf(
           "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
           old_len, i, new_str, new_map );
   */
   ASSERT_HOST (i == blob_it.length ());
   ASSERT_HOST (i == old_len + rej_len);
   word->reject_map = new_map;

   // Update word->best_choice if needed.
   // The replacement keeps the old rating, certainty and permuter.
   if (strcmp(new_str, word->best_choice->unichar_string().string()) != 0 ||
       strcmp(new_lengths, word->best_choice->unichar_lengths().string()) != 0) {
     WERD_CHOICE *new_choice =
       new WERD_CHOICE(new_str, new_lengths,
                       word->best_choice->rating(),
                       word->best_choice->certainty(),
                       word->best_choice->permuter(),
                       getDict().getUnicharset());
    new_choice->populate_unichars(getDict().getUnicharset());
    // word_str and word_lengths point into the old choice and are dead from
    // here on.
    delete word->best_choice;
    word->best_choice = new_choice;
   }
   new_len = word->best_choice->length();
   ASSERT_HOST (word->reject_map.length () == new_len);
   ASSERT_HOST (word->outword->blob_list ()->length () == new_len);

 }
 }  // namespace tesseract