textord/wordseg.cpp - platform/external/tesseract - Git at Google

 /**********************************************************************
  * File:        wordseg.cpp  (Formerly wspace.c)
  * Description: Code to segment the blobs into words.
  * Author:		Ray Smith
  * Created:		Fri Oct 16 11:32:28 BST 1992
  *
  * (C) Copyright 1992, Hewlett-Packard Ltd.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #include "mfcpch.h"
 #ifdef __UNIX__
 #include          <assert.h>
 #endif
 #include          "stderr.h"
 #include          "blobbox.h"
 #include          "ocrclass.h"
 #include          "lmedsq.h"
 #include          "statistc.h"
 #include          "drawtord.h"
 #include          "makerow.h"
 #include          "pitsync1.h"
 #include          "blobcmpl.h"
 #include          "tovars.h"
 #include          "topitch.h"
 #include          "tospace.h"
 #include          "fpchop.h"
 #include          "wordseg.h"

 // EXTERN is defined empty, so the EXTERN-prefixed declaration below is
 // emitted without any storage-class prefix in this translation unit.
 #define EXTERN

 EXTERN BOOL_VAR (textord_fp_chopping, TRUE, "Do fixed pitch chopping");
 // Declared here, defined elsewhere. make_words() sets ocr_alive and
 // progress on it when the pointer is non-NULL.
 extern /*"C" */ ETEXT_DESC *global_monitor;     //progress monitor

 // Not referenced in the part of this file visible to this review.
 #define FIXED_WIDTH_MULTIPLE  5
 // Size of the gaps[] array in row_words2 (its cluster_stats[] has one
 // more entry) and the cluster limit it passes to STATS::cluster.
 #define BLOCK_STATS_CLUSTERS  10

 /**********************************************************************
  * make_single_word
  *
  * Arrange the blobs into one word. There is no fixed pitch detection.
  *
  * Only the row the iterator over rows starts on is converted. All of
  * its BLOBNBOXes are extracted and their C_BLOBs moved into a single
  * WERD flagged as both start and end of line; the WERD is put in a new
  * ROW which is added to real_rows.
  *
  * one_blob   if true, every blob after the first is merged into the
  *            first one, so the word holds at most one C_BLOB.
  * rows       input rows; does nothing if the list is empty.
  * real_rows  list that receives the new ROW.
  **********************************************************************/

 void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
   TO_ROW_IT to_row_it(rows);
   if (to_row_it.empty())
     return;                      // no row to convert
   TO_ROW* row = to_row_it.data();
   // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
   // to create the word.
   C_BLOB_LIST cblobs;
   C_BLOB_IT cblob_it(&cblobs);
   BLOBNBOX_IT box_it(row->blob_list());
   for (; !box_it.empty(); box_it.forward()) {
     BLOBNBOX* bblob = box_it.extract();
     // A merge needs an existing C_BLOB to merge into. A leading blob
     // that is flagged joined_to_prev therefore starts the list instead
     // of dereferencing an empty iterator.
     const bool merge_into_prev =
         !cblob_it.empty() && (bblob->joined_to_prev() || one_blob);
     if (bblob->cblob() != NULL) {
       if (merge_into_prev) {
         // Append this blob's outlines to the previous C_BLOB and discard
         // the now-empty C_BLOB shell.
         C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
         cout_it.move_to_last();
         cout_it.add_list_after(bblob->cblob()->out_list());
         delete bblob->cblob();
       } else {
         cblob_it.add_after_then_move(bblob->cblob());
       }
     }
     // The wrapper is finished with on every path (as in make_real_word);
     // deleting it only in the non-merge branch leaked merged wrappers.
     delete bblob;
   }
   // Convert the TO_ROW to a ROW.
   ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
                           static_cast<inT16>(row->space_size));
   WERD_IT word_it(real_row->word_list());
   WERD* word = new WERD(&cblobs, 0, NULL);
   word->set_flag(W_BOL, TRUE);
   word->set_flag(W_EOL, TRUE);
   word_it.add_after_then_move(word);
   ROW_IT row_it(real_rows);
   row_it.add_after_then_move(real_row);
 }

 /**********************************************************************
  * make_words
  *
  * Arrange the blobs into words.
  *
  * Runs compute_fixed_pitch and to_spacing over port_blocks, then calls
  * make_real_words on every TO_BLOCK in that list. If global_monitor is
  * set, it is marked alive with progress 25 between the two analysis
  * steps. blocks and land_blocks are accepted but not used here.
  **********************************************************************/

 void make_words(ICOORD page_tr,              // top right of page
                 float gradient,              // page skew
                 BLOCK_LIST *blocks,          // block list (unused)
                 TO_BLOCK_LIST *land_blocks,  // landscape blocks (unused)
                 TO_BLOCK_LIST *port_blocks,  // blocks to process
                 tesseract::Tesseract* tess) {
   const BOOL8 not_landscape = !static_cast<BOOL8>(textord_test_landscape);
   compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
                       not_landscape, tess);

   if (global_monitor != NULL) {
     // Report liveness/progress to whoever is monitoring the OCR run.
     global_monitor->ocr_alive = TRUE;
     global_monitor->progress = 25;
   }

   to_spacing(page_tr, port_blocks);

   // set_row_spaces(block, FCOORD(1,0), !(BOOL8)textord_test_landscape)
   // is not called per block; the call was already commented out.
   TO_BLOCK_IT port_it(port_blocks);
   for (port_it.mark_cycle_pt(); !port_it.cycled_list(); port_it.forward()) {
     make_real_words(port_it.data(), FCOORD(1.0f, 0.0f));
   }
 }


 /**********************************************************************
  * set_row_spaces
  *
  * Set the min_space and max_nonspace members of the row so that
  * the blobs can be arranged into words.
  *
  * For every row of the block whose fixed_pitch is 0, the spacing
  * members are derived from the row's pr_space / pr_nonsp estimates and
  * textord_words_definite_spread. Rows with a non-zero fixed_pitch are
  * left alone. rotation is not used in the body.
  **********************************************************************/

 void set_row_spaces(TO_BLOCK *block,   // block to do
                     FCOORD rotation,   // for drawing (unused here)
                     BOOL8 testing_on   // correct orientation
                    ) {
   TO_ROW_IT rows = block->get_rows();
   if (rows.empty())
     return;                      // empty block

   // Only the disabled row_words()/row_words2() calls consumed this.
   const inT32 maxwidth =
       static_cast<inT32>(ceil(block->xheight * textord_words_maxspace));

   for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
     TO_ROW *cur = rows.data();
     if (cur->fixed_pitch == 0) {
       // Disabled alternative: use row_words() (or row_words2() in
       // textord_test_mode) and fall back to the defaults below only if
       // it returned 0.
       const double spread =
           (cur->pr_space - cur->pr_nonsp) * textord_words_definite_spread;
       cur->min_space = static_cast<inT32>(ceil(cur->pr_space - spread));
       cur->max_nonspace = static_cast<inT32>(floor(cur->pr_nonsp + spread));
       if (testing_on && textord_show_initial_words) {
         tprintf ("Assigning defaults %d non, %d space to row at %g\n",
                  cur->max_nonspace, cur->min_space, cur->intercept());
       }
       cur->space_threshold = (cur->max_nonspace + cur->min_space) / 2;
       cur->space_size = cur->pr_space;
       cur->kern_size = cur->pr_nonsp;
     }
 #ifndef GRAPHICS_DISABLED
     if (textord_show_initial_words && testing_on) {
       plot_word_decisions(to_win, static_cast<inT16>(cur->fixed_pitch), cur);
     }
 #endif
   }
 }


 /**********************************************************************
  * row_words
  *
  * Compute the max nonspace and min space for the row.
  *
  * Builds a histogram of the horizontal gaps between successive blobs
  * (blobs flagged joined_to_prev are skipped), smooths it and asks
  * STATS::cluster for up to 3 clusters. The cluster medians are reduced
  * to a "lower" (kern) and an "upper" (space) gap, from which
  * row->min_space, max_nonspace, space_threshold, space_size and
  * kern_size are set.
  *
  * block       supplies xheight (smoothing factor, minimum space size)
  *             and the block-level min_space/max_nonspace that are only
  *             used for a debug consistency message.
  * row         row whose blobs are measured and whose spacing is set.
  * maxwidth    upper limit of the histogram; gaps >= maxwidth are not
  *             recorded.
  * rotation    not referenced in the body.
  * testing_on  together with textord_show_initial_words enables debug
  *             output.
  *
  * Returns 0, with row->min_space and row->max_nonspace set to 0, when
  * there is not enough evidence; otherwise the total count held in
  * cluster_stats[2].
  **********************************************************************/

 inT32 row_words(                  //compute space size
                 TO_BLOCK *block,  //block it came from
                 TO_ROW *row,      //row to operate on
                 inT32 maxwidth,   //max expected space size
                 FCOORD rotation,  //for drawing
                 BOOL8 testing_on  //for debug
                ) {
   BOOL8 testing_row;             //contains testpt
   BOOL8 prev_valid;              //if decent size
   BOOL8 this_valid;              //current blob big enough
   inT32 prev_x;                  //end of prev blob
   inT32 min_gap;                 //min interesting gap
   inT32 cluster_count;           //no of clusters
   inT32 gap_index;               //which cluster
   inT32 smooth_factor;           //for smoothing stats
   BLOBNBOX *blob;                //current blob
   float lower, upper;            //clustering parameters
   float gaps[3];                 //gap clusters
   ICOORD testpt;
   TBOX blob_box;                  //bounding box
                                  //iterator
   BLOBNBOX_IT blob_it = row->blob_list ();
   STATS gap_stats (0, maxwidth);
   // Index 0 is never read below; medians come from indices 1..3.
   STATS cluster_stats[4];        //clusters

   testpt = ICOORD (textord_test_x, textord_test_y);
   smooth_factor =
     (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
   //      if (testing_on)
   //              tprintf("Row smooth factor=%d\n",smooth_factor);
   prev_valid = FALSE;
   prev_x = -MAX_INT32;
   testing_row = FALSE;
   // Pass 1: histogram of blob widths; also note whether the debug test
   // point falls inside any blob of this row.
   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
     blob = blob_it.data ();
     blob_box = blob->bounding_box ();
     if (blob_box.contains (testpt))
       testing_row = TRUE;
     gap_stats.add (blob_box.width (), 1);
   }
   // min_gap is only referenced by the commented-out validity test in
   // the next loop, so it currently has no effect on the result.
   min_gap = (inT32) floor (gap_stats.ile (textord_words_width_ile));
   gap_stats.clear ();
   // Pass 2: reuse gap_stats as a histogram of inter-blob gaps.
   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
     blob = blob_it.data ();
     if (!blob->joined_to_prev ()) {
       blob_box = blob->bounding_box ();
       //                      this_valid=blob_box.width()>=min_gap;
       this_valid = TRUE;
       // prev_valid is FALSE for the first blob, so the sentinel prev_x
       // is never used in the subtraction.
       if (this_valid && prev_valid
       && blob_box.left () - prev_x < maxwidth) {
         gap_stats.add (blob_box.left () - prev_x, 1);
       }
       prev_x = blob_box.right ();
       prev_valid = this_valid;
     }
   }
   if (gap_stats.get_total () == 0) {
     row->min_space = 0;          //no evidence
     row->max_nonspace = 0;
     return 0;
   }
   gap_stats.smooth (smooth_factor);
   lower = row->xheight * textord_words_initial_lower;
   upper = row->xheight * textord_words_initial_upper;
   // STATS::cluster is defined elsewhere; presumably it returns the
   // number of clusters found and fills cluster_stats[1..count].
   cluster_count = gap_stats.cluster (lower, upper,
     textord_spacesize_ratioprop, 3,
     cluster_stats);
   // Retry with a narrower [lower, upper] window until at least two
   // clusters appear or the window can no longer shrink.
   while (cluster_count < 2 && ceil (lower) < floor (upper)) {
                                  //shrink gap
     // NOTE(review): lower is computed from the already-updated upper.
     upper = (upper * 3 + lower) / 4;
     lower = (lower * 3 + upper) / 4;
     cluster_count = gap_stats.cluster (lower, upper,
       textord_spacesize_ratioprop, 3,
       cluster_stats);
   }
   if (cluster_count < 2) {
     row->min_space = 0;          //no evidence
     row->max_nonspace = 0;
     return 0;
   }
   for (gap_index = 0; gap_index < cluster_count; gap_index++)
     gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
   //get medians
   if (cluster_count > 2) {
     // Three clusters: choose which two medians act as kern and space.
     if (testing_on && textord_show_initial_words) {
       tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n",
         row->intercept (),
         cluster_stats[1].ile (0.5),
         cluster_stats[2].ile (0.5), cluster_stats[3].ile (0.5));
     }
     lower = gaps[0];
     if (gaps[1] > lower) {
       upper = gaps[1];           //prefer most frequent
       // If that candidate is too small to be a space, move up to the
       // third cluster when it is larger.
       if (upper < block->xheight * textord_words_min_minspace
       && gaps[2] > gaps[1]) {
         upper = gaps[2];
       }
     }
     else if (gaps[2] > lower
       && gaps[2] >= block->xheight * textord_words_min_minspace)
       upper = gaps[2];
     else if (lower >= block->xheight * textord_words_min_minspace) {
       // gaps[0] itself is big enough to be the space; gaps[1] becomes
       // the kern.
       upper = lower;             //not nice
       lower = gaps[1];
       if (testing_on && textord_show_initial_words) {
         tprintf ("Had to switch most common from lower to upper!!\n");
         gap_stats.print (stdout, TRUE);
       }
     }
     else {
       row->min_space = 0;        //no evidence
       row->max_nonspace = 0;
       return 0;
     }
   }
   else {
     // Exactly two clusters: the smaller median is the kern, the larger
     // the space.
     if (gaps[1] < gaps[0]) {
       if (testing_on && textord_show_initial_words) {
         tprintf ("Had to switch most common from lower to upper!!\n");
         gap_stats.print (stdout, TRUE);
       }
       lower = gaps[1];
       upper = gaps[0];
     }
     else {
       upper = gaps[1];
       lower = gaps[0];
     }
   }
   if (upper < block->xheight * textord_words_min_minspace) {
     row->min_space = 0;          //no evidence
     row->max_nonspace = 0;
     return 0;
   }
   // Debug-only consistency check against the block-level estimate; it
   // reports a disagreement but does not change the result.
   if (upper * 3 < block->min_space * 2 + block->max_nonspace
   || lower * 3 > block->min_space * 2 + block->max_nonspace) {
     if (testing_on && textord_show_initial_words) {
       tprintf ("Disagreement between block and row at %g!!\n",
         row->intercept ());
       tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper);
       gap_stats.print (stdout, TRUE);
     }
   }
   // min_space and max_nonspace are pulled in from upper and lower by
   // the fraction textord_words_definite_spread of their difference.
   row->min_space =
     (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
   row->max_nonspace =
     (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
   row->space_size = upper;
   row->kern_size = lower;
   if (testing_on && textord_show_initial_words) {
     if (testing_row) {
       tprintf ("GAP STATS\n");
       gap_stats.print (stdout, TRUE);
       tprintf ("SPACE stats\n");
       cluster_stats[2].print (stdout, FALSE);
       tprintf ("NONSPACE stats\n");
       cluster_stats[1].print (stdout, FALSE);
     }
     tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
       row->intercept (), row->min_space, upper,
       row->max_nonspace, lower);
   }
   return cluster_stats[2].get_total ();
 }


 /**********************************************************************
  * row_words2
  *
  * Compute the max nonspace and min space for the row.
  *
  * Variant of row_words that requests up to BLOCK_STATS_CLUSTERS gap
  * clusters and then picks, relative to block->max_nonspace, the first
  * cluster median at or below it as the kern and the first above it as
  * the space, falling back to block->pr_nonsp / block->pr_space when no
  * such cluster exists.
  *
  * block       supplies xheight, max_nonspace, pr_space and pr_nonsp.
  * row         row whose blobs are measured and whose spacing is set.
  * maxwidth    upper limit of the histogram; gaps >= maxwidth are not
  *             recorded.
  * rotation    not referenced in the body.
  * testing_on  enables debug output.
  *
  * Returns 0, with row->min_space and row->max_nonspace set to 0, when
  * no gaps or no clusters are found; otherwise 1.
  **********************************************************************/

 inT32 row_words2(                  //compute space size
                  TO_BLOCK *block,  //block it came from
                  TO_ROW *row,      //row to operate on
                  inT32 maxwidth,   //max expected space size
                  FCOORD rotation,  //for drawing
                  BOOL8 testing_on  //for debug
                 ) {
   BOOL8 testing_row;             //contains testpt
   BOOL8 prev_valid;              //if decent size
   BOOL8 this_valid;              //current blob big enough
   inT32 prev_x;                  //end of prev blob
   inT32 min_width;               //min interesting width
   inT32 valid_count;             //good gaps
   inT32 total_count;             //total gaps
   inT32 cluster_count;           //no of clusters
   inT32 prev_count;              //previous cluster_count
   inT32 gap_index;               //which cluster
   inT32 smooth_factor;           //for smoothing stats
   BLOBNBOX *blob;                //current blob
   float lower, upper;            //clustering parameters
   ICOORD testpt;
   TBOX blob_box;                  //bounding box
                                  //iterator
   BLOBNBOX_IT blob_it = row->blob_list ();
   STATS gap_stats (0, maxwidth);
                                  //gap sizes
   float gaps[BLOCK_STATS_CLUSTERS];
   // One extra entry: index 0 is never read below, medians come from
   // indices 1..cluster_count.
   STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
   //clusters

   // testpt is assigned but not otherwise used in this function.
   testpt = ICOORD (textord_test_x, textord_test_y);
   smooth_factor =
     (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
   //      if (testing_on)
   //              tprintf("Row smooth factor=%d\n",smooth_factor);
   prev_valid = FALSE;
   prev_x = -MAX_INT16;
   // testing_row is never set to TRUE in this function, so the detailed
   // stats dump near the end cannot run.
   testing_row = FALSE;
                                  //min blob size
   min_width = (inT32) block->pr_space;
   total_count = 0;
   // Pass 1: gaps between successive non-joined blobs that both count
   // as "valid".
   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
     blob = blob_it.data ();
     if (!blob->joined_to_prev ()) {
       blob_box = blob->bounding_box ();
       this_valid = blob_box.width () >= min_width;
       // The width test above is overridden: every blob counts as valid.
       this_valid = TRUE;
       if (this_valid && prev_valid
       && blob_box.left () - prev_x < maxwidth) {
         gap_stats.add (blob_box.left () - prev_x, 1);
       }
       total_count++;             //count possibles
       prev_x = blob_box.right ();
       prev_valid = this_valid;
     }
   }
   valid_count = gap_stats.get_total ();
   // Too few gaps recorded relative to the blob count: redo the
   // histogram without the validity condition.
   if (valid_count < total_count * textord_words_minlarge) {
     gap_stats.clear ();
     prev_x = -MAX_INT16;
     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
     blob_it.forward ()) {
       blob = blob_it.data ();
       if (!blob->joined_to_prev ()) {
         blob_box = blob->bounding_box ();
         if (blob_box.left () - prev_x < maxwidth) {
           gap_stats.add (blob_box.left () - prev_x, 1);
         }
         prev_x = blob_box.right ();
       }
     }
   }
   if (gap_stats.get_total () == 0) {
     row->min_space = 0;          //no evidence
     row->max_nonspace = 0;
     return 0;
   }

   cluster_count = 0;
   // Unlike row_words, the initial bounds use block->xheight and the
   // words_initial_lower/upper variables.
   lower = block->xheight * words_initial_lower;
   upper = block->xheight * words_initial_upper;
   gap_stats.smooth (smooth_factor);
   // Repeat the cluster call with the same bounds for as long as it
   // reports more clusters than the previous call and the limit has not
   // been reached. STATS::cluster is defined elsewhere; presumably it
   // refines cluster_stats between calls -- TODO confirm.
   do {
     prev_count = cluster_count;
     cluster_count = gap_stats.cluster (lower, upper,
       textord_spacesize_ratioprop,
       BLOCK_STATS_CLUSTERS, cluster_stats);
   }
   while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
   if (cluster_count < 1) {
     row->min_space = 0;
     row->max_nonspace = 0;
     return 0;
   }
   for (gap_index = 0; gap_index < cluster_count; gap_index++)
     gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
   //get medians
   if (testing_on) {
     tprintf ("cluster_count=%d:", cluster_count);
     for (gap_index = 0; gap_index < cluster_count; gap_index++)
       tprintf (" %g(%d)", gaps[gap_index],
         cluster_stats[gap_index + 1].get_total ());
     tprintf ("\n");
   }

   //Try to find proportional non-space and space for row.
   // Empty-bodied loop: stops at the first cluster whose median is not
   // above block->max_nonspace.
   for (gap_index = 0; gap_index < cluster_count
     && gaps[gap_index] > block->max_nonspace; gap_index++);
   if (gap_index < cluster_count)
     lower = gaps[gap_index];     //most frequent below
   else {
     if (testing_on)
       tprintf ("No cluster below block threshold!, using default=%g\n",
         block->pr_nonsp);
     lower = block->pr_nonsp;
   }
   // Empty-bodied loop: stops at the first cluster whose median is above
   // block->max_nonspace.
   for (gap_index = 0; gap_index < cluster_count
     && gaps[gap_index] <= block->max_nonspace; gap_index++);
   if (gap_index < cluster_count)
     upper = gaps[gap_index];     //most frequent above
   else {
     if (testing_on)
       tprintf ("No cluster above block threshold!, using default=%g\n",
         block->pr_space);
     upper = block->pr_space;
   }
   // min_space and max_nonspace are pulled in from upper and lower by
   // the fraction textord_words_definite_spread of their difference.
   row->min_space =
     (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
   row->max_nonspace =
     (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
   row->space_size = upper;
   row->kern_size = lower;
   if (testing_on) {
     if (testing_row) {
       tprintf ("GAP STATS\n");
       gap_stats.print (stdout, TRUE);
       tprintf ("SPACE stats\n");
       cluster_stats[2].print (stdout, FALSE);
       tprintf ("NONSPACE stats\n");
       cluster_stats[1].print (stdout, FALSE);
     }
     tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
       row->intercept (), row->min_space, upper,
       row->max_nonspace, lower);
   }
   return 1;
 }


 /**********************************************************************
  * make_real_words
  *
  * Convert a TO_BLOCK to a BLOCK.
  *
  * Each TO_ROW of the block is turned into a ROW, which is appended to
  * block->block's row list:
  *   - a row with no blobs but with rep_words goes to make_rep_words;
  *   - a row with blobs goes to fixed_pitch_words or make_prop_words
  *     according to row->pitch_decision (any other decision trips
  *     ASSERT_HOST);
  *   - a row with neither produces nothing.
  * Afterwards the block's pitch statistics are set and checked. An
  * empty block returns before that step.
  *
  * block     block to convert; block->block receives the rows.
  * rotation  passed through to the row builders.
  **********************************************************************/

 void make_real_words(                  //find lines
                      TO_BLOCK *block,  //block to do
                      FCOORD rotation   //for drawing
                     ) {
   TO_ROW *row;                   //current row
   TO_ROW_IT row_it = block->get_rows ();
   ROW_IT real_row_it = block->block->row_list ();

   if (row_it.empty ())
     return;                      //empty block
   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
     row = row_it.data ();
     // Start every pass with no output row. Carrying the pointer across
     // iterations would re-insert the previous row's ROW whenever the
     // current row has neither blobs nor rep_words.
     ROW *real_row = NULL;        //output row
     if (row->blob_list ()->empty () && !row->rep_words.empty ()) {
       real_row = make_rep_words (row, block);
     }
     else if (!row->blob_list ()->empty ()) {
       //                      tprintf("Row pitch_decision=%d",row->pitch_decision);
       if (row->pitch_decision == PITCH_DEF_FIXED
         || row->pitch_decision == PITCH_CORR_FIXED)
         real_row = fixed_pitch_words (row, rotation);
       else if (row->pitch_decision == PITCH_DEF_PROP
         || row->pitch_decision == PITCH_CORR_PROP)
         real_row = make_prop_words (row, rotation);
       else
         ASSERT_HOST(FALSE);      //pitch must be decided by now
     }
     if (real_row != NULL) {
                                  //put row in block
       real_row_it.add_after_then_move (real_row);
     }
   }
   block->block->set_stats (block->fixed_pitch == 0, (inT16) block->kern_size,
     (inT16) block->space_size,
     (inT16) block->fixed_pitch);
   block->block->check_pitch ();
 }


 /**********************************************************************
  * make_rep_words
  *
  * Fabricate a real row from only the repeated blob words.
  * Get the xheight from the block as it may be more meaningful.
  *
  * row    row to convert; its xheight is overwritten with the block's
  *        and its rep_words are moved into the new ROW.
  * block  block the row lives in; supplies xheight, kern_size and
  *        space_size.
  *
  * Returns the new ROW, or NULL if the row has no rep_words.
  **********************************************************************/

 ROW *make_rep_words(                 //make a row
                     TO_ROW *row,     //row to convert
                     TO_BLOCK *block  //block it lives in
                    ) {
   ROW *real_row;                 //output row
   TBOX word_box;                  //bounding box
                                  //iterator
   WERD_IT word_it = &row->rep_words;

   if (word_it.empty ())
     return NULL;
   // NOTE(review): nothing below reads word_box. The traversal is kept
   // because WERD::bounding_box() is defined elsewhere and cannot be
   // shown from here to be free of side effects. The dead xstarts[] and
   // coeffs[] locals that used to be filled from it have been removed.
   word_box = word_it.data ()->bounding_box ();
   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ())
     word_box += word_it.data ()->bounding_box ();
   row->xheight = block->xheight;
   real_row = new ROW(row,
     (inT16) block->kern_size, (inT16) block->space_size);
   word_it.set_to_list (real_row->word_list ());
                                  //put words in row
   word_it.add_list_after (&row->rep_words);
   real_row->recalc_bounding_box ();
   return real_row;
 }


 /**********************************************************************
  * make_real_word
  *
  * Construct a WERD from a given number of adjacent entries in a
  * list of BLOBNBOXs.
  *
  * box_it     iterator over the BLOBNBOX list; blobcount entries are
  *            extracted starting at its current position.
  * blobcount  number of BLOBNBOXes to consume.
  * bol        set W_BOL on the word.
  * fuzzy_sp   set W_FUZZY_SP; takes precedence over fuzzy_non.
  * fuzzy_non  set W_FUZZY_NON when fuzzy_sp is not set.
  * blanks     blank count for the word; 0 is raised to 1.
  *
  * W_EOL is set when the iterator is at the first element of its list
  * once the blobs have been consumed. Returns the new WERD.
  **********************************************************************/

 WERD *make_real_word(                      //make a WERD
                      BLOBNBOX_IT *box_it,  //iterator
                      inT32 blobcount,      //no of blobs to use
                      BOOL8 bol,            //start of line
                      BOOL8 fuzzy_sp,       //fuzzy space
                      BOOL8 fuzzy_non,      //fuzzy non-space
                      uinT8 blanks          //no of blanks
                     ) {
   PBLOB_LIST poly_blobs;         //polygonal blobs for the word
   C_BLOB_LIST chain_blobs;       //C_BLOBs for the word
   PBLOB_IT poly_it(&poly_blobs);
   C_BLOB_IT chain_it(&chain_blobs);

   for (inT32 remaining = blobcount; remaining > 0; --remaining) {
     BLOBNBOX *box = box_it->extract ();
     const bool merge = box->joined_to_prev ();
     if (box->blob () != NULL) {
       if (merge) {
         // Append the outlines to the last PBLOB and drop the shell.
         OUTLINE_IT outlines(poly_it.data ()->out_list ());
         outlines.move_to_last ();
         outlines.add_list_after (box->blob ()->out_list ());
         delete box->blob ();
       } else {
         poly_it.add_after_then_move (box->blob ());
       }
     } else if (box->cblob () != NULL) {
       if (merge) {
         // Same as above for the C_BLOB representation.
         C_OUTLINE_IT c_outlines(chain_it.data ()->out_list ());
         c_outlines.move_to_last ();
         c_outlines.add_list_after (box->cblob ()->out_list ());
         delete box->cblob ();
       } else {
         chain_it.add_after_then_move (box->cblob ());
       }
     }
     delete box;
     box_it->forward ();          //next one
   }

   if (blanks == 0)
     blanks = 1;                  //a word has at least one blank

   // Use the PBLOB list if it received anything, else the C_BLOB list.
   WERD *word = poly_it.empty ()
     ? new WERD (&chain_blobs, blanks, NULL)
     : new WERD (&poly_blobs, blanks, NULL);

   if (bol)
     word->set_flag (W_BOL, TRUE);
   if (fuzzy_sp) {
     word->set_flag (W_FUZZY_SP, TRUE);   //probably space
   } else if (fuzzy_non) {
     word->set_flag (W_FUZZY_NON, TRUE);  //probably not
   }
   if (box_it->at_first ())
     word->set_flag (W_EOL, TRUE);        //at end of line
   return word;
 }
	/**********************************************************************
	* File: wordseg.cpp (Formerly wspace.c)
	* Description: Code to segment the blobs into words.
	* Author: Ray Smith
	* Created: Fri Oct 16 11:32:28 BST 1992
	*
	* (C) Copyright 1992, Hewlett-Packard Ltd.
	** Licensed under the Apache License, Version 2.0 (the "License");
	** you may not use this file except in compliance with the License.
	** You may obtain a copy of the License at
	** http://www.apache.org/licenses/LICENSE-2.0
	** Unless required by applicable law or agreed to in writing, software
	** distributed under the License is distributed on an "AS IS" BASIS,
	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	** See the License for the specific language governing permissions and
	** limitations under the License.
	*
	**********************************************************************/

	#include "mfcpch.h"
	#ifdef __UNIX__
	#include <assert.h>
	#endif
	#include "stderr.h"
	#include "blobbox.h"
	#include "ocrclass.h"
	#include "lmedsq.h"
	#include "statistc.h"
	#include "drawtord.h"
	#include "makerow.h"
	#include "pitsync1.h"
	#include "blobcmpl.h"
	#include "tovars.h"
	#include "topitch.h"
	#include "tospace.h"
	#include "fpchop.h"
	#include "wordseg.h"

	#define EXTERN

	EXTERN BOOL_VAR (textord_fp_chopping, TRUE, "Do fixed pitch chopping");
	extern /"C" / ETEXT_DESC *global_monitor; //progress monitor

	#define FIXED_WIDTH_MULTIPLE 5
	#define BLOCK_STATS_CLUSTERS 10

	/**********************************************************************
	* make_single_word
	*
	* Arrange the blobs into one word. There is no fixed pitch detection.
	*
	* Only the row the iterator over rows starts on is converted. All of
	* its BLOBNBOXes are extracted and their C_BLOBs moved into a single
	* WERD flagged as both start and end of line; the WERD is put in a new
	* ROW which is added to real_rows.
	*
	* one_blob   if true, every blob after the first is merged into the
	*            first one, so the word holds at most one C_BLOB.
	* rows       input rows; does nothing if the list is empty.
	* real_rows  list that receives the new ROW.
	**********************************************************************/

	void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
	  TO_ROW_IT to_row_it(rows);
	  if (to_row_it.empty())
	    return;  // no row to convert
	  TO_ROW* row = to_row_it.data();
	  // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
	  // to create the word.
	  C_BLOB_LIST cblobs;
	  C_BLOB_IT cblob_it(&cblobs);
	  BLOBNBOX_IT box_it(row->blob_list());
	  for (; !box_it.empty(); box_it.forward()) {
	    BLOBNBOX* bblob = box_it.extract();
	    // A merge needs an existing C_BLOB to merge into. A leading blob
	    // that is flagged joined_to_prev therefore starts the list instead
	    // of dereferencing an empty iterator.
	    const bool merge_into_prev =
	        !cblob_it.empty() && (bblob->joined_to_prev() || one_blob);
	    if (bblob->cblob() != NULL) {
	      if (merge_into_prev) {
	        // Append this blob's outlines to the previous C_BLOB and discard
	        // the now-empty C_BLOB shell.
	        C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
	        cout_it.move_to_last();
	        cout_it.add_list_after(bblob->cblob()->out_list());
	        delete bblob->cblob();
	      } else {
	        cblob_it.add_after_then_move(bblob->cblob());
	      }
	    }
	    // The wrapper is finished with on every path; deleting it only in
	    // the non-merge branch leaked merged wrappers.
	    delete bblob;
	  }
	  // Convert the TO_ROW to a ROW.
	  ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
	                          static_cast<inT16>(row->space_size));
	  WERD_IT word_it(real_row->word_list());
	  WERD* word = new WERD(&cblobs, 0, NULL);
	  word->set_flag(W_BOL, TRUE);
	  word->set_flag(W_EOL, TRUE);
	  word_it.add_after_then_move(word);
	  ROW_IT row_it(real_rows);
	  row_it.add_after_then_move(real_row);
	}

	/**********************************************************************
	* make_words
	*
	* Arrange the blobs into words.
	*
	* Runs compute_fixed_pitch and to_spacing over port_blocks, then calls
	* make_real_words on every TO_BLOCK in that list. If global_monitor is
	* set, it is marked alive with progress 25 between the two analysis
	* steps. blocks and land_blocks are accepted but not used here.
	**********************************************************************/

	void make_words(ICOORD page_tr,              // top right of page
	                float gradient,              // page skew
	                BLOCK_LIST *blocks,          // block list (unused)
	                TO_BLOCK_LIST *land_blocks,  // landscape blocks (unused)
	                TO_BLOCK_LIST *port_blocks,  // blocks to process
	                tesseract::Tesseract* tess) {
	  const BOOL8 not_landscape = !static_cast<BOOL8>(textord_test_landscape);
	  compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
	                      not_landscape, tess);

	  if (global_monitor != NULL) {
	    // Report liveness/progress to whoever is monitoring the OCR run.
	    global_monitor->ocr_alive = TRUE;
	    global_monitor->progress = 25;
	  }

	  to_spacing(page_tr, port_blocks);

	  // set_row_spaces(block, FCOORD(1,0), !(BOOL8)textord_test_landscape)
	  // is not called per block; the call was already commented out.
	  TO_BLOCK_IT port_it(port_blocks);
	  for (port_it.mark_cycle_pt(); !port_it.cycled_list(); port_it.forward()) {
	    make_real_words(port_it.data(), FCOORD(1.0f, 0.0f));
	  }
	}


	/**********************************************************************
	* set_row_spaces
	*
	* Set the min_space and max_nonspace members of the row so that
	* the blobs can be arranged into words.
	*
	* For every row of the block whose fixed_pitch is 0, the spacing
	* members are derived from the row's pr_space / pr_nonsp estimates and
	* textord_words_definite_spread. Rows with a non-zero fixed_pitch are
	* left alone. rotation is not used in the body.
	**********************************************************************/

	void set_row_spaces(TO_BLOCK *block,   // block to do
	                    FCOORD rotation,   // for drawing (unused here)
	                    BOOL8 testing_on   // correct orientation
	                   ) {
	  TO_ROW_IT rows = block->get_rows();
	  if (rows.empty())
	    return;  // empty block

	  // Only the disabled row_words()/row_words2() calls consumed this.
	  const inT32 maxwidth =
	      static_cast<inT32>(ceil(block->xheight * textord_words_maxspace));

	  for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
	    TO_ROW *cur = rows.data();
	    if (cur->fixed_pitch == 0) {
	      // Disabled alternative: use row_words() (or row_words2() in
	      // textord_test_mode) and fall back to the defaults below only if
	      // it returned 0.
	      const double spread =
	          (cur->pr_space - cur->pr_nonsp) * textord_words_definite_spread;
	      cur->min_space = static_cast<inT32>(ceil(cur->pr_space - spread));
	      cur->max_nonspace = static_cast<inT32>(floor(cur->pr_nonsp + spread));
	      if (testing_on && textord_show_initial_words) {
	        tprintf ("Assigning defaults %d non, %d space to row at %g\n",
	                 cur->max_nonspace, cur->min_space, cur->intercept());
	      }
	      cur->space_threshold = (cur->max_nonspace + cur->min_space) / 2;
	      cur->space_size = cur->pr_space;
	      cur->kern_size = cur->pr_nonsp;
	    }
	#ifndef GRAPHICS_DISABLED
	    if (textord_show_initial_words && testing_on) {
	      plot_word_decisions(to_win, static_cast<inT16>(cur->fixed_pitch), cur);
	    }
	#endif
	  }
	}


	/**********************************************************************
	* row_words
	*
	* Compute the max nonspace and min space for the row.
	**********************************************************************/

	// Estimates the kerning (non-space) gap and the word-space gap for one row
	// by clustering the horizontal gaps between successive blobs.
	//
	// On success the row's min_space, max_nonspace, space_threshold, space_size
	// and kern_size are written and the total count of cluster_stats[2] is
	// returned.  Whenever there is not enough evidence, row->min_space and
	// row->max_nonspace are set to 0 and 0 is returned; the other row fields
	// are left untouched in that case.
	// The rotation argument is not used inside this function.
	inT32 row_words(                    //compute space size
	  TO_BLOCK *block,                  //block it came from
	  TO_ROW *row,                      //row to operate on
	  inT32 maxwidth,                   //max expected space size
	  FCOORD rotation,                  //for drawing
	  BOOL8 testing_on                  //for debug
	) {
	  BOOL8 testing_row;                //contains testpt
	  BOOL8 prev_valid;                 //if decent size
	  BOOL8 this_valid;                 //current blob big enough
	  inT32 prev_x;                     //end of prev blob
	  inT32 min_gap;                    //min interesting gap
	  inT32 cluster_count;              //no of clusters
	  inT32 gap_index;                  //which cluster
	  inT32 smooth_factor;              //for smoothing stats
	  BLOBNBOX *blob;                   //current blob
	  float lower, upper;               //clustering parameters
	  float gaps[3];                    //gap clusters
	  ICOORD testpt;
	  TBOX blob_box;                    //bounding box
	                                    //iterator
	  BLOBNBOX_IT blob_it = row->blob_list ();
	  STATS gap_stats (0, maxwidth);
	  STATS cluster_stats[4];           //clusters

	  testpt = ICOORD (textord_test_x, textord_test_y);
	  smooth_factor =
	    (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
	  //      if (testing_on)
	  //              tprintf("Row smooth factor=%d\n",smooth_factor);
	  prev_valid = FALSE;
	  prev_x = -MAX_INT32;
	  testing_row = FALSE;

	  // Pass 1: histogram the blob widths and note whether the debug test
	  // point lies inside any blob of this row.
	  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
	    blob = blob_it.data ();
	    blob_box = blob->bounding_box ();
	    if (blob_box.contains (testpt))
	      testing_row = TRUE;
	    gap_stats.add (blob_box.width (), 1);
	  }
	  // min_gap is only consumed by the width test that is commented out in
	  // pass 2, so it currently has no effect on the result.
	  min_gap = (inT32) floor (gap_stats.ile (textord_words_width_ile));
	  gap_stats.clear ();

	  // Pass 2: histogram the gaps between consecutive blobs that are not
	  // joined to their predecessor, ignoring gaps of maxwidth or more.
	  // prev_valid is FALSE for the first blob, so no gap is recorded for it.
	  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
	    blob = blob_it.data ();
	    if (!blob->joined_to_prev ()) {
	      blob_box = blob->bounding_box ();
	      //                      this_valid=blob_box.width()>=min_gap;
	      this_valid = TRUE;
	      if (this_valid && prev_valid
	          && blob_box.left () - prev_x < maxwidth) {
	        gap_stats.add (blob_box.left () - prev_x, 1);
	      }
	      prev_x = blob_box.right ();
	      prev_valid = this_valid;
	    }
	  }
	  if (gap_stats.get_total () == 0) {
	    row->min_space = 0;             //no evidence
	    row->max_nonspace = 0;
	    return 0;
	  }
	  gap_stats.smooth (smooth_factor);
	  lower = row->xheight * textord_words_initial_lower;
	  upper = row->xheight * textord_words_initial_upper;
	  cluster_count = gap_stats.cluster (lower, upper,
	                                     textord_spacesize_ratioprop, 3,
	                                     cluster_stats);
	  // Fewer than two clusters: narrow the [lower, upper] band and retry.
	  // upper moves a quarter of the way towards lower first, then lower moves
	  // a quarter of the way towards the already-updated upper.
	  while (cluster_count < 2 && ceil (lower) < floor (upper)) {
	                                    //shrink gap
	    upper = (upper * 3 + lower) / 4;
	    lower = (lower * 3 + upper) / 4;
	    cluster_count = gap_stats.cluster (lower, upper,
	                                       textord_spacesize_ratioprop, 3,
	                                       cluster_stats);
	  }
	  if (cluster_count < 2) {
	    row->min_space = 0;             //no evidence
	    row->max_nonspace = 0;
	    return 0;
	  }
	  // Cluster results are read from cluster_stats[1..cluster_count];
	  // gaps[i] holds the median of cluster i + 1.
	  for (gap_index = 0; gap_index < cluster_count; gap_index++)
	    gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
	                                    //get medians
	  if (cluster_count > 2) {
	    // Three clusters: pick which two medians act as kern (lower) and
	    // space (upper).
	    if (testing_on && textord_show_initial_words) {
	      tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n",
	               row->intercept (),
	               cluster_stats[1].ile (0.5),
	               cluster_stats[2].ile (0.5), cluster_stats[3].ile (0.5));
	    }
	    lower = gaps[0];
	    if (gaps[1] > lower) {
	      upper = gaps[1];              //prefer most frequent
	      if (upper < block->xheight * textord_words_min_minspace
	          && gaps[2] > gaps[1]) {
	        upper = gaps[2];
	      }
	    }
	    else if (gaps[2] > lower
	             && gaps[2] >= block->xheight * textord_words_min_minspace)
	      upper = gaps[2];
	    else if (lower >= block->xheight * textord_words_min_minspace) {
	      upper = lower;                //not nice
	      lower = gaps[1];
	      if (testing_on && textord_show_initial_words) {
	        tprintf ("Had to switch most common from lower to upper!!\n");
	        gap_stats.print (stdout, TRUE);
	      }
	    }
	    else {
	      row->min_space = 0;           //no evidence
	      row->max_nonspace = 0;
	      return 0;
	    }
	  }
	  else {
	    // Two clusters: the smaller median is the kern, the larger the space.
	    if (gaps[1] < gaps[0]) {
	      if (testing_on && textord_show_initial_words) {
	        tprintf ("Had to switch most common from lower to upper!!\n");
	        gap_stats.print (stdout, TRUE);
	      }
	      lower = gaps[1];
	      upper = gaps[0];
	    }
	    else {
	      upper = gaps[1];
	      lower = gaps[0];
	    }
	  }
	  // Reject a space estimate that is too small relative to the block xheight.
	  if (upper < block->xheight * textord_words_min_minspace) {
	    row->min_space = 0;             //no evidence
	    row->max_nonspace = 0;
	    return 0;
	  }
	  // Diagnostic only: report when the row estimate disagrees with the
	  // block-level thresholds.  The row values are still used below.
	  if (upper * 3 < block->min_space * 2 + block->max_nonspace
	      || lower * 3 > block->min_space * 2 + block->max_nonspace) {
	    if (testing_on && textord_show_initial_words) {
	      tprintf ("Disagreement between block and row at %g!!\n",
	               row->intercept ());
	      tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper);
	      gap_stats.print (stdout, TRUE);
	    }
	  }
	  // Pull both thresholds in from the medians by the same fraction
	  // (textord_words_definite_spread) of the lower..upper distance.
	  row->min_space =
	    (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
	  row->max_nonspace =
	    (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
	  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
	  row->space_size = upper;
	  row->kern_size = lower;
	  if (testing_on && textord_show_initial_words) {
	    if (testing_row) {
	      tprintf ("GAP STATS\n");
	      gap_stats.print (stdout, TRUE);
	      tprintf ("SPACE stats\n");
	      cluster_stats[2].print (stdout, FALSE);
	      tprintf ("NONSPACE stats\n");
	      cluster_stats[1].print (stdout, FALSE);
	    }
	    tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
	             row->intercept (), row->min_space, upper,
	             row->max_nonspace, lower);
	  }
	  return cluster_stats[2].get_total ();
	}


	/**********************************************************************
	* row_words2
	*
	* Compute the max nonspace and min space for the row.
	**********************************************************************/

	// Estimates the kerning (non-space) gap and the word-space gap for one row
	// by clustering the gaps between successive blobs and comparing the cluster
	// medians against block->max_nonspace.
	//
	// On success the row's min_space, max_nonspace, space_threshold, space_size
	// and kern_size are written and 1 is returned.  When no gaps or no clusters
	// are found, row->min_space and row->max_nonspace are set to 0 and 0 is
	// returned.  The rotation argument is not used inside this function.
	inT32 row_words2( //compute space size
	TO_BLOCK *block, //block it came from
	TO_ROW *row, //row to operate on
	inT32 maxwidth, //max expected space size
	FCOORD rotation, //for drawing
	BOOL8 testing_on //for debug
	) {
	BOOL8 testing_row; //contains testpt
	BOOL8 prev_valid; //if decent size
	BOOL8 this_valid; //current blob big enough
	inT32 prev_x; //end of prev blob
	inT32 min_width; //min interesting width
	inT32 valid_count; //good gaps
	inT32 total_count; //total gaps
	inT32 cluster_count; //no of clusters
	inT32 prev_count; //previous cluster_count
	inT32 gap_index; //which cluster
	inT32 smooth_factor; //for smoothing stats
	BLOBNBOX *blob; //current blob
	float lower, upper; //clustering parameters
	ICOORD testpt;
	TBOX blob_box; //bounding box
	//iterator
	BLOBNBOX_IT blob_it = row->blob_list ();
	STATS gap_stats (0, maxwidth);
	//gap sizes
	float gaps[BLOCK_STATS_CLUSTERS];
	STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
	//clusters

	// NOTE(review): testpt is assigned here but never read again in this
	// function, and testing_row (set FALSE below) is never set to TRUE, so the
	// "GAP STATS" debug dump near the end is currently unreachable.
	testpt = ICOORD (textord_test_x, textord_test_y);
	smooth_factor =
	(inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
	// if (testing_on)
	// tprintf("Row smooth factor=%d\n",smooth_factor);
	prev_valid = FALSE;
	prev_x = -MAX_INT16;
	testing_row = FALSE;
	//min blob size
	min_width = (inT32) block->pr_space;
	total_count = 0;
	// Pass 1: histogram the gaps between consecutive blobs that are not joined
	// to their predecessor, counting every such blob in total_count.
	for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
	blob = blob_it.data ();
	if (!blob->joined_to_prev ()) {
	blob_box = blob->bounding_box ();
	this_valid = blob_box.width () >= min_width;
	// The width test above is immediately overridden, so every blob is
	// treated as valid and min_width has no effect on the result.
	this_valid = TRUE;
	if (this_valid && prev_valid
	&& blob_box.left () - prev_x < maxwidth) {
	gap_stats.add (blob_box.left () - prev_x, 1);
	}
	total_count++; //count possibles
	prev_x = blob_box.right ();
	prev_valid = this_valid;
	}
	}
	valid_count = gap_stats.get_total ();
	// Too few usable gaps relative to the number of blobs: start over and
	// collect every gap narrower than maxwidth without the validity filter.
	if (valid_count < total_count * textord_words_minlarge) {
	gap_stats.clear ();
	prev_x = -MAX_INT16;
	for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
	blob_it.forward ()) {
	blob = blob_it.data ();
	if (!blob->joined_to_prev ()) {
	blob_box = blob->bounding_box ();
	if (blob_box.left () - prev_x < maxwidth) {
	gap_stats.add (blob_box.left () - prev_x, 1);
	}
	prev_x = blob_box.right ();
	}
	}
	}
	if (gap_stats.get_total () == 0) {
	row->min_space = 0; //no evidence
	row->max_nonspace = 0;
	return 0;
	}

	cluster_count = 0;
	lower = block->xheight * words_initial_lower;
	upper = block->xheight * words_initial_upper;
	gap_stats.smooth (smooth_factor);
	// Re-run the clustering with the same parameters for as long as each call
	// reports more clusters than the previous one and the
	// BLOCK_STATS_CLUSTERS limit has not been reached.
	do {
	prev_count = cluster_count;
	cluster_count = gap_stats.cluster (lower, upper,
	textord_spacesize_ratioprop,
	BLOCK_STATS_CLUSTERS, cluster_stats);
	}
	while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
	if (cluster_count < 1) {
	row->min_space = 0;
	row->max_nonspace = 0;
	return 0;
	}
	// Cluster results are read from cluster_stats[1..cluster_count];
	// gaps[i] holds the median of cluster i + 1.
	for (gap_index = 0; gap_index < cluster_count; gap_index++)
	gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
	//get medians
	if (testing_on) {
	tprintf ("cluster_count=%d:", cluster_count);
	for (gap_index = 0; gap_index < cluster_count; gap_index++)
	tprintf (" %g(%d)", gaps[gap_index],
	cluster_stats[gap_index + 1].get_total ());
	tprintf ("\n");
	}

	//Try to find proportional non-space and space for row.
	// lower: first cluster (in index order) whose median is at or below the
	// block's max_nonspace; falls back to block->pr_nonsp if there is none.
	for (gap_index = 0; gap_index < cluster_count
	&& gaps[gap_index] > block->max_nonspace; gap_index++);
	if (gap_index < cluster_count)
	lower = gaps[gap_index]; //most frequent below
	else {
	if (testing_on)
	tprintf ("No cluster below block threshold!, using default=%g\n",
	block->pr_nonsp);
	lower = block->pr_nonsp;
	}
	// upper: first cluster (in index order) whose median is above the block's
	// max_nonspace; falls back to block->pr_space if there is none.
	for (gap_index = 0; gap_index < cluster_count
	&& gaps[gap_index] <= block->max_nonspace; gap_index++);
	if (gap_index < cluster_count)
	upper = gaps[gap_index]; //most frequent above
	else {
	if (testing_on)
	tprintf ("No cluster above block threshold!, using default=%g\n",
	block->pr_space);
	upper = block->pr_space;
	}
	// Pull both thresholds in from lower/upper by the same fraction
	// (textord_words_definite_spread) of the lower..upper distance.
	row->min_space =
	(inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
	row->max_nonspace =
	(inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
	row->space_threshold = (row->max_nonspace + row->min_space) / 2;
	row->space_size = upper;
	row->kern_size = lower;
	if (testing_on) {
	// Unreachable at present: testing_row is never set to TRUE above.
	if (testing_row) {
	tprintf ("GAP STATS\n");
	gap_stats.print (stdout, TRUE);
	tprintf ("SPACE stats\n");
	cluster_stats[2].print (stdout, FALSE);
	tprintf ("NONSPACE stats\n");
	cluster_stats[1].print (stdout, FALSE);
	}
	tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
	row->intercept (), row->min_space, upper,
	row->max_nonspace, lower);
	}
	return 1;
	}


	/**********************************************************************
	* make_real_words
	*
	* Convert a TO_BLOCK to a BLOCK.
	**********************************************************************/

	// Builds an output ROW for each TO_ROW of the block and appends it to
	// block->block's row list, then stores the block's kern/space/pitch values
	// on block->block via set_stats() and calls check_pitch().
	//
	// Rows with no blobs but with repeated words go through make_rep_words();
	// rows with blobs go through fixed_pitch_words() or make_prop_words()
	// according to row->pitch_decision (any other decision trips ASSERT_HOST).
	// Rows with neither blobs nor repeated words produce no output row.
	// An empty block returns immediately without touching block->block.
	void make_real_words(               //find lines
	  TO_BLOCK *block,                  //block to do
	  FCOORD rotation                   //for drawing
	) {
	  TO_ROW_IT row_it = block->get_rows ();
	  ROW_IT real_row_it = block->block->row_list ();

	  if (row_it.empty ())
	    return;                         //empty block
	  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
	    TO_ROW *row = row_it.data ();   //current row
	    // Start from NULL for every row.  Previously the pointer persisted
	    // across iterations, so a row with neither blobs nor repeated words
	    // would add the previous row's ROW to the list a second time.
	    ROW *real_row = NULL;           //output row

	    if (row->blob_list ()->empty () && !row->rep_words.empty ()) {
	      real_row = make_rep_words (row, block);
	    }
	    else if (!row->blob_list ()->empty ()) {
	      //                      tprintf("Row pitch_decision=%d",row->pitch_decision);
	      if (row->pitch_decision == PITCH_DEF_FIXED
	          || row->pitch_decision == PITCH_CORR_FIXED)
	        real_row = fixed_pitch_words (row, rotation);
	      else if (row->pitch_decision == PITCH_DEF_PROP
	               || row->pitch_decision == PITCH_CORR_PROP)
	        real_row = make_prop_words (row, rotation);
	      else
	        ASSERT_HOST(FALSE);
	    }
	    if (real_row != NULL) {
	                                    //put row in block
	      real_row_it.add_after_then_move (real_row);
	    }
	  }
	  block->block->set_stats (block->fixed_pitch == 0, (inT16) block->kern_size,
	                           (inT16) block->space_size,
	                           (inT16) block->fixed_pitch);
	  block->block->check_pitch ();
	}


	/**********************************************************************
	* make_rep_words
	*
	* Fabricate a real row from only the repeated blob words.
	* Get the xheight from the block as it may be more meaningful.
	**********************************************************************/

	// Creates a ROW that contains only the repeated-character words of the
	// given TO_ROW.
	//
	// Returns NULL when row->rep_words is empty.  Otherwise row->xheight is
	// overwritten with block->xheight, a new ROW is constructed from the row
	// with the block's kern_size and space_size, all of row->rep_words is moved
	// into the new row's word list, and the new row's bounding box is
	// recalculated.  The caller receives ownership of the returned ROW.
	ROW *make_rep_words(                //make a row
	  TO_ROW *row,                      //row to convert
	  TO_BLOCK *block                   //block it lives in
	) {
	  ROW *real_row;                    //output row
	  TBOX word_box;                    //bounding box
	                                    //iterator
	  WERD_IT word_it = &row->rep_words;

	  if (word_it.empty ())
	    return NULL;
	  // NOTE(review): word_box used to feed the xstarts/coeffs arrays, which
	  // were written and never read and have been removed.  The union itself is
	  // kept because WERD::bounding_box() is not visible from here; if it has
	  // no side effects this loop can be dropped as well.
	  word_box = word_it.data ()->bounding_box ();
	  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ())
	    word_box += word_it.data ()->bounding_box ();
	  // Use the block xheight for the fabricated row.
	  row->xheight = block->xheight;
	  real_row = new ROW(row,
	                     (inT16) block->kern_size, (inT16) block->space_size);
	  word_it.set_to_list (real_row->word_list ());
	                                    //put words in row
	  word_it.add_list_after (&row->rep_words);
	  real_row->recalc_bounding_box ();
	  return real_row;
	}


	/**********************************************************************
	* make_real_word
	*
	* Construct a WERD from a given number of adjacent entries in a
	* list of BLOBNBOXs.
	**********************************************************************/

	// Extracts blobcount consecutive BLOBNBOXes from *box_it and assembles
	// their blobs into a newly allocated WERD.
	//
	// A box flagged joined_to_prev has its outlines appended to the most
	// recently added blob of the same kind and its own blob is deleted; any
	// other box contributes its blob as a new entry.  Every extracted BLOBNBOX
	// is deleted.  The word is built from the polygonal blobs if any were
	// collected, otherwise from the C_BLOBs.  blanks is raised to at least 1.
	// Flags: W_BOL if bol; W_FUZZY_SP if fuzzy_sp, else W_FUZZY_NON if
	// fuzzy_non; W_EOL if the iterator is at_first() after the extraction.
	WERD *make_real_word(               //make a WERD
	  BLOBNBOX_IT *box_it,              //iterator
	  inT32 blobcount,                  //no of blobs to use
	  BOOL8 bol,                        //start of line
	  BOOL8 fuzzy_sp,                   //fuzzy space
	  BOOL8 fuzzy_non,                  //fuzzy non-space
	  uinT8 blanks                      //no of blanks
	) {
	  OUTLINE_IT poly_outline_it;       //outlines of a polygonal blob
	  C_OUTLINE_IT c_outline_it;        //outlines of a C_BLOB
	  PBLOB_LIST poly_blobs;            //polygonal blobs in word
	  C_BLOB_LIST c_blobs;              //C_BLOBs in word
	  PBLOB_IT poly_it = &poly_blobs;
	  C_BLOB_IT c_it = &c_blobs;

	  for (inT32 remaining = blobcount; remaining > 0; remaining--) {
	    BLOBNBOX *box = box_it->extract ();

	    if (!box->joined_to_prev ()) {
	      // Stand-alone blob: becomes a new entry in the matching list.
	      if (PBLOB *poly = box->blob ())
	        poly_it.add_after_then_move (poly);
	      else if (C_BLOB *cb = box->cblob ())
	        c_it.add_after_then_move (cb);
	    }
	    else if (PBLOB *poly = box->blob ()) {
	      // Fragment: splice its outlines onto the end of the current blob.
	      poly_outline_it.set_to_list (poly_it.data ()->out_list ());
	      poly_outline_it.move_to_last ();
	      poly_outline_it.add_list_after (poly->out_list ());
	      delete poly;
	    }
	    else if (C_BLOB *cb = box->cblob ()) {
	      c_outline_it.set_to_list (c_it.data ()->out_list ());
	      c_outline_it.move_to_last ();
	      c_outline_it.add_list_after (cb->out_list ());
	      delete cb;
	    }
	    delete box;
	    box_it->forward ();             //next one
	  }

	  if (blanks < 1)
	    blanks = 1;

	  // Polygonal blobs take precedence when any were collected.
	  WERD *result = poly_it.empty ()
	    ? new WERD (&c_blobs, blanks, NULL)
	    : new WERD (&poly_blobs, blanks, NULL);

	  if (bol)
	    result->set_flag (W_BOL, TRUE);
	  if (fuzzy_sp) {
	    result->set_flag (W_FUZZY_SP, TRUE);    //probably space
	  }
	  else if (fuzzy_non) {
	    result->set_flag (W_FUZZY_NON, TRUE);   //probably not
	  }
	  if (box_it->at_first ())
	    result->set_flag (W_EOL, TRUE);         //at end of line
	  return result;
	}