8843794 take 2

Signed-off-by: Iliyan Malchev <malchev@google.com>
diff --git a/Android.mk b/Android.mk
index dc71258..4d7f6c4 100755
--- a/Android.mk
+++ b/Android.mk
@@ -236,7 +236,8 @@
 	textord/tospace.cpp	\
 	textord/tovars.cpp	\
 	textord/underlin.cpp	\
-	textord/wordseg.cpp
+	textord/wordseg.cpp     \
+	textord/pagesegmain.cpp
 #	textord/drawtord.cpp
 #	textord/drawedg.cpp	
 
diff --git a/BUILD b/BUILD
index ba9da2b..2e2a254 100644
--- a/BUILD
+++ b/BUILD
@@ -237,6 +237,7 @@
                    "textord/gap_map.cpp",
                    "textord/makerow.cpp",
                    "textord/oldbasel.cpp",
+                   "textord/pagesegmain.cpp",
                    "textord/pithsync.cpp",
                    "textord/pitsync1.cpp",
                    "textord/scanedg.cpp",
diff --git a/ccmain/baseapi.cpp b/ccmain/baseapi.cpp
index ffa33d4..2fa1928 100644
--- a/ccmain/baseapi.cpp
+++ b/ccmain/baseapi.cpp
@@ -200,6 +200,15 @@
   return true;
 }
 
+// Set the current page segmentation mode. Defaults to PSM_AUTO.
+// The mode is stored as an INT_VARIABLE so it can also be modified by
+// ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
+void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
+  if (tesseract_ == NULL)
+    tesseract_ = new Tesseract;  // Created on demand so the mode can be stored.
+  tesseract_->tessedit_pageseg_mode.set_value(mode);
+}
+
 // Recognize a rectangle from an image and return the result as a string.
 // May be called many times for a single Init.
 // Currently has no error checking.
@@ -332,6 +341,9 @@
 #ifdef HAVE_LIBLEPT
     tesseract_->pgeditor_main(block_list_);
 #endif
+    delete page_res_;
+    page_res_ = NULL;
+    return -1;
   } else if (tesseract_->tessedit_train_from_boxes) {
     apply_box_training(*output_file_, block_list_);
   } else {
@@ -717,30 +729,6 @@
   return true;
 }
 
-#ifndef HAVE_LIBLEPT
-namespace tesseract {
-void pgeditor_read_file(STRING &filename,
-                        BLOCK_LIST *blocks,  // block list to add to
-			Tesseract *tess)
-{
-  STRING name = filename;        //truncated name
-  const char *lastdot;           //of name
-  TO_BLOCK_LIST land_blocks, port_blocks;
-  TBOX page_box;
-
-  lastdot = strrchr (name.string (), '.');
-  if (lastdot != NULL)
-    name[lastdot-name.string()] = '\0';
-  if (!read_pd_file (name, page_image.get_xsize (), page_image.get_ysize (),
-                     blocks)) {
-    segment_page(blocks);
-  }
-  find_components(blocks, &land_blocks, &port_blocks, &page_box);
-  textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks, tess);
-}
-}
-#endif//HAVE_LIBLEPT
-
 // Run the thresholder to make the thresholded image.
 void TessBaseAPI::Threshold() {
   thresholder_->ThresholdToIMAGE(&page_image);
@@ -752,19 +740,17 @@
 
 // Find lines from the image making the BLOCK_LIST.
 int TessBaseAPI::FindLines() {
-  // The following call creates a full-page block and then runs connected
-  // component analysis and text line creation.
   if (input_file_ == NULL)
     input_file_ = new STRING(kInputFile);
   if (tesseract_ == NULL) {
     tesseract_ = new Tesseract;
     tesseract_->InitAdaptiveClassifier();
   }
-#ifdef HAVE_LIBLEPT
-  tesseract_->pgeditor_read_file(*input_file_, block_list_);
-#else
-  tesseract::pgeditor_read_file(*input_file_, block_list_, tesseract_);
-#endif
+  ASSERT_HOST(page_image.get_xsize() == rect_width_ ||
+              page_image.get_xsize() == rect_width_ - 1);
+  ASSERT_HOST(page_image.get_ysize() == rect_height_ ||
+              page_image.get_ysize() == rect_height_ - 1);
+  tesseract_->SegmentPage(input_file_, NULL, &page_image, block_list_);
   return 0;
 }
 
diff --git a/ccmain/baseapi.h b/ccmain/baseapi.h
index a660392..b98643c 100644
--- a/ccmain/baseapi.h
+++ b/ccmain/baseapi.h
@@ -43,6 +43,17 @@
 typedef int (Dict::*DictFunc)(void* dawg, void* node, int char_index,
                               char prevchar, const char *word, int word_end);
 
+enum PageSegMode {  // Integer values are what tessedit_pageseg_mode stores.
+  PSM_AUTO,           // 0: Fully automatic page segmentation. (Default.)
+  PSM_SINGLE_COLUMN,  // 1: Assume a single column of text of variable sizes.
+  PSM_SINGLE_BLOCK,   // 2: Assume a single uniform block of text.
+  PSM_SINGLE_LINE,    // 3: Treat the image as a single text line.
+  PSM_SINGLE_WORD,    // 4: Treat the image as a single word.
+  PSM_SINGLE_CHAR,    // 5: Treat the image as a single character.
+
+  PSM_COUNT           // Number of enum entries.
+};
+
 // Base class for all tesseract APIs.
 // Specific classes can add ability to work on different inputs or produce
 // different outputs.
@@ -110,6 +121,11 @@
   // and also accepts a relative or absolute path name.
   bool ReadConfigFile(const char* filename);
 
+  // Set the current page segmentation mode. Defaults to PSM_AUTO.
+  // The mode is stored as an INT_VARIABLE so it can also be modified by
+  // ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
+  void SetPageSegMode(PageSegMode mode);
+
   // Recognize a rectangle from an image and return the result as a string.
   // May be called many times for a single Init.
   // Currently has no error checking.
diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp
index 3543507..326965a 100644
--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@@ -28,6 +28,8 @@
                 "Take segmentation and labeling from box file"),
     BOOL_MEMBER(tessedit_train_from_boxes, false,
                 "Generate training data from boxed chars"),
+    INT_MEMBER(tessedit_pageseg_mode, 0,
+               "Page seg mode: 0=auto, 1=col, 2=block, 3=line, 4=word, 5=char"),
     STRING_MEMBER(tessedit_char_blacklist, "",
                   "Blacklist of chars not to recognize"),
     STRING_MEMBER(tessedit_char_whitelist, "",
diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h
index 5a29b64..5c16788 100644
--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
@@ -76,6 +76,8 @@
   ~Tesseract();
 
   void SetBlackAndWhitelist();
+  void SegmentPage(const STRING* input_file,
+                   Pix* pix, IMAGE* image, BLOCK_LIST* blocks);
 
   //// control.h /////////////////////////////////////////////////////////
   void recog_all_words(                                //process words
@@ -237,15 +239,6 @@
   void adapt_to_good_samples(WERD_RES *word,
                              CHAR_SAMPLES_LIST *char_clusters,
                              CHAR_SAMPLE_LIST *chars_waiting);
-
-  BOOL_VAR_H(tessedit_resegment_from_boxes, false,
-             "Take segmentation and labeling from box file");
-  BOOL_VAR_H(tessedit_train_from_boxes, false,
-             "Generate training data from boxed chars");
-  STRING_VAR_H(tessedit_char_blacklist, "",
-               "Blacklist of chars not to recognize");
-  STRING_VAR_H(tessedit_char_whitelist, "",
-               "Whitelist of chars to recognize");
   BOOL8 word_adaptable(  //should we adapt?
                        WERD_RES *word,
                        uinT16 mode);
@@ -467,6 +460,18 @@
                           );
   //// fixxht.cpp ///////////////////////////////////////////////////////
   void check_block_occ(WERD_RES *word_res);
+
+  //// Data members ///////////////////////////////////////////////////////
+  BOOL_VAR_H(tessedit_resegment_from_boxes, false,
+             "Take segmentation and labeling from box file");
+  BOOL_VAR_H(tessedit_train_from_boxes, false,
+             "Generate training data from boxed chars");
+  INT_VAR_H(tessedit_pageseg_mode, 0,
+            "Page seg mode: 0=auto, 1=col, 2=block, 3=line, 4=word, 5=char");
+  STRING_VAR_H(tessedit_char_blacklist, "",
+               "Blacklist of chars not to recognize");
+  STRING_VAR_H(tessedit_char_whitelist, "",
+               "Whitelist of chars to recognize");
 };
 
 }  // namespace tesseract
diff --git a/textord/makerow.cpp b/textord/makerow.cpp
index 74f6218..ab2ff64 100644
--- a/textord/makerow.cpp
+++ b/textord/makerow.cpp
@@ -121,6 +121,52 @@
 #define MAX_HEIGHT_MODES  12
 
 /**********************************************************************
+ * make_single_row
+ *
+ * Arrange the blobs into a single row.
+ **********************************************************************/
+
+float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
+  BLOBNBOX_IT blob_it = &block->blobs;
+  TO_ROW_IT row_it = block->get_rows();
+
+  // Merge the small, noise and large blobs into block->blobs as well.
+  blob_it.add_list_after(&block->small_blobs);
+  blob_it.add_list_after(&block->noise_blobs);
+  blob_it.add_list_after(&block->large_blobs);
+  blob_it.sort(blob_x_order);  // Merge order is irrelevant: sorted here.
+  blob_it.move_to_first();
+  TO_ROW* row = NULL;  // Created from the first blob extracted below.
+  // Move every blob out of the block's list into a single TO_ROW.
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX* blob = blob_it.extract();
+    int top = blob->bounding_box().top();
+    int bottom = blob->bounding_box().bottom();
+    if (row == NULL) {
+      row = new TO_ROW(blob, top, bottom, block->line_size);
+      row_it.add_before_then_move(row);
+    } else {
+      row->add_blob(blob, top, bottom, block->line_size);
+    }
+  }
+  // Fit an LMS line to every row in the block's row list.
+  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward())
+    fit_lms_line(row_it.data());
+  float gradient;   // Presumably an output of compute_page_skew - confirm.
+  float fit_error;  // Not read again in this function.
+  // Compute the skew based on the fitted line.
+  compute_page_skew(blocks, gradient, fit_error);
+  FCOORD rotation(1.0f, 0.0f);  // Presumably means "no rotation" - confirm.
+  // Associate i dots and other diacriticals with the appropriate blobs.
+  pre_associate_blobs(page_tr, block, rotation, false);
+  int block_edge = block->block->bounding_box().left();
+  fit_parallel_rows(block, gradient, rotation, block_edge, false);
+  // Make the curved baselines and setup some key block members.
+  make_spline_rows(block, gradient, rotation, block_edge, false);
+  return gradient;  // The skew value obtained from compute_page_skew above.
+}
+
+/**********************************************************************
  * make_rows
  *
  * Arrange the blobs into rows.
diff --git a/textord/makerow.h b/textord/makerow.h
index ee0c67f..afde328 100644
--- a/textord/makerow.h
+++ b/textord/makerow.h
@@ -109,6 +109,8 @@
 extern double_VAR_H (textord_descx_ratio_min, 0.15, "Min desc/xheight");
 extern double_VAR_H (textord_descx_ratio_max, 0.6, "Max desc/xheight");
 extern double_VAR_H (textord_xheight_error_margin, 0.1, "Accepted variation");
+
+float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks);
 float make_rows(                             //make rows
                 ICOORD page_tr,              //top right
                 BLOCK_LIST *blocks,          //block list
diff --git a/textord/pagesegmain.cpp b/textord/pagesegmain.cpp
new file mode 100644
index 0000000..fd85f16
--- /dev/null
+++ b/textord/pagesegmain.cpp
@@ -0,0 +1,112 @@
+/**********************************************************************
+ * File:        pagesegmain.cpp
+ * Description: Top-level page segmenter for Tesseract.
+ * Author:      Ray Smith
+ * Created:     Thu Sep 25 17:12:01 PDT 2008
+ *
+ * (C) Copyright 2008, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#ifdef HAVE_LIBLEPT
+// Include leptonica library only if autoconf (or makefile etc) tell us to.
+#include "allheaders.h"
+#endif
+
+#include "tesseractclass.h"
+#include "img.h"
+#include "blobbox.h"
+#include "blread.h"
+#include "wordseg.h"
+#include "makerow.h"
+#include "baseapi.h"
+#include "tordmain.h"
+
+namespace tesseract {
+
+// Minimum believable resolution.
+const int kMinCredibleResolution = 70;
+// Default resolution used if input is not believable.
+const int kDefaultResolution = 300;
+
+// Segment the page according to the current value of tessedit_pageseg_mode.
+// If pix is not NULL (and HAVE_LIBLEPT is defined), it is used as the source
+// image and copied to image, otherwise it just uses image as the input.
+// On return the blocks list owns all the constructed page layout.
+void Tesseract::SegmentPage(const STRING* input_file,
+                            Pix* pix, IMAGE* image, BLOCK_LIST* blocks) {
+  int width = image->get_xsize();
+  int height = image->get_ysize();
+  int resolution = image->get_res();
+#ifdef HAVE_LIBLEPT
+  if (pix != NULL) {
+    width = pixGetWidth(pix);
+    height = pixGetHeight(pix);
+    resolution = pixGetXRes(pix);
+    image->FromPix(pix);
+  }
+#endif
+  // Zero resolution messes up the algorithms, so make sure it is credible.
+  if (resolution < kMinCredibleResolution)
+    resolution = kDefaultResolution;  // NOTE(review): not used after this.
+  // If a UNLV zone file can be found, use that instead of segmentation.
+  if (input_file != NULL && input_file->length() > 0) {
+    STRING name = *input_file;
+    const char* lastdot = strrchr(name.string(), '.');
+    if (lastdot != NULL)
+      name[lastdot - name.string()] = '\0';  // Truncate name at the last dot.
+    read_pd_file(name, width, height, blocks);
+  }
+  if (blocks->empty()) {
+    // No blocks were read from a UNLV file. Add one width x height block.
+    BLOCK_IT block_it(blocks);
+    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
+    block_it.add_to_end(block);
+    // TODO(rays) Add call to AutoPageSeg here.
+  }
+  TO_BLOCK_LIST land_blocks, port_blocks;
+  TBOX page_box;
+  find_components(blocks, &land_blocks, &port_blocks, &page_box);
+
+  TO_BLOCK_IT to_block_it(&port_blocks);
+  TO_BLOCK* to_block = to_block_it.data();  // Assumes port_blocks is non-empty.
+  if (tessedit_pageseg_mode <= PSM_SINGLE_BLOCK ||
+      to_block->line_size < 2) {
+    // For now, AUTO, SINGLE_COLUMN and SINGLE_BLOCK (and any mode when the
+    // first block's line_size is below 2) all map to the old textord.
+    textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks,
+                 this);
+  } else {
+    // SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
+    float gradient = make_single_row(page_box.topright(),
+                                     to_block, &port_blocks);
+    if (tessedit_pageseg_mode == PSM_SINGLE_LINE) {
+      // SINGLE_LINE uses the old word maker on the single line.
+      make_words(page_box.topright(), gradient, blocks,
+                 &land_blocks, &port_blocks, this);
+    } else {
+      // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
+      // single word, and in SINGLE_CHAR mode, all the outlines
+      // go in a single blob.
+      make_single_word(tessedit_pageseg_mode == PSM_SINGLE_CHAR,
+                       to_block->get_rows(), to_block->block->row_list());
+    }
+  }
+}
+
+}  // namespace tesseract.
+
diff --git a/textord/wordseg.cpp b/textord/wordseg.cpp
index e83dd68..8919249 100644
--- a/textord/wordseg.cpp
+++ b/textord/wordseg.cpp
@@ -45,6 +45,47 @@
 #define BLOCK_STATS_CLUSTERS  10
 
 /**********************************************************************
+ * make_single_word
+ *
+ * Arrange the blobs into one word. There is no fixed pitch detection.
+ **********************************************************************/
+
+void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
+  TO_ROW_IT to_row_it(rows);
+  TO_ROW* row = to_row_it.data();  // Only the first TO_ROW is used.
+  // Moves the blobs out of their BLOBNBOXes into cblobs, ready to create the
+  // word. If one_blob is true, all the outlines are merged into a single blob.
+  C_BLOB_LIST cblobs;
+  C_BLOB_IT cblob_it(&cblobs);
+  BLOBNBOX_IT box_it(row->blob_list());
+  for (;!box_it.empty(); box_it.forward()) {
+    BLOBNBOX* bblob= box_it.extract();
+    if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
+      if (bblob->cblob() != NULL) {  // Merge outlines into the previous blob.
+        C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
+        cout_it.move_to_last();
+        cout_it.add_list_after(bblob->cblob()->out_list());
+        delete bblob->cblob();  // Its outlines have been moved out.
+      }
+    } else {
+      if (bblob->cblob() != NULL)  // Start a new blob in the word.
+        cblob_it.add_after_then_move(bblob->cblob());
+    }
+    delete bblob;  // The extracted wrapper is finished with in both branches.
+  }
+  // Convert the TO_ROW to a ROW holding the single word; add it to real_rows.
+  ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
+                          static_cast<inT16>(row->space_size));
+  WERD_IT word_it(real_row->word_list());
+  WERD* word = new WERD(&cblobs, 0, NULL);
+  word->set_flag(W_BOL, TRUE);
+  word->set_flag(W_EOL, TRUE);
+  word_it.add_after_then_move(word);
+  ROW_IT row_it(real_rows);
+  row_it.add_after_then_move(real_row);
+}
+
+/**********************************************************************
  * make_words
  *
  * Arrange the blobs into words.
@@ -535,7 +576,7 @@
   coeffs[1] = row->line_m ();
   coeffs[2] = row->line_c ();
   row->xheight = block->xheight;
-  real_row = new ROW (row,
+  real_row = new ROW(row,
     (inT16) block->kern_size, (inT16) block->space_size);
   word_it.set_to_list (real_row->word_list ());
                                  //put words in row
diff --git a/textord/wordseg.h b/textord/wordseg.h
index 3a6c115..5f30868 100644
--- a/textord/wordseg.h
+++ b/textord/wordseg.h
@@ -29,6 +29,7 @@
 }
 
 extern BOOL_VAR_H (textord_fp_chopping, TRUE, "Do fixed pitch chopping");
+void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows);
 void make_words(                             //make words
                 ICOORD page_tr,              //top right
                 float gradient,              //page skew