8843794 take 2
Signed-off-by: Iliyan Malchev <malchev@google.com>
diff --git a/Android.mk b/Android.mk
index dc71258..4d7f6c4 100755
--- a/Android.mk
+++ b/Android.mk
@@ -236,7 +236,8 @@
textord/tospace.cpp \
textord/tovars.cpp \
textord/underlin.cpp \
- textord/wordseg.cpp
+ textord/wordseg.cpp \
+ textord/pagesegmain.cpp
# textord/drawtord.cpp
# textord/drawedg.cpp
diff --git a/BUILD b/BUILD
index ba9da2b..2e2a254 100644
--- a/BUILD
+++ b/BUILD
@@ -237,6 +237,7 @@
"textord/gap_map.cpp",
"textord/makerow.cpp",
"textord/oldbasel.cpp",
+ "textord/pagesegmain.cpp",
"textord/pithsync.cpp",
"textord/pitsync1.cpp",
"textord/scanedg.cpp",
diff --git a/ccmain/baseapi.cpp b/ccmain/baseapi.cpp
index ffa33d4..2fa1928 100644
--- a/ccmain/baseapi.cpp
+++ b/ccmain/baseapi.cpp
@@ -200,6 +200,15 @@
return true;
}
+// Set the current page segmentation mode. Defaults to PSM_AUTO.
+// The mode is stored as an INT_VARIABLE so it can also be modified by
+// ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
+void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
+ if (tesseract_ == NULL)  // Create the Tesseract instance on first use.
+ tesseract_ = new Tesseract;  // NOTE(review): no InitAdaptiveClassifier() as in FindLines - confirm.
+ tesseract_->tessedit_pageseg_mode.set_value(mode);
+}
+
// Recognize a rectangle from an image and return the result as a string.
// May be called many times for a single Init.
// Currently has no error checking.
@@ -332,6 +341,9 @@
#ifdef HAVE_LIBLEPT
tesseract_->pgeditor_main(block_list_);
#endif
+ delete page_res_;
+ page_res_ = NULL;
+ return -1;
} else if (tesseract_->tessedit_train_from_boxes) {
apply_box_training(*output_file_, block_list_);
} else {
@@ -717,30 +729,6 @@
return true;
}
-#ifndef HAVE_LIBLEPT
-namespace tesseract {
-void pgeditor_read_file(STRING &filename,
- BLOCK_LIST *blocks, // block list to add to
- Tesseract *tess)
-{
- STRING name = filename; //truncated name
- const char *lastdot; //of name
- TO_BLOCK_LIST land_blocks, port_blocks;
- TBOX page_box;
-
- lastdot = strrchr (name.string (), '.');
- if (lastdot != NULL)
- name[lastdot-name.string()] = '\0';
- if (!read_pd_file (name, page_image.get_xsize (), page_image.get_ysize (),
- blocks)) {
- segment_page(blocks);
- }
- find_components(blocks, &land_blocks, &port_blocks, &page_box);
- textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks, tess);
-}
-}
-#endif//HAVE_LIBLEPT
-
// Run the thresholder to make the thresholded image.
void TessBaseAPI::Threshold() {
thresholder_->ThresholdToIMAGE(&page_image);
@@ -752,19 +740,17 @@
// Find lines from the image making the BLOCK_LIST.
int TessBaseAPI::FindLines() {
- // The following call creates a full-page block and then runs connected
- // component analysis and text line creation.
if (input_file_ == NULL)
input_file_ = new STRING(kInputFile);
if (tesseract_ == NULL) {
tesseract_ = new Tesseract;
tesseract_->InitAdaptiveClassifier();
}
-#ifdef HAVE_LIBLEPT
- tesseract_->pgeditor_read_file(*input_file_, block_list_);
-#else
- tesseract::pgeditor_read_file(*input_file_, block_list_, tesseract_);
-#endif
+ ASSERT_HOST(page_image.get_xsize() == rect_width_ ||
+ page_image.get_xsize() == rect_width_ - 1);
+ ASSERT_HOST(page_image.get_ysize() == rect_height_ ||
+ page_image.get_ysize() == rect_height_ - 1);
+ tesseract_->SegmentPage(input_file_, NULL, &page_image, block_list_);
return 0;
}
diff --git a/ccmain/baseapi.h b/ccmain/baseapi.h
index a660392..b98643c 100644
--- a/ccmain/baseapi.h
+++ b/ccmain/baseapi.h
@@ -43,6 +43,17 @@
typedef int (Dict::*DictFunc)(void* dawg, void* node, int char_index,
char prevchar, const char *word, int word_end);
+enum PageSegMode {
+ PSM_AUTO, // 0: Fully automatic page segmentation. (Default.)
+ PSM_SINGLE_COLUMN, // 1: Assume a single column of text of variable sizes.
+ PSM_SINGLE_BLOCK, // 2: Assume a single uniform block of text.
+ PSM_SINGLE_LINE, // 3: Treat the image as a single text line.
+ PSM_SINGLE_WORD, // 4: Treat the image as a single word.
+ PSM_SINGLE_CHAR, // 5: Treat the image as a single character.
+
+ PSM_COUNT // 6: Number of enum entries (not a valid mode).
+};
+
// Base class for all tesseract APIs.
// Specific classes can add ability to work on different inputs or produce
// different outputs.
@@ -110,6 +121,11 @@
// and also accepts a relative or absolute path name.
bool ReadConfigFile(const char* filename);
+ // Set the current page segmentation mode. Defaults to PSM_AUTO.
+ // The mode is stored as an INT_VARIABLE so it can also be modified by
+ // ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
+ void SetPageSegMode(PageSegMode mode);
+
// Recognize a rectangle from an image and return the result as a string.
// May be called many times for a single Init.
// Currently has no error checking.
diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp
index 3543507..326965a 100644
--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@@ -28,6 +28,8 @@
"Take segmentation and labeling from box file"),
BOOL_MEMBER(tessedit_train_from_boxes, false,
"Generate training data from boxed chars"),
+ INT_MEMBER(tessedit_pageseg_mode, 0,
+ "Page seg mode: 0=auto, 1=col, 2=block, 3=line, 4=word, 5=char"),
STRING_MEMBER(tessedit_char_blacklist, "",
"Blacklist of chars not to recognize"),
STRING_MEMBER(tessedit_char_whitelist, "",
diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h
index 5a29b64..5c16788 100644
--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
@@ -76,6 +76,8 @@
~Tesseract();
void SetBlackAndWhitelist();
+ void SegmentPage(const STRING* input_file,
+ Pix* pix, IMAGE* image, BLOCK_LIST* blocks);
//// control.h /////////////////////////////////////////////////////////
void recog_all_words( //process words
@@ -237,15 +239,6 @@
void adapt_to_good_samples(WERD_RES *word,
CHAR_SAMPLES_LIST *char_clusters,
CHAR_SAMPLE_LIST *chars_waiting);
-
- BOOL_VAR_H(tessedit_resegment_from_boxes, false,
- "Take segmentation and labeling from box file");
- BOOL_VAR_H(tessedit_train_from_boxes, false,
- "Generate training data from boxed chars");
- STRING_VAR_H(tessedit_char_blacklist, "",
- "Blacklist of chars not to recognize");
- STRING_VAR_H(tessedit_char_whitelist, "",
- "Whitelist of chars to recognize");
BOOL8 word_adaptable( //should we adapt?
WERD_RES *word,
uinT16 mode);
@@ -467,6 +460,18 @@
);
//// fixxht.cpp ///////////////////////////////////////////////////////
void check_block_occ(WERD_RES *word_res);
+
+ //// Data members ///////////////////////////////////////////////////////
+ BOOL_VAR_H(tessedit_resegment_from_boxes, false,
+ "Take segmentation and labeling from box file");
+ BOOL_VAR_H(tessedit_train_from_boxes, false,
+ "Generate training data from boxed chars");
+ INT_VAR_H(tessedit_pageseg_mode, 0,
+ "Page seg mode: 0=auto, 1=col, 2=block, 3=line, 4=word, 5=char");
+ STRING_VAR_H(tessedit_char_blacklist, "",
+ "Blacklist of chars not to recognize");
+ STRING_VAR_H(tessedit_char_whitelist, "",
+ "Whitelist of chars to recognize");
};
} // namespace tesseract
diff --git a/textord/makerow.cpp b/textord/makerow.cpp
index 74f6218..ab2ff64 100644
--- a/textord/makerow.cpp
+++ b/textord/makerow.cpp
@@ -121,6 +121,52 @@
#define MAX_HEIGHT_MODES 12
/**********************************************************************
+ * make_single_row
+ *
+ * Arrange the blobs into a single row.
+ **********************************************************************/
+
+float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
+ BLOBNBOX_IT blob_it = &block->blobs;
+ TO_ROW_IT row_it = block->get_rows();
+
+ // Merge the small, noise and large blobs into block->blobs, then sort them.
+ blob_it.add_list_after(&block->small_blobs);
+ blob_it.add_list_after(&block->noise_blobs);
+ blob_it.add_list_after(&block->large_blobs);
+ blob_it.sort(blob_x_order);
+ blob_it.move_to_first();
+ TO_ROW* row = NULL;
+ // Move every blob into a single TO_ROW, which is created for the first blob.
+ for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+ BLOBNBOX* blob = blob_it.extract();
+ int top = blob->bounding_box().top();
+ int bottom = blob->bounding_box().bottom();
+ if (row == NULL) {
+ row = new TO_ROW(blob, top, bottom, block->line_size);
+ row_it.add_before_then_move(row);
+ } else {
+ row->add_blob(blob, top, bottom, block->line_size);
+ }
+ }
+ // Fit an LMS line to each row in the block's row list.
+ for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward())
+ fit_lms_line(row_it.data());
+ float gradient;
+ float fit_error;
+ // Compute the skew based on the fitted line (fit_error is not used after).
+ compute_page_skew(blocks, gradient, fit_error);
+ FCOORD rotation(1.0f, 0.0f);
+ // Associate i dots and other diacriticals with the appropriate blobs.
+ pre_associate_blobs(page_tr, block, rotation, false);
+ int block_edge = block->block->bounding_box().left();
+ fit_parallel_rows(block, gradient, rotation, block_edge, false);
+ // Make the curved baselines and setup some key block members.
+ make_spline_rows(block, gradient, rotation, block_edge, false);
+ return gradient;
+}
+
+/**********************************************************************
* make_rows
*
* Arrange the blobs into rows.
diff --git a/textord/makerow.h b/textord/makerow.h
index ee0c67f..afde328 100644
--- a/textord/makerow.h
+++ b/textord/makerow.h
@@ -109,6 +109,8 @@
extern double_VAR_H (textord_descx_ratio_min, 0.15, "Min desc/xheight");
extern double_VAR_H (textord_descx_ratio_max, 0.6, "Max desc/xheight");
extern double_VAR_H (textord_xheight_error_margin, 0.1, "Accepted variation");
+
+float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks);
float make_rows( //make rows
ICOORD page_tr, //top right
BLOCK_LIST *blocks, //block list
diff --git a/textord/pagesegmain.cpp b/textord/pagesegmain.cpp
new file mode 100644
index 0000000..fd85f16
--- /dev/null
+++ b/textord/pagesegmain.cpp
@@ -0,0 +1,112 @@
+/**********************************************************************
+ * File: pagesegmain.cpp
+ * Description: Top-level page segmenter for Tesseract.
+ * Author: Ray Smith
+ * Created: Thu Sep 25 17:12:01 PDT 2008
+ *
+ * (C) Copyright 2008, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#ifdef HAVE_LIBLEPT
+// Include leptonica library only if autoconf (or makefile etc) tell us to.
+#include "allheaders.h"
+#endif
+
+#include "tesseractclass.h"
+#include "img.h"
+#include "blobbox.h"
+#include "blread.h"
+#include "wordseg.h"
+#include "makerow.h"
+#include "baseapi.h"
+#include "tordmain.h"
+
+namespace tesseract {
+
+// Minimum believable resolution; SegmentPage replaces anything lower.
+const int kMinCredibleResolution = 70;
+// Default resolution used if the input resolution is not believable.
+const int kDefaultResolution = 300;
+
+// Segment the page according to the current value of tessedit_pageseg_mode.
+// If built with HAVE_LIBLEPT and pix is not NULL, pix is used as the source
+// image and copied to image; otherwise image itself is the input.
+// On return the blocks list owns all the constructed page layout.
+void Tesseract::SegmentPage(const STRING* input_file,
+ Pix* pix, IMAGE* image, BLOCK_LIST* blocks) {
+ int width = image->get_xsize();
+ int height = image->get_ysize();
+ int resolution = image->get_res();
+#ifdef HAVE_LIBLEPT
+ if (pix != NULL) {
+ width = pixGetWidth(pix);
+ height = pixGetHeight(pix);
+ resolution = pixGetXRes(pix);
+ image->FromPix(pix);
+ }
+#endif
+ // Zero resolution messes up the algorithms, so make sure it is credible.
+ if (resolution < kMinCredibleResolution)
+ resolution = kDefaultResolution;  // NOTE(review): resolution is not read after this point.
+ // If a UNLV zone file can be found (input_file minus extension), use that.
+ if (input_file != NULL && input_file->length() > 0) {
+ STRING name = *input_file;
+ const char* lastdot = strrchr(name.string(), '.');
+ if (lastdot != NULL)
+ name[lastdot - name.string()] = '\0';
+ read_pd_file(name, width, height, blocks);
+ }
+ if (blocks->empty()) {
+ // No blocks were read: create a single block sized width x height.
+ BLOCK_IT block_it(blocks);
+ BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
+ block_it.add_to_end(block);
+ // TODO(rays) Add call to AutoPageSeg here.
+ }
+ TO_BLOCK_LIST land_blocks, port_blocks;
+ TBOX page_box;
+ find_components(blocks, &land_blocks, &port_blocks, &page_box);
+
+ TO_BLOCK_IT to_block_it(&port_blocks);
+ TO_BLOCK* to_block = to_block_it.data();  // NOTE(review): one port block only; assumes a non-empty list.
+ if (tessedit_pageseg_mode <= PSM_SINGLE_BLOCK ||
+ to_block->line_size < 2) {
+ // For now, AUTO, SINGLE_COLUMN and SINGLE_BLOCK (and any mode when
+ // line_size < 2) all map to the old textord.
+ textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks,
+ this);
+ } else {
+ // SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
+ float gradient = make_single_row(page_box.topright(),
+ to_block, &port_blocks);
+ if (tessedit_pageseg_mode == PSM_SINGLE_LINE) {
+ // SINGLE_LINE uses the old word maker on the single line.
+ make_words(page_box.topright(), gradient, blocks,
+ &land_blocks, &port_blocks, this);
+ } else {
+ // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
+ // single word, and in SINGLE_CHAR mode, all the outlines
+ // go in a single blob.
+ make_single_word(tessedit_pageseg_mode == PSM_SINGLE_CHAR,
+ to_block->get_rows(), to_block->block->row_list());
+ }
+ }
+}
+
+} // namespace tesseract.
+
diff --git a/textord/wordseg.cpp b/textord/wordseg.cpp
index e83dd68..8919249 100644
--- a/textord/wordseg.cpp
+++ b/textord/wordseg.cpp
@@ -45,6 +45,47 @@
#define BLOCK_STATS_CLUSTERS 10
/**********************************************************************
+ * make_single_word
+ *
+ * Arrange the blobs into one word. There is no fixed pitch detection.
+ **********************************************************************/
+
+void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
+  TO_ROW_IT to_row_it(rows);
+  TO_ROW* row = to_row_it.data();
+  // Move the blobs of this one row out of their BLOBNBOXes into a C_BLOB_LIST
+  // ready to create the word. Every extracted BLOBNBOX is deleted here.
+  C_BLOB_LIST cblobs;
+  C_BLOB_IT cblob_it(&cblobs);
+  BLOBNBOX_IT box_it(row->blob_list());
+  for (;!box_it.empty(); box_it.forward()) {
+    BLOBNBOX* bblob= box_it.extract();
+    if (!cblob_it.empty() && (bblob->joined_to_prev() || one_blob)) {
+      if (bblob->cblob() != NULL) {  // Append its outlines to the current blob.
+        C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
+        cout_it.move_to_last();
+        cout_it.add_list_after(bblob->cblob()->out_list());
+        delete bblob->cblob();
+      }
+    } else {
+      if (bblob->cblob() != NULL)  // Start a new blob in the word.
+        cblob_it.add_after_then_move(bblob->cblob());
+    }
+    delete bblob;  // Free the wrapper on both paths (was leaked when merging).
+  }
+  // Convert the TO_ROW to a ROW holding the single word.
+  ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
+                          static_cast<inT16>(row->space_size));
+  WERD_IT word_it(real_row->word_list());
+  WERD* word = new WERD(&cblobs, 0, NULL);
+  word->set_flag(W_BOL, TRUE);
+  word->set_flag(W_EOL, TRUE);
+  word_it.add_after_then_move(word);
+  ROW_IT row_it(real_rows);
+  row_it.add_after_then_move(real_row);
+}
+
+/**********************************************************************
* make_words
*
* Arrange the blobs into words.
@@ -535,7 +576,7 @@
coeffs[1] = row->line_m ();
coeffs[2] = row->line_c ();
row->xheight = block->xheight;
- real_row = new ROW (row,
+ real_row = new ROW(row,
(inT16) block->kern_size, (inT16) block->space_size);
word_it.set_to_list (real_row->word_list ());
//put words in row
diff --git a/textord/wordseg.h b/textord/wordseg.h
index 3a6c115..5f30868 100644
--- a/textord/wordseg.h
+++ b/textord/wordseg.h
@@ -29,6 +29,7 @@
}
extern BOOL_VAR_H (textord_fp_chopping, TRUE, "Do fixed pitch chopping");
+void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows);
void make_words( //make words
ICOORD page_tr, //top right
float gradient, //page skew