| /********************************************************************** |
| * File: applybox.cpp (Formerly applybox.c) |
| * Description: Re segment rows according to box file data |
| * Author: Phil Cheatle |
| * Created: Wed Nov 24 09:11:23 GMT 1993 |
| * |
| * (C) Copyright 1993, Hewlett-Packard Ltd. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| * |
| **********************************************************************/ |
| #include "mfcpch.h" |
| |
| // Include automatically generated configuration file if running autoconf. |
| #ifdef HAVE_CONFIG_H |
| #include "config_auto.h" |
| #endif |
| |
| #ifdef HAVE_LIBLEPT |
| // Include leptonica library only if autoconf (or makefile etc) tell us to. |
| #include "allheaders.h" |
| #endif |
| |
| #include "applybox.h" |
| #include <ctype.h> |
| #include <string.h> |
| #ifdef __UNIX__ |
| #include <assert.h> |
| #include <errno.h> |
| #endif |
| #include "boxread.h" |
| #include "control.h" |
| #include "genblob.h" |
| #include "globals.h" |
| #include "fixxht.h" |
| #include "mainblk.h" |
| #include "matchdefs.h" |
| #include "secname.h" |
| #include "tessbox.h" |
| #include "unichar.h" |
| #include "unicharset.h" |
| #include "matchdefs.h" |
| #include "tesseractclass.h" |
| |
| #define SECURE_NAMES |
| #ifndef SECURE_NAMES |
| #include "wordstats.h" |
| #endif |
| |
| #define EXTERN |
| EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead"); |
| EXTERN INT_VAR (applybox_debug, 5, "Debug level"); |
| EXTERN INT_VAR (applybox_page, 0, "Page number to apply boxes from"); |
| EXTERN STRING_VAR (applybox_test_exclusions, "", |
| "Chars ignored for testing"); |
| EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht"); |
| |
| EXTERN STRING_VAR(exposure_pattern, ".exp", |
| "Exposure value follows this pattern in the image" |
| " filename. The name of the image files are expected" |
| " to be in the form [lang].[fontname].exp[num].tif"); |
| |
| EXTERN BOOL_VAR(learn_chars_and_char_frags_mode, FALSE, |
| "Learn both character fragments (as is done in the" |
| " special low exposure mode) as well as unfragmented" |
| " characters."); |
| |
| extern IMAGE page_image; |
| |
| // The unicharset used during box training |
| static UNICHARSET unicharset_boxes; |
| |
| /************************************************************************* |
| * The code re-assigns outlines to form words each with ONE labelled blob. |
| * Noise is left in UNLABELLED words. The chars on the page are checked crudely |
| * for sensible position relative to baseline and xht. Failed boxes are |
| * compensated for by duplicating other believable instances of the character. |
| * |
| * The box file is assumed to contain box definitions, one per line, of the |
| * following format: |
| * <Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused |
| * |
| * The approach taken is to search the WHOLE page for stuff overlapping each box. |
| * - This is not too inefficient and is SAFE. |
| * - We can detect overlapping blobs as we will be attempting to put a blob |
| * from a LABELLED word into the current word. |
| * - When all the boxes have been processed we can detect any stuff which is |
| * being ignored - it is the unlabelled words left on the page. |
| * |
| * A box should only overlap one row. |
| * |
| * A warning is given if the box is on the same row as the previous box, but NOT |
| * on the same row as the previous blob. |
| * |
| * Any OUTLINE which overlaps the box is put into the new word. |
| * |
| * ascender chars must ascend above xht significantly |
| * xht chars must not rise above row xht significantly |
| * bl chars must not descend below baseline significantly |
| * descender chars must descend below baseline significantly |
| * |
| * ?? Certain chars are DROPPED - to limit the training data. |
| * |
| *************************************************************************/ |
| namespace tesseract { |
| void Tesseract::apply_boxes(const STRING& fname, |
| BLOCK_LIST *block_list //real blocks |
| ) { |
| inT16 boxfile_lineno = 0; |
| inT16 boxfile_charno = 0; |
| TBOX box; //boxfile box |
| UNICHAR_ID uch_id; //correct ch from boxfile |
| ROW *row; |
| ROW *prev_row = NULL; |
| inT16 prev_box_right = MAX_INT16; |
| inT16 block_id; |
| inT16 row_id; |
| inT16 box_count = 0; |
| inT16 box_failures = 0; |
| inT16 labels_ok; |
| inT16 rows_ok; |
| inT16 bad_blobs; |
| inT16 *tgt_char_counts = NULL; // No. of box samples |
| inT16 i; |
| inT16 rebalance_count = 0; |
| UNICHAR_ID min_uch_id = INVALID_UNICHAR_ID; |
| inT16 min_samples; |
| inT16 final_labelled_blob_count; |
| bool low_exposure = false; |
| |
| // Clean the unichar set |
| unicharset_boxes.clear(); |
| // Space character needed to represent NIL classification |
| unicharset_boxes.unichar_insert(" "); |
| |
| // Figure out whether this image file's exposure is less than 1, in which |
| // case when learning we will only pay attention to character fragments. |
| const char *ptr = strstr(imagefile.string(), exposure_pattern.string()); |
| if (ptr != NULL && |
| strtol(ptr += strlen(exposure_pattern.string()), NULL, 10) < 0) { |
| low_exposure = true; |
| } |
| |
| FILE* box_file; |
| STRING filename = fname; |
| const char *lastdot; //of name |
| |
| lastdot = strrchr (filename.string (), '.'); |
| if (lastdot != NULL) |
| filename[lastdot - filename.string()] = '\0'; |
| |
| filename += ".box"; |
| if (!(box_file = fopen (filename.string(), "r"))) { |
| CANTOPENFILE.error ("read_next_box", EXIT, |
| "Cant open box file %s %d", |
| filename.string(), errno); |
| } |
| |
| tgt_char_counts = new inT16[MAX_NUM_CLASSES]; |
| for (i = 0; i < MAX_NUM_CLASSES; i++) |
| tgt_char_counts[i] = 0; |
| |
| clear_any_old_text(block_list); |
| while (read_next_box(applybox_page, box_file, &box, &uch_id)) { |
| box_count++; |
| if (!low_exposure || learn_chars_and_char_frags_mode) { |
| tgt_char_counts[uch_id]++; |
| } |
| row = find_row_of_box (block_list, box, block_id, row_id); |
| if (box.left () < prev_box_right) { |
| boxfile_lineno++; |
| boxfile_charno = 1; |
| } |
| else |
| boxfile_charno++; |
| |
| if (row == NULL) { |
| box_failures++; |
| report_failed_box (boxfile_lineno, boxfile_charno, box, |
| unicharset_boxes.id_to_unichar(uch_id), |
| "FAILURE! box overlaps no blobs or blobs in multiple rows"); |
| } |
| else { |
| if ((box.left () >= prev_box_right) && (row != prev_row)) |
| report_failed_box (boxfile_lineno, boxfile_charno, box, |
| unicharset_boxes.id_to_unichar(uch_id), |
| "WARNING! false row break"); |
| box_failures += resegment_box (row, box, uch_id, block_id, row_id, |
| boxfile_lineno, boxfile_charno, tgt_char_counts, low_exposure, true); |
| prev_row = row; |
| } |
| prev_box_right = box.right (); |
| } |
| tidy_up(block_list, |
| labels_ok, |
| rows_ok, |
| bad_blobs, |
| tgt_char_counts, |
| rebalance_count, |
| &min_uch_id, |
| min_samples, |
| final_labelled_blob_count, |
| low_exposure, |
| true); |
| tprintf ("APPLY_BOXES:\n"); |
| tprintf (" Boxes read from boxfile: %6d\n", box_count); |
| tprintf (" Initially labelled blobs: %6d in %d rows\n", |
| labels_ok, rows_ok); |
| tprintf (" Box failures detected: %6d\n", box_failures); |
| tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count); |
| tprintf (" \"%s\" has fewest samples:%6d\n", |
| unicharset_boxes.id_to_unichar(min_uch_id), min_samples); |
| tprintf (" Total unlabelled words: %6d\n", |
| bad_blobs); |
| tprintf (" Final labelled words: %6d\n", |
| final_labelled_blob_count); |
| |
| // Clean up. |
| delete[] tgt_char_counts; |
| } |
| |
| int Tesseract::Boxes2BlockList(int box_cnt, TBOX *boxes, |
| BLOCK_LIST *block_list, |
| bool right2left) { |
| inT16 boxfile_lineno = 0; |
| inT16 boxfile_charno = 0; |
| TBOX box; |
| ROW *row; |
| ROW *prev_row = NULL; |
| inT16 prev_box_right = MAX_INT16; |
| inT16 prev_box_left = 0; |
| inT16 block_id; |
| inT16 row_id; |
| inT16 box_failures = 0; |
| inT16 labels_ok; |
| inT16 rows_ok; |
| inT16 bad_blobs; |
| inT16 rebalance_count = 0; |
| UNICHAR_ID min_uch_id; |
| inT16 min_samples; |
| inT16 final_labelled_blob_count; |
| |
| clear_any_old_text(block_list); |
| for (int box_idx = 0; box_idx < box_cnt; box_idx++) { |
| box = boxes[box_idx]; |
| |
| row = find_row_of_box(block_list, box, block_id, row_id); |
| // check for a new row |
| if ((right2left && box.right () > prev_box_left) || |
| (!right2left && box.left () < prev_box_right)) { |
| boxfile_lineno++; |
| boxfile_charno = 1; |
| } |
| else { |
| boxfile_charno++; |
| } |
| |
| if (row == NULL) { |
| box_failures++; |
| } |
| else { |
| box_failures += resegment_box(row, box, 0, block_id, row_id, |
| boxfile_lineno, boxfile_charno, |
| NULL, false, false); |
| prev_row = row; |
| } |
| prev_box_right = box.right (); |
| prev_box_left = box.left (); |
| } |
| |
| tidy_up(block_list, labels_ok, rows_ok, bad_blobs, NULL, |
| rebalance_count, &min_uch_id, min_samples, final_labelled_blob_count, |
| false, false); |
| |
| return box_failures; |
| } |
| |
| } // namespace tesseract |
| |
| |
| void clear_any_old_text( //remove correct text |
| BLOCK_LIST *block_list //real blocks |
| ) { |
| BLOCK_IT block_it(block_list); |
| ROW_IT row_it; |
| WERD_IT word_it; |
| |
| for (block_it.mark_cycle_pt (); |
| !block_it.cycled_list (); block_it.forward ()) { |
| row_it.set_to_list (block_it.data ()->row_list ()); |
| for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { |
| word_it.set_to_list (row_it.data ()->word_list ()); |
| for (word_it.mark_cycle_pt (); |
| !word_it.cycled_list (); word_it.forward ()) { |
| word_it.data ()->set_text (""); |
| } |
| } |
| } |
| } |
| |
| UNICHAR_ID register_char(const char *uch) { |
| if (!unicharset_boxes.contains_unichar(uch)) { |
| unicharset_boxes.unichar_insert(uch); |
| if (unicharset_boxes.size() > MAX_NUM_CLASSES) { |
| tprintf("Error: Size of unicharset of boxes is " |
| "greater than MAX_NUM_CLASSES (%d)\n", MAX_NUM_CLASSES); |
| exit(1); |
| } |
| } |
| return unicharset_boxes.unichar_to_id(uch); |
| } |
| |
| BOOL8 read_next_box(int page, |
| FILE* box_file, |
| TBOX *box, |
| UNICHAR_ID *uch_id) { |
| int x_min; |
| int y_min; |
| int x_max; |
| int y_max; |
| char uch[kBoxReadBufSize]; |
| |
| if (read_next_box(page, box_file, uch, &x_min, &y_min, &x_max, &y_max)) { |
| *uch_id = register_char(uch); |
| *box = TBOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max)); |
| return TRUE; // read a box ok |
| } else { |
| return FALSE; // EOF |
| } |
| } |
| |
| |
| ROW *find_row_of_box( // |
| BLOCK_LIST *block_list, //real blocks |
| const TBOX &box, //from boxfile |
| inT16 &block_id, |
| inT16 &row_id_to_process) { |
| BLOCK_IT block_it(block_list); |
| BLOCK *block; |
| ROW_IT row_it; |
| ROW *row; |
| ROW *row_to_process = NULL; |
| inT16 row_id; |
| WERD_IT word_it; |
| WERD *word; |
| BOOL8 polyg; |
| PBLOB_IT blob_it; |
| PBLOB *blob; |
| OUTLINE_IT outline_it; |
| OUTLINE *outline; |
| |
| /* |
| Find row to process - error if box REALLY overlaps more than one row. (I.e |
| it overlaps blobs in the row - not just overlaps the bounding box of the |
| whole row.) |
| */ |
| |
| block_id = 0; |
| for (block_it.mark_cycle_pt (); |
| !block_it.cycled_list (); block_it.forward ()) { |
| block_id++; |
| row_id = 0; |
| block = block_it.data (); |
| if (block->bounding_box ().overlap (box)) { |
| row_it.set_to_list (block->row_list ()); |
| for (row_it.mark_cycle_pt (); |
| !row_it.cycled_list (); row_it.forward ()) { |
| row_id++; |
| row = row_it.data (); |
| if (row->bounding_box ().overlap (box)) { |
| word_it.set_to_list (row->word_list ()); |
| for (word_it.mark_cycle_pt (); |
| !word_it.cycled_list (); word_it.forward ()) { |
| word = word_it.data (); |
| polyg = word->flag (W_POLYGON); |
| if (word->bounding_box ().overlap (box)) { |
| blob_it.set_to_list (word->gblob_list ()); |
| for (blob_it.mark_cycle_pt (); |
| !blob_it.cycled_list (); blob_it.forward ()) { |
| blob = blob_it.data (); |
| if (gblob_bounding_box (blob, polyg). |
| overlap (box)) { |
| outline_it. |
| set_to_list (gblob_out_list |
| (blob, polyg)); |
| for (outline_it.mark_cycle_pt (); |
| !outline_it.cycled_list (); |
| outline_it.forward ()) { |
| outline = outline_it.data (); |
| if (goutline_bounding_box |
| (outline, polyg).major_overlap (box)) { |
| if ((row_to_process == NULL) || |
| (row_to_process == row)) { |
| row_to_process = row; |
| row_id_to_process = row_id; |
| } |
| else |
| /* RETURN ERROR Box overlaps blobs in more than one row */ |
| return NULL; |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| return row_to_process; |
| } |
| |
| |
| inT16 resegment_box( // |
| ROW *row, |
| TBOX &box, |
| UNICHAR_ID uch_id, |
| inT16 block_id, |
| inT16 row_id, |
| inT16 boxfile_lineno, |
| inT16 boxfile_charno, |
| inT16 *tgt_char_counts, |
| bool learn_char_fragments, |
| bool learning) { |
| WERD_LIST new_word_list; |
| WERD_IT word_it; |
| WERD_IT new_word_it(&new_word_list); |
| WERD *word = NULL; |
| WERD *new_word = NULL; |
| BOOL8 polyg = false; |
| PBLOB_IT blob_it; |
| PBLOB_IT new_blob_it; |
| PBLOB *blob; |
| PBLOB *new_blob; |
| OUTLINE_IT outline_it; |
| OUTLINE_LIST dummy; // Just to initialize new_outline_it. |
| OUTLINE_IT new_outline_it = &dummy; |
| OUTLINE *outline; |
| TBOX new_word_box; |
| TBOX curr_outline_box; |
| TBOX prev_outline_box; |
| float word_x_centre; |
| float baseline; |
| inT16 error_count = 0; //number of chars lost |
| STRING label; |
| UNICHAR_ID fragment_uch_id; |
| int fragment_index; |
| int new_word_it_len; |
| |
| if (learning && applybox_debug > 6) { |
| tprintf("\nAPPLY_BOX: in resegment_box() for %s(%d)\n", |
| unicharset_boxes.id_to_unichar(uch_id), uch_id); |
| } |
| word_it.set_to_list (row->word_list ()); |
| for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { |
| word = word_it.data (); |
| polyg = word->flag (W_POLYGON); |
| if (word->bounding_box ().overlap (box)) { |
| blob_it.set_to_list (word->gblob_list ()); |
| prev_outline_box = TBOX(); // clear prev_outline_box |
| curr_outline_box = TBOX(); // clear curr_outline_box |
| for (blob_it.mark_cycle_pt (); |
| !blob_it.cycled_list (); blob_it.forward ()) { |
| blob = blob_it.data (); |
| if (gblob_bounding_box (blob, polyg).overlap (box)) { |
| outline_it.set_to_list (gblob_out_list (blob, polyg)); |
| for (outline_it.mark_cycle_pt (); |
| !outline_it.cycled_list (); outline_it.forward ()) { |
| outline = outline_it.data (); |
| prev_outline_box += curr_outline_box; |
| curr_outline_box = goutline_bounding_box(outline, polyg); |
| if (curr_outline_box.major_overlap (box)) { |
| if (strlen (word->text ()) > 0) { |
| if (error_count == 0) { |
| error_count = 1; |
| if (learning && applybox_debug > 4) |
| report_failed_box (boxfile_lineno, |
| boxfile_charno, |
| box, unicharset_boxes.id_to_unichar(uch_id), |
| "FAILURE! box overlaps blob in labelled word"); |
| } |
| if (learning && applybox_debug > 4) |
| tprintf ("APPLY_BOXES: ALSO ignoring corrupted char" |
| " blk:%d row:%d \"%s\"\n", |
| block_id, row_id, word_it.data()->text()); |
| word_it.data ()->set_text (""); // UN label it |
| error_count++; |
| } |
| // Do not learn from fragments of characters that are broken |
| // into very small pieces to avoid picking up noise. |
| if ((learn_char_fragments || learn_chars_and_char_frags_mode) && |
| ((C_OUTLINE *)outline)->area() < kMinFragmentOutlineArea) { |
| if (applybox_debug > 6) { |
| tprintf("APPLY_BOX: fragment outline area %d is too small" |
| " - not recording fragments of this character\n", |
| ((C_OUTLINE *)outline)->area()); |
| } |
| error_count++; |
| } |
| |
| if (error_count == 0) { |
| if (applybox_debug > 6 ) { |
| tprintf("APPLY_BOX: Previous "); |
| prev_outline_box.print(); |
| tprintf("APPLY_BOX: Current area: %d ", |
| ((C_OUTLINE *)outline)->area()); |
| curr_outline_box.print(); |
| } |
| // When learning character fragments is enabled, we put |
| // outlines that do not overlap on x axis in separate WERDs. |
| bool start_new_word = |
| (learn_char_fragments || learn_chars_and_char_frags_mode) && |
| !curr_outline_box.major_x_overlap(prev_outline_box); |
| if (new_word == NULL || start_new_word) { |
| if (new_word != NULL) { // add prev new_word to new_word_list |
| new_word_it.add_to_end(new_word); |
| } |
| // Make a new word with a single blob. |
| new_word = word->shallow_copy(); |
| new_word->set_flag(W_FUZZY_NON, false); |
| new_word->set_flag(W_FUZZY_SP, false); |
| if (polyg){ |
| new_blob = new PBLOB; |
| } else { |
| new_blob = (PBLOB *) new C_BLOB; |
| } |
| new_blob_it.set_to_list(new_word->gblob_list()); |
| new_blob_it.add_to_end(new_blob); |
| new_outline_it.set_to_list( |
| gblob_out_list(new_blob, polyg)); |
| } |
| new_outline_it.add_to_end(outline_it.extract()); // move blob |
| } |
| } |
| } |
| if (outline_it.empty()) // no outlines in blob |
| delete blob_it.extract(); // so delete blob |
| } |
| } |
| if (blob_it.empty()) // no blobs in word |
| delete word_it.extract(); // so delete word |
| } |
| } |
| if (new_word != NULL) { // add prev new_word to new_word_list |
| new_word_it.add_to_end(new_word); |
| } |
| new_word_it_len = new_word_it.length(); |
| |
| // Check for failures. |
| if (error_count > 0) |
| return error_count; |
| if (learning && new_word_it_len <= 0) { |
| report_failed_box(boxfile_lineno, boxfile_charno, box, |
| unicharset_boxes.id_to_unichar(uch_id), |
| "FAILURE! Couldn't find any blobs"); |
| return 1; // failure |
| } |
| |
| if (learning && new_word_it_len > CHAR_FRAGMENT::kMaxChunks) { |
| tprintf("APPLY_BOXES: too many fragments (%d) for char %s\n", |
| new_word_it_len, unicharset_boxes.id_to_unichar(uch_id)); |
| return 1; // failure |
| } |
| |
| // Add labelled character or character fragments to the word list. |
| fragment_index = 0; |
| new_word_it.move_to_first(); |
| for (new_word_it.mark_cycle_pt(); !new_word_it.cycled_list(); |
| new_word_it.forward()) { |
| new_word = new_word_it.extract(); |
| if (new_word_it_len > 1) { // deal with a fragment |
| if (learning) { |
| label = CHAR_FRAGMENT::to_string(unicharset_boxes.id_to_unichar(uch_id), |
| fragment_index, new_word_it_len); |
| fragment_uch_id = register_char(label.string()); |
| new_word->set_text(label.string()); |
| ++fragment_index; |
| // For now we cheat by setting the expected number of char fragments |
| // to the number of char fragments actually parsed and labelled. |
| // TODO(daria): find out whether this can be improved. |
| tgt_char_counts[fragment_uch_id]++; |
| } else { |
| // No learning involved, Just stick a place-holder string |
| new_word->set_text("*"); |
| } |
| if (applybox_debug > 5) { |
| tprintf("APPLY_BOX: adding char fragment %s\n", label.string()); |
| } |
| } else { // deal with a regular character |
| if (learning) { |
| if (!learn_char_fragments || learn_chars_and_char_frags_mode) { |
| new_word->set_text(unicharset_boxes.id_to_unichar(uch_id)); |
| } else { |
| // not interested in non-fragmented chars if learning fragments, so |
| // unlabel it. |
| new_word->set_text(""); |
| } |
| } else { |
| // No learning involved here. Just stick a place holder string |
| new_word->set_text("*"); |
| } |
| } |
| gblob_sort_list(new_word->gblob_list(), polyg); |
| word_it.add_to_end(new_word); |
| new_word_box = new_word->bounding_box(); |
| word_x_centre = (new_word_box.left() + new_word_box.right()) / 2.0f; |
| baseline = row->base_line(word_x_centre); |
| } |
| |
| // All done. Now check if the EOL, BOL flags are set correctly. |
| word_it.move_to_first(); |
| for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
| word = word_it.data(); |
| word->set_flag(W_BOL, false); |
| word->set_flag(W_EOL, false); |
| } |
| word->set_flag(W_EOL, true); |
| word_it.move_to_first(); |
| word_it.data()->set_flag(W_BOL, true); |
| return 0; //success |
| |
| #if 0 |
| if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) { |
| if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && |
| (new_word_box.top () < |
| baseline + (1 + applybox_error_band) * row->x_height ())) { |
| report_failed_box (boxfile_lineno, boxfile_charno, box, |
| unicharset_boxes.id_to_unichar(uch_id), |
| "FAILURE! caps-ht char didn't ascend"); |
| new_word->set_text (""); |
| return 1; |
| } |
| if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && |
| (new_word_box.top () < |
| baseline + (1 - applybox_error_band) * row->x_height ())) { |
| report_failed_box (boxfile_lineno, boxfile_charno, box, |
| unicharset_boxes.id_to_unichar(uch_id), |
| "FAILURE! Odd top char below xht"); |
| new_word->set_text (""); |
| return 1; |
| } |
| if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && |
| ((new_word_box.top () > |
| baseline + (1 + applybox_error_band) * row->x_height ()) || |
| (new_word_box.top () < |
| baseline + (1 - applybox_error_band) * row->x_height ()))) { |
| report_failed_box (boxfile_lineno, boxfile_charno, box, |
| unicharset_boxes.id_to_unichar(uch_id), |
| "FAILURE! x-ht char didn't have top near xht"); |
| new_word->set_text (""); |
| return 1; |
| } |
| if (STRING (chs_non_ambig_bl).contains |
| (unicharset_boxes.id_to_unichar(uch_id)[0]) && |
| ((new_word_box.bottom () < |
| baseline - applybox_error_band * row->x_height ()) || |
| (new_word_box.bottom () > |
| baseline + applybox_error_band * row->x_height ()))) { |
| report_failed_box (boxfile_lineno, boxfile_charno, box, |
| unicharset_boxes.id_to_unichar(uch_id), |
| "FAILURE! non ambig BL char didnt have bottom near baseline"); |
| new_word->set_text (""); |
| return 1; |
| } |
| if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && |
| (new_word_box.bottom () > |
| baseline + applybox_error_band * row->x_height ())) { |
| report_failed_box (boxfile_lineno, boxfile_charno, box, |
| unicharset_boxes.id_to_unichar(uch_id), |
| "FAILURE! Odd bottom char above baseline"); |
| new_word->set_text (""); |
| return 1; |
| } |
| if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && |
| (new_word_box.bottom () > |
| baseline - applybox_error_band * row->x_height ())) { |
| report_failed_box (boxfile_lineno, boxfile_charno, box, |
| unicharset_boxes.id_to_unichar(uch_id), |
| "FAILURE! Descender doesn't descend"); |
| new_word->set_text (""); |
| return 1; |
| } |
| } |
| #endif |
| } |
| |
| |
| /************************************************************************* |
| * tidy_up() |
| * - report >1 block |
| * - sort the words in each row. |
| * - report any rows with no labelled words. |
| * - report any remaining unlabelled words |
| * - report total labelled words |
| * |
| *************************************************************************/ |
| void tidy_up( // |
| BLOCK_LIST *block_list, //real blocks |
| inT16 &ok_char_count, |
| inT16 &ok_row_count, |
| inT16 &unlabelled_words, |
| inT16 *tgt_char_counts, |
| inT16 &rebalance_count, |
| UNICHAR_ID *min_uch_id, |
| inT16 &min_samples, |
| inT16 &final_labelled_blob_count, |
| bool learn_character_fragments, |
| bool learning) { |
| BLOCK_IT block_it(block_list); |
| ROW_IT row_it; |
| ROW *row; |
| WERD_IT word_it; |
| WERD *word; |
| WERD *duplicate_word; |
| inT16 block_idx = 0; |
| inT16 row_idx; |
| inT16 all_row_idx = 0; |
| BOOL8 row_ok; |
| BOOL8 rebalance_needed = FALSE; |
| inT16 *labelled_char_counts = NULL; // num unique labelled samples |
| inT16 i; |
| UNICHAR_ID uch_id; |
| UNICHAR_ID prev_uch_id = -1; |
| BOOL8 at_dupe_of_prev_word; |
| ROW *prev_row = NULL; |
| inT16 left; |
| inT16 prev_left = -1; |
| |
| labelled_char_counts = new inT16[MAX_NUM_CLASSES]; |
| for (i = 0; i < MAX_NUM_CLASSES; i++) |
| labelled_char_counts[i] = 0; |
| |
| ok_char_count = 0; |
| ok_row_count = 0; |
| unlabelled_words = 0; |
| if (learning && (applybox_debug > 4) && (block_it.length () != 1)) { |
| if (block_it.length() > 1) { |
| tprintf ("APPLY_BOXES: More than one block??\n"); |
| } else { |
| tprintf("APPLY_BOXES: No blocks identified.\n"); |
| } |
| } |
| |
| for (block_it.mark_cycle_pt (); |
| !block_it.cycled_list (); block_it.forward ()) { |
| block_idx++; |
| row_idx = 0; |
| row_ok = FALSE; |
| row_it.set_to_list (block_it.data ()->row_list ()); |
| for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { |
| row_idx++; |
| all_row_idx++; |
| row = row_it.data (); |
| word_it.set_to_list (row->word_list ()); |
| word_it.sort (word_comparator); |
| for (word_it.mark_cycle_pt (); |
| !word_it.cycled_list (); word_it.forward ()) { |
| word = word_it.data (); |
| if (strlen (word->text ()) == 0 || |
| unicharset_boxes.unichar_to_id(word->text()) < 0) { |
| unlabelled_words++; |
| if (learning && applybox_debug > 4 && !learn_character_fragments) { |
| tprintf ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n", |
| block_idx, row_idx, all_row_idx); |
| } |
| } else { |
| if (word->gblob_list ()->length () != 1) |
| tprintf ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d" |
| " row:%d allrows:%d\n", block_idx, row_idx, all_row_idx); |
| |
| ok_char_count++; |
| ++labelled_char_counts[unicharset_boxes.unichar_to_id(word->text())]; |
| row_ok = TRUE; |
| } |
| } |
| if ((applybox_debug > 6) && (!row_ok)) { |
| tprintf("APPLY_BOXES: Row with no labelled words blk:%d row:%d" |
| " allrows:%d\n", block_idx, row_idx, all_row_idx); |
| } |
| else |
| ok_row_count++; |
| } |
| } |
| |
| min_samples = 9999; |
| for (i = 0; i < unicharset_boxes.size(); i++) { |
| if (tgt_char_counts[i] > labelled_char_counts[i]) { |
| if (labelled_char_counts[i] <= 1) { |
| tprintf("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" -" |
| " target is %d:\n", |
| labelled_char_counts[i], unicharset_boxes.debug_str(i).string(), |
| tgt_char_counts[i]); |
| } |
| else { |
| rebalance_needed = TRUE; |
| if (applybox_debug > 0) |
| tprintf("APPLY_BOXES: REBALANCE REQD \"%s\" - target of" |
| " %d from %d labelled samples\n", |
| unicharset_boxes.debug_str(i).string(), tgt_char_counts[i], |
| labelled_char_counts[i]); |
| } |
| } |
| if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) { |
| min_samples = labelled_char_counts[i]; |
| *min_uch_id = i; |
| } |
| } |
| |
| while (applybox_rebalance && rebalance_needed) { |
| block_it.set_to_list (block_list); |
| for (block_it.mark_cycle_pt (); |
| !block_it.cycled_list (); block_it.forward ()) { |
| row_it.set_to_list (block_it.data ()->row_list ()); |
| for (row_it.mark_cycle_pt (); |
| !row_it.cycled_list (); row_it.forward ()) { |
| row = row_it.data (); |
| word_it.set_to_list (row->word_list ()); |
| for (word_it.mark_cycle_pt (); |
| !word_it.cycled_list (); word_it.forward ()) { |
| word = word_it.data (); |
| left = word->bounding_box ().left (); |
| if (*word->text () != '\0') |
| uch_id = unicharset_boxes.unichar_to_id(word->text ()); |
| else |
| uch_id = -1; |
| at_dupe_of_prev_word = ((row == prev_row) && |
| (left = prev_left) && |
| (uch_id == prev_uch_id)); |
| if ((uch_id != -1) && |
| (labelled_char_counts[uch_id] > 1) && |
| (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) && |
| (!at_dupe_of_prev_word)) { |
| /* Duplicate the word to rebalance the labelled samples */ |
| if (applybox_debug > 9) { |
| tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id)); |
| word->bounding_box ().print (); |
| } |
| duplicate_word = new WERD; |
| *duplicate_word = *word; |
| word_it.add_after_then_move (duplicate_word); |
| rebalance_count++; |
| labelled_char_counts[uch_id]++; |
| } |
| prev_row = row; |
| prev_left = left; |
| prev_uch_id = uch_id; |
| } |
| } |
| } |
| rebalance_needed = FALSE; |
| for (i = 0; i < unicharset_boxes.size(); i++) { |
| if ((tgt_char_counts[i] > labelled_char_counts[i]) && |
| (labelled_char_counts[i] > 1)) { |
| rebalance_needed = TRUE; |
| break; |
| } |
| } |
| } |
| |
| /* Now final check - count labeled blobs */ |
| final_labelled_blob_count = 0; |
| block_it.set_to_list (block_list); |
| for (block_it.mark_cycle_pt (); |
| !block_it.cycled_list (); block_it.forward ()) { |
| row_it.set_to_list (block_it.data ()->row_list ()); |
| for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { |
| row = row_it.data (); |
| word_it.set_to_list (row->word_list ()); |
| word_it.sort (word_comparator); |
| for (word_it.mark_cycle_pt (); |
| !word_it.cycled_list (); word_it.forward ()) { |
| word = word_it.data (); |
| if ((strlen (word->text ()) > 0) && |
| (word->gblob_list()->length() == 1)) { |
| final_labelled_blob_count++; |
| } else { |
| delete word_it.extract(); |
| } |
| } |
| // delete the row if empty |
| if (row->word_list()->empty()) { |
| delete row_it.extract(); |
| } |
| } |
| } |
| |
| // Clean up. |
| delete[] labelled_char_counts; |
| } |
| |
| |
| void report_failed_box(inT16 boxfile_lineno, |
| inT16 boxfile_charno, |
| TBOX box, |
| const char *box_ch, |
| const char *err_msg) { |
| if (applybox_debug > 4) |
| tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n", |
| boxfile_lineno, |
| boxfile_charno, |
| box_ch, |
| box.left (), box.bottom (), box.right (), box.top (), err_msg); |
| } |
| |
| |
| void apply_box_training(const STRING& filename, BLOCK_LIST *block_list) { |
| BLOCK_IT block_it(block_list); |
| ROW_IT row_it; |
| ROW *row; |
| WERD_IT word_it; |
| WERD *word; |
| WERD *bln_word; |
| WERD copy_outword; // copy to denorm |
| PBLOB_IT blob_it; |
| DENORM denorm; |
| inT16 count = 0; |
| char unichar[UNICHAR_LEN + 1]; |
| |
| unichar[UNICHAR_LEN] = '\0'; |
| tprintf ("Generating training data\n"); |
| for (block_it.mark_cycle_pt (); |
| !block_it.cycled_list (); block_it.forward ()) { |
| row_it.set_to_list (block_it.data ()->row_list ()); |
| for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { |
| row = row_it.data (); |
| word_it.set_to_list (row->word_list ()); |
| for (word_it.mark_cycle_pt (); |
| !word_it.cycled_list (); word_it.forward ()) { |
| word = word_it.data (); |
| if ((strlen (word->text ()) > 0) && |
| (word->gblob_list ()->length () == 1)) { |
| // Here is a word with a single unichar label and a single blob so train on it. |
| bln_word = make_bln_copy(word, row, NULL, row->x_height (), &denorm); |
| blob_it.set_to_list (bln_word->blob_list ()); |
| strncpy(unichar, word->text (), UNICHAR_LEN); |
| tess_training_tester (filename, |
| blob_it.data (), //single blob |
| &denorm, TRUE, //correct |
| unichar, //correct character |
| strlen(unichar), //character length |
| NULL); |
| copy_outword = *(bln_word); |
| copy_outword.baseline_denormalise (&denorm); |
| blob_it.set_to_list (copy_outword.blob_list ()); |
| delete bln_word; |
| count++; |
| } |
| } |
| } |
| } |
| tprintf ("Generated training data for %d blobs\n", count); |
| } |
| |
| namespace tesseract { |
| void Tesseract::apply_box_testing(BLOCK_LIST *block_list) { |
| BLOCK_IT block_it(block_list); |
| ROW_IT row_it; |
| ROW *row; |
| inT16 row_count = 0; |
| WERD_IT word_it; |
| WERD *word; |
| WERD *bln_word; |
| inT16 word_count = 0; |
| PBLOB_IT blob_it; |
| DENORM denorm; |
| inT16 count = 0; |
| char ch[2]; |
| WERD *outword; //bln best choice |
| //segmentation |
| WERD_CHOICE *best_choice; //tess output |
| WERD_CHOICE *raw_choice; //top choice permuter |
| //detailed results |
| BLOB_CHOICE_LIST_CLIST blob_choices; |
| inT16 char_count = 0; |
| inT16 correct_count = 0; |
| inT16 err_count = 0; |
| inT16 rej_count = 0; |
| #ifndef SECURE_NAMES |
| WERDSTATS wordstats; //As from newdiff |
| #endif |
| char tess_rej_str[3]; |
| char tess_long_str[3]; |
| |
| ch[1] = '\0'; |
| strcpy (tess_rej_str, "|A"); |
| strcpy (tess_long_str, "|B"); |
| |
| for (block_it.mark_cycle_pt (); |
| !block_it.cycled_list (); block_it.forward ()) { |
| row_it.set_to_list (block_it.data ()->row_list ()); |
| for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { |
| row = row_it.data (); |
| row_count++; |
| word_count = 0; |
| word_it.set_to_list (row->word_list ()); |
| for (word_it.mark_cycle_pt (); |
| !word_it.cycled_list (); word_it.forward ()) { |
| word = word_it.data (); |
| word_count++; |
| if ((strlen (word->text ()) == 1) && |
| !STRING (applybox_test_exclusions).contains (*word->text ()) |
| && (word->gblob_list ()->length () == 1)) { |
| // Here is a word with a single char label and a single blob so test it. |
| bln_word = make_bln_copy(word, row, NULL, row->x_height (), &denorm); |
| blob_it.set_to_list (bln_word->blob_list ()); |
| ch[0] = *word->text (); |
| char_count++; |
| best_choice = tess_segment_pass1 (bln_word, |
| &denorm, |
| &Tesseract::tess_default_matcher, |
| raw_choice, |
| &blob_choices, outword); |
| |
| /* |
| Test for TESS screw up on word. Recog_word has already ensured that the |
| choice list, outword blob lists and best_choice string are the same |
| length. A TESS screw up is indicated by a blank filled or 0 length string. |
| */ |
| if ((best_choice->length() == 0) || |
| (strspn(best_choice->unichar_string().string(), " ") == |
| best_choice->unichar_string().length())) { |
| rej_count++; |
| tprintf ("%d:%d: \"%s\" -> TESS FAILED\n", |
| row_count, word_count, ch); |
| #ifndef SECURE_NAMES |
| wordstats.word (tess_rej_str, 2, ch, 1); |
| #endif |
| } |
| else { |
| if ((best_choice->length() != outword->blob_list()->length()) || |
| (best_choice->length() != blob_choices.length())) { |
| tprintf |
| ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", |
| best_choice->unichar_string().string(), |
| best_choice->length(), |
| outword->blob_list ()->length(), |
| blob_choices.length()); |
| } |
| ASSERT_HOST(best_choice->length() == |
| outword->blob_list()->length()); |
| ASSERT_HOST(best_choice->length() == blob_choices.length()); |
| fix_quotes (best_choice, |
| //turn to double |
| outword, &blob_choices); |
| if (strcmp (best_choice->unichar_string().string(), ch) != 0) { |
| err_count++; |
| tprintf ("%d:%d: \"%s\" -> \"%s\"\n", |
| row_count, word_count, ch, |
| best_choice->unichar_string().string()); |
| } |
| else |
| correct_count++; |
| #ifndef SECURE_NAMES |
| if (best_choice->unichar_string().length() > 2) |
| wordstats.word(tess_long_str, 2, ch, 1); |
| else |
| wordstats.word(best_choice->unichar_string().string(), |
| best_choice->unichar_string().length(), |
| ch, 1); |
| #endif |
| } |
| delete bln_word; |
| delete outword; |
| delete best_choice; |
| delete raw_choice; |
| blob_choices.deep_clear (); |
| count++; |
| } |
| } |
| } |
| } |
| #ifndef SECURE_NAMES |
| wordstats.print (1, 100.0); |
| wordstats.conf_matrix (); |
| tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n", |
| char_count, correct_count, rej_count, err_count); |
| #endif |
| } |
| |
| } // namespace tesseract |