| /****************************************************************** |
| * File: docqual.cpp (Formerly docqual.c) |
| * Description: Document Quality Metrics |
| * Author: Phil Cheatle |
| * Created: Mon May 9 11:27:28 BST 1994 |
| * |
| * (C) Copyright 1994, Hewlett-Packard Ltd. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| * |
| **********************************************************************/ |
| |
| #include "mfcpch.h" |
| #include <ctype.h> |
| #include "docqual.h" |
| #include "tstruct.h" |
| #include "tfacep.h" |
| #include "reject.h" |
| #include "tessvars.h" |
| #include "genblob.h" |
| #include "secname.h" |
| #include "globals.h" |
| #include "tesseractclass.h" |
| |
| #define EXTERN |
| |
| EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines"); |
| EXTERN STRING_VAR (outlines_2, "ij!?%\":;", |
| "Non standard number of outlines"); |
| EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE, |
| "Allow outline errs in unrejection?"); |
| EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE, |
| "Reduce rejection on good docs"); |
| EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?"); |
| EXTERN double_VAR (tessedit_reject_doc_percent, 65.00, |
| "%rej allowed before rej whole doc"); |
| EXTERN double_VAR (tessedit_reject_block_percent, 45.00, |
| "%rej allowed before rej whole block"); |
| EXTERN double_VAR (tessedit_reject_row_percent, 40.00, |
| "%rej allowed before rej whole row"); |
| EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00, |
| "%of row rejects in whole word rejects which prevents whole row rejection"); |
| EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE, |
| "Only rej partially rejected words in block rejection"); |
| EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE, |
| "Only rej partially rejected words in row rejection"); |
| EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE, |
| "Use word segmentation quality metric"); |
| EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE, |
| "Use word segmentation quality metric"); |
| EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2, |
| "Only preserve wds longer than this"); |
| EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE, |
| "Apply row rejection to good docs"); |
| EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1, |
| "rej good doc wd if more than this fraction rejected"); |
| EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE, |
| "Reject all bad quality wds"); |
| EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats"); |
| EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE, |
| "Output data to debug file"); |
| EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no chekcs"); |
| EXTERN double_VAR (quality_rowrej_pc, 1.1, |
| "good_quality_doc gte good char limit"); |
| |
| EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE, |
| "Mark v.bad words for tilde crunch"); |
| EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?"); |
| EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE, |
| "Take out ~^ early?"); |
| |
| EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this"); |
| EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says"); |
| EXTERN double_VAR (crunch_poor_garbage_cert, -9.0, |
| "crunch garbage cert lt this"); |
| EXTERN double_VAR (crunch_poor_garbage_rate, 60, |
| "crunch garbage rating lt this"); |
| |
| EXTERN double_VAR (crunch_pot_poor_rate, 40, |
| "POTENTIAL crunch rating lt this"); |
| EXTERN double_VAR (crunch_pot_poor_cert, -8.0, |
| "POTENTIAL crunch cert lt this"); |
| EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage"); |
| |
| EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this"); |
| EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this"); |
| EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this"); |
| EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this"); |
| EXTERN double_VAR (crunch_del_min_width, 3.0, |
| "Del if word width lt xht x this"); |
| EXTERN double_VAR (crunch_del_high_word, 1.5, |
| "Del if word gt xht x this above bl"); |
| EXTERN double_VAR (crunch_del_low_word, 0.5, |
| "Del if word gt xht x this below bl"); |
| EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this"); |
| |
| EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch"); |
| EXTERN INT_VAR (crunch_pot_indicators, 1, |
| "How many potential indicators needed"); |
| |
| EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE, |
| "Dont touch sensible strings"); |
| EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring"); |
| EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE, |
| "Dont pot crunch sensible strings"); |
| EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures"); |
| EXTERN INT_VAR (crunch_leave_lc_strings, 4, |
| "Dont crunch words with long lower case strings"); |
| EXTERN INT_VAR (crunch_leave_uc_strings, 4, |
| "Dont crunch words with long lower case strings"); |
| EXTERN INT_VAR (crunch_long_repetitions, 3, |
| "Crunch words with long repetitions"); |
| |
| EXTERN INT_VAR (crunch_debug, 0, "As it says"); |
| |
| /************************************************************************* |
| * word_blob_quality() |
| * How many blobs in the outword are identical to those of the inword? |
| * ASSUME blobs in both initial word and outword are in ascending order of |
| * left hand blob edge. |
| *************************************************************************/ |
| inT16 word_blob_quality( //Blob seg changes |
| WERD_RES *word, |
| ROW *row) { |
| WERD *bln_word; //BL norm init word |
| TWERD *tessword; //tess format |
| WERD *init_word; //BL norm init word |
| PBLOB_IT outword_it; |
| PBLOB_IT initial_it; |
| inT16 i; |
| inT16 init_blobs_left; |
| inT16 match_count = 0; |
| BOOL8 matched; |
| TBOX out_box; |
| PBLOB *test_blob; |
| DENORM denorm; |
| float bln_xht; |
| |
| if (word->word->gblob_list ()->empty ()) |
| return 0; |
| //xht used for blnorm |
| bln_xht = bln_x_height / word->denorm.scale (); |
| bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm); |
| /* |
| NOTE: Need to convert to tess format and back again to ensure that the |
| same float -> int rounding of coords is done to source wd as out wd before |
| comparison |
| */ |
| tessword = make_tess_word(bln_word, NULL); // Convert word. |
| init_word = make_ed_word (tessword, bln_word); |
| delete bln_word; |
| delete_word(tessword); |
| if (init_word == NULL) { |
| // Conversion failed. |
| return 0; |
| } |
| |
| initial_it.set_to_list (init_word->blob_list ()); |
| init_blobs_left = initial_it.length (); |
| outword_it.set_to_list (word->outword->blob_list ()); |
| |
| for (outword_it.mark_cycle_pt (); |
| !outword_it.cycled_list (); outword_it.forward ()) { |
| out_box = outword_it.data ()->bounding_box (); |
| |
| // Skip any initial blobs LEFT of current outword blob. |
| while (!initial_it.at_last () && |
| (initial_it.data ()->bounding_box ().left () < out_box.left ())) { |
| initial_it.forward (); |
| init_blobs_left--; |
| } |
| |
| /* See if current outword blob matches any initial blob with the same left |
| coord. (Normally only one but possibly more - in unknown order) */ |
| |
| i = 0; |
| matched = FALSE; |
| do { |
| test_blob = initial_it.data_relative (i++); |
| matched = crude_match_blobs (test_blob, outword_it.data ()); |
| if (matched) |
| match_count++; |
| } |
| while (!matched && |
| (init_blobs_left - i > 0) && |
| (i < 129) && |
| !initial_it.at_last () && |
| test_blob->bounding_box ().left () == out_box.left ()); |
| } |
| delete init_word; |
| return match_count; |
| } |
| |
| |
| /************************************************************************* |
| * crude_match_blobs() |
| * Check bounding boxes are the same and the number of outlines are the same. |
| *************************************************************************/ |
| BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) { |
| TBOX box1 = blob1->bounding_box (); |
| TBOX box2 = blob2->bounding_box (); |
| |
| if (box1.contains (box2) && |
| box2.contains (box1) && |
| (blob1->out_list ()->length () == blob1->out_list ()->length ())) |
| return TRUE; |
| else |
| return FALSE; |
| } |
| |
| |
| inT16 word_outline_errs(WERD_RES *word) { |
| PBLOB_IT outword_it; |
| inT16 i = 0; |
| inT16 err_count = 0; |
| |
| outword_it.set_to_list (word->outword->blob_list ()); |
| |
| for (outword_it.mark_cycle_pt (); |
| !outword_it.cycled_list (); outword_it.forward ()) { |
| err_count += count_outline_errs (word->best_choice->unichar_string()[i], |
| outword_it.data()->out_list()->length()); |
| i++; |
| } |
| return err_count; |
| } |
| |
| |
| /************************************************************************* |
| * word_char_quality() |
| * Combination of blob quality and outline quality - how many good chars are |
| * there? - I.e chars which pass the blob AND outline tests. |
| *************************************************************************/ |
| void word_char_quality(WERD_RES *word, |
| ROW *row, |
| inT16 *match_count, |
| inT16 *accepted_match_count) { |
| WERD *bln_word; //BL norm init word |
| TWERD *tessword; //tess format |
| WERD *init_word; //BL norm init word |
| PBLOB_IT outword_it; |
| PBLOB_IT initial_it; |
| inT16 i; |
| inT16 init_blobs_left; |
| BOOL8 matched; |
| TBOX out_box; |
| PBLOB *test_blob; |
| DENORM denorm; |
| float bln_xht; |
| inT16 j = 0; |
| |
| *match_count = 0; |
| *accepted_match_count = 0; |
| if (word->word->gblob_list ()->empty ()) |
| return; |
| |
| //xht used for blnorm |
| bln_xht = bln_x_height / word->denorm.scale (); |
| bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm); |
| /* |
| NOTE: Need to convert to tess format and back again to ensure that the |
| same float -> int rounding of coords is done to source wd as out wd before |
| comparison |
| */ |
| tessword = make_tess_word(bln_word, NULL); // Convert word. |
| init_word = make_ed_word (tessword, bln_word); |
| delete bln_word; |
| delete_word(tessword); |
| if (init_word == NULL) |
| return; |
| |
| initial_it.set_to_list (init_word->blob_list ()); |
| init_blobs_left = initial_it.length (); |
| outword_it.set_to_list (word->outword->blob_list ()); |
| |
| for (outword_it.mark_cycle_pt (); |
| !outword_it.cycled_list (); outword_it.forward ()) { |
| out_box = outword_it.data ()->bounding_box (); |
| |
| /* Skip any initial blobs LEFT of current outword blob */ |
| while (!initial_it.at_last () && |
| (initial_it.data ()->bounding_box ().left () < out_box.left ())) { |
| initial_it.forward (); |
| init_blobs_left--; |
| } |
| |
| /* See if current outword blob matches any initial blob with the same left |
| coord. (Normally only one but possibly more - in unknown order) */ |
| |
| i = 0; |
| matched = FALSE; |
| do { |
| test_blob = initial_it.data_relative (i++); |
| matched = crude_match_blobs (test_blob, outword_it.data ()); |
| if (matched && |
| (count_outline_errs (word->best_choice->unichar_string()[j], |
| outword_it.data ()->out_list ()->length ()) |
| == 0)) { |
| (*match_count)++; |
| if (word->reject_map[j].accepted ()) |
| (*accepted_match_count)++; |
| } |
| } |
| while (!matched && |
| (init_blobs_left - i > 0) && |
| (i < 129) && |
| !initial_it.at_last () && |
| test_blob->bounding_box ().left () == out_box.left ()); |
| j++; |
| } |
| delete init_word; |
| } |
| |
| |
| /************************************************************************* |
| * unrej_good_chs() |
| * Unreject POTENTIAL rejects if the blob passes the blob and outline checks |
| *************************************************************************/ |
| void unrej_good_chs(WERD_RES *word, ROW *row) { |
| WERD *bln_word; //BL norm init word |
| TWERD *tessword; //tess format |
| WERD *init_word; //BL norm init word |
| PBLOB_IT outword_it; |
| PBLOB_IT initial_it; |
| inT16 i; |
| inT16 init_blobs_left; |
| BOOL8 matched; |
| TBOX out_box; |
| PBLOB *test_blob; |
| DENORM denorm; |
| float bln_xht; |
| inT16 j = 0; |
| |
| if (word->word->gblob_list ()->empty ()) |
| return; |
| |
| //xht used for blnorm |
| bln_xht = bln_x_height / word->denorm.scale (); |
| bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm); |
| /* |
| NOTE: Need to convert to tess format and back again to ensure that the |
| same float -> int rounding of coords is done to source wd as out wd before |
| comparison |
| */ |
| tessword = make_tess_word(bln_word, NULL); // Convert word |
| init_word = make_ed_word (tessword, bln_word); |
| delete bln_word; |
| delete_word(tessword); |
| if (init_word == NULL) |
| return; |
| |
| initial_it.set_to_list (init_word->blob_list ()); |
| init_blobs_left = initial_it.length (); |
| outword_it.set_to_list (word->outword->blob_list ()); |
| |
| for (outword_it.mark_cycle_pt (); |
| !outword_it.cycled_list (); outword_it.forward ()) { |
| out_box = outword_it.data ()->bounding_box (); |
| |
| /* Skip any initial blobs LEFT of current outword blob */ |
| while (!initial_it.at_last () && |
| (initial_it.data ()->bounding_box ().left () < out_box.left ())) { |
| initial_it.forward (); |
| init_blobs_left--; |
| } |
| |
| /* See if current outword blob matches any initial blob with the same left |
| coord. (Normally only one but possibly more - in unknown order) */ |
| |
| i = 0; |
| matched = FALSE; |
| do { |
| test_blob = initial_it.data_relative (i++); |
| matched = crude_match_blobs (test_blob, outword_it.data ()); |
| if (matched && |
| (word->reject_map[j].accept_if_good_quality ()) && |
| (docqual_excuse_outline_errs || |
| (count_outline_errs (word->best_choice->unichar_string()[j], |
| outword_it.data ()->out_list ()-> |
| length ()) == 0))) |
| word->reject_map[j].setrej_quality_accept (); |
| } |
| while (!matched && |
| (init_blobs_left - i > 0) && |
| (i < 129) && |
| !initial_it.at_last () && |
| test_blob->bounding_box ().left () == out_box.left ()); |
| j++; |
| } |
| delete init_word; |
| } |
| |
| |
| void print_boxes(WERD *word) { |
| PBLOB_IT it; |
| TBOX box; |
| |
| it.set_to_list (word->blob_list ()); |
| for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { |
| box = it.data ()->bounding_box (); |
| box.print (); |
| } |
| } |
| |
| |
| inT16 count_outline_errs(char c, inT16 outline_count) { |
| int expected_outline_count; |
| |
| if (STRING (outlines_odd).contains (c)) |
| return 0; //Dont use this char |
| else if (STRING (outlines_2).contains (c)) |
| expected_outline_count = 2; |
| else |
| expected_outline_count = 1; |
| return abs (outline_count - expected_outline_count); |
| } |
| |
| |
| namespace tesseract { |
| void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, |
| BOOL8 good_quality_doc) { |
| if ((tessedit_good_quality_unrej && good_quality_doc)) |
| unrej_good_quality_words(page_res_it); |
| doc_and_block_rejection(page_res_it, good_quality_doc); |
| |
| page_res_it.restart_page (); |
| while (page_res_it.word () != NULL) { |
| insert_rej_cblobs(page_res_it.word()); |
| page_res_it.forward(); |
| } |
| |
| if (unlv_tilde_crunching) { |
| tilde_crunch(page_res_it); |
| tilde_delete(page_res_it); |
| } |
| } |
| |
| |
| /************************************************************************* |
| * unrej_good_quality_words() |
| * Accept potential rejects in words which pass the following checks: |
| * - Contains a potential reject |
| * - Word looks like a sensible alpha word. |
| * - Word segmentation is the same as the original image |
| * - All characters have the expected number of outlines |
| * NOTE - the rejection counts are recalculated after unrejection |
| * - CANT do it in a single pass without a bit of fiddling |
| * - keep it simple but inefficient |
| *************************************************************************/ |
| void Tesseract::unrej_good_quality_words( //unreject potential |
| PAGE_RES_IT &page_res_it) { |
| WERD_RES *word; |
| ROW_RES *current_row; |
| BLOCK_RES *current_block; |
| int i; |
| |
| page_res_it.restart_page (); |
| while (page_res_it.word () != NULL) { |
| check_debug_pt (page_res_it.word (), 100); |
| if (bland_unrej) { |
| word = page_res_it.word (); |
| for (i = 0; i < word->reject_map.length (); i++) { |
| if (word->reject_map[i].accept_if_good_quality ()) |
| word->reject_map[i].setrej_quality_accept (); |
| } |
| page_res_it.forward (); |
| } |
| else if ((page_res_it.row ()->char_count > 0) && |
| ((page_res_it.row ()->rej_count / |
| (float) page_res_it.row ()->char_count) <= |
| quality_rowrej_pc)) { |
| word = page_res_it.word (); |
| if (word->reject_map.quality_recoverable_rejects () && |
| (tessedit_unrej_any_wd || |
| acceptable_word_string (word->best_choice->unichar_string().string(), |
| word->best_choice->unichar_lengths().string()) |
| != AC_UNACCEPTABLE)) { |
| unrej_good_chs (word, page_res_it.row ()->row); |
| } |
| page_res_it.forward (); |
| } |
| else { |
| /* Skip to end of dodgy row */ |
| current_row = page_res_it.row (); |
| while ((page_res_it.word () != NULL) && |
| (page_res_it.row () == current_row)) |
| page_res_it.forward (); |
| } |
| check_debug_pt (page_res_it.word (), 110); |
| } |
| page_res_it.restart_page (); |
| page_res_it.page_res->char_count = 0; |
| page_res_it.page_res->rej_count = 0; |
| current_block = NULL; |
| current_row = NULL; |
| while (page_res_it.word () != NULL) { |
| if (current_block != page_res_it.block ()) { |
| current_block = page_res_it.block (); |
| current_block->char_count = 0; |
| current_block->rej_count = 0; |
| } |
| if (current_row != page_res_it.row ()) { |
| current_row = page_res_it.row (); |
| current_row->char_count = 0; |
| current_row->rej_count = 0; |
| current_row->whole_word_rej_count = 0; |
| } |
| page_res_it.rej_stat_word (); |
| page_res_it.forward (); |
| } |
| } |
| |
| |
| /************************************************************************* |
| * doc_and_block_rejection() |
| * |
| * If the page has too many rejects - reject all of it. |
| * If any block has too many rejects - reject all words in the block |
| *************************************************************************/ |
| |
| void Tesseract::doc_and_block_rejection( //reject big chunks |
| PAGE_RES_IT &page_res_it, |
| BOOL8 good_quality_doc) { |
| inT16 block_no = 0; |
| inT16 row_no = 0; |
| BLOCK_RES *current_block; |
| ROW_RES *current_row; |
| |
| BOOL8 rej_word; |
| BOOL8 prev_word_rejected; |
| inT16 char_quality; |
| inT16 accepted_char_quality; |
| |
| if ((page_res_it.page_res->rej_count * 100.0 / |
| page_res_it.page_res->char_count) > tessedit_reject_doc_percent) { |
| reject_whole_page(page_res_it); |
| #ifndef SECURE_NAMES |
| if (tessedit_debug_doc_rejection) { |
| tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n", |
| page_res_it.page_res->char_count, |
| page_res_it.page_res->rej_count); |
| } |
| #endif |
| } |
| else { |
| #ifndef SECURE_NAMES |
| if (tessedit_debug_doc_rejection) |
| tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", |
| page_res_it.page_res->char_count, |
| page_res_it.page_res->rej_count); |
| #endif |
| |
| /* Walk blocks testing for block rejection */ |
| |
| page_res_it.restart_page (); |
| while (page_res_it.word () != NULL) { |
| current_block = page_res_it.block (); |
| block_no = current_block->block->index(); |
| if ((page_res_it.block ()->char_count > 0) && |
| ((page_res_it.block ()->rej_count * 100.0 / |
| page_res_it.block ()->char_count) > |
| tessedit_reject_block_percent)) { |
| #ifndef SECURE_NAMES |
| if (tessedit_debug_block_rejection) |
| tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", |
| block_no, |
| page_res_it.block ()->char_count, |
| page_res_it.block ()->rej_count); |
| #endif |
| prev_word_rejected = FALSE; |
| while ((page_res_it.word () != NULL) && |
| (page_res_it.block () == current_block)) { |
| if (tessedit_preserve_blk_rej_perfect_wds) { |
| rej_word = |
| (page_res_it.word ()->reject_map.reject_count () > 0) |
| || (page_res_it.word ()->reject_map.length () < |
| tessedit_preserve_min_wd_len); |
| if (rej_word && tessedit_dont_blkrej_good_wds |
| && !(page_res_it.word ()->reject_map.length () < |
| tessedit_preserve_min_wd_len) |
| && |
| (acceptable_word_string |
| (page_res_it.word()->best_choice->unichar_string().string(), |
| page_res_it.word ()->best_choice->unichar_lengths().string()) != |
| AC_UNACCEPTABLE)) { |
| word_char_quality (page_res_it.word (), |
| page_res_it.row ()->row, |
| &char_quality, |
| &accepted_char_quality); |
| rej_word = char_quality != |
| page_res_it.word ()->reject_map.length (); |
| } |
| } |
| else |
| rej_word = TRUE; |
| if (rej_word) { |
| /* |
| Reject spacing if both current and prev words are rejected. |
| NOTE - this is NOT restricted to FUZZY spaces. - When tried this |
| generated more space errors. |
| */ |
| if (tessedit_use_reject_spaces && |
| prev_word_rejected && |
| (page_res_it.prev_row () == page_res_it.row ()) && |
| (page_res_it.word ()->word->space () == 1)) |
| page_res_it.word ()->reject_spaces = TRUE; |
| page_res_it.word ()->reject_map.rej_word_block_rej (); |
| } |
| prev_word_rejected = rej_word; |
| page_res_it.forward (); |
| } |
| } |
| else { |
| #ifndef SECURE_NAMES |
| if (tessedit_debug_block_rejection) |
| tprintf |
| ("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", |
| block_no, page_res_it.block ()->char_count, |
| page_res_it.block ()->rej_count); |
| #endif |
| |
| /* Walk rows in block testing for row rejection */ |
| row_no = 0; |
| while ((page_res_it.word () != NULL) && |
| (page_res_it.block () == current_block)) { |
| current_row = page_res_it.row (); |
| row_no++; |
| /* Reject whole row if: |
| fraction of chars on row which are rejected exceed a limit AND |
| fraction rejects which occur in WHOLE WERD rejects is LESS THAN a |
| limit |
| */ |
| if ((page_res_it.row ()->char_count > 0) && |
| ((page_res_it.row ()->rej_count * 100.0 / |
| page_res_it.row ()->char_count) > |
| tessedit_reject_row_percent) && |
| ((page_res_it.row ()->whole_word_rej_count * 100.0 / |
| page_res_it.row ()->rej_count) < |
| tessedit_whole_wd_rej_row_percent)) { |
| #ifndef SECURE_NAMES |
| if (tessedit_debug_block_rejection) |
| tprintf |
| ("REJECTING ROW %d #chars: %d; #Rejects: %d\n", |
| row_no, page_res_it.row ()->char_count, |
| page_res_it.row ()->rej_count); |
| #endif |
| prev_word_rejected = FALSE; |
| while ((page_res_it.word () != NULL) && |
| (page_res_it.row () == current_row)) { |
| /* Preserve words on good docs unless they are mostly rejected*/ |
| if (!tessedit_row_rej_good_docs && good_quality_doc) { |
| rej_word = |
| page_res_it.word ()->reject_map. |
| reject_count () / |
| (float) page_res_it.word ()->reject_map. |
| length () > tessedit_good_doc_still_rowrej_wd; |
| } |
| |
| /* Preserve perfect words anyway */ |
| else if (tessedit_preserve_row_rej_perfect_wds) { |
| rej_word = |
| (page_res_it.word ()->reject_map. |
| reject_count () > 0) |
| || (page_res_it.word ()->reject_map. |
| length () < tessedit_preserve_min_wd_len); |
| if (rej_word && tessedit_dont_rowrej_good_wds |
| && !(page_res_it.word ()->reject_map. |
| length () < |
| tessedit_preserve_min_wd_len) |
| && |
| (acceptable_word_string |
| (page_res_it.word ()->best_choice-> |
| unichar_string().string(), |
| page_res_it.word ()->best_choice-> |
| unichar_lengths().string()) != AC_UNACCEPTABLE)) { |
| word_char_quality (page_res_it.word (), |
| page_res_it.row ()->row, |
| &char_quality, |
| &accepted_char_quality); |
| rej_word = char_quality != |
| page_res_it.word ()->reject_map.length (); |
| } |
| } |
| else |
| rej_word = TRUE; |
| if (rej_word) { |
| /* |
| Reject spacing if both current and prev words are rejected. |
| NOTE - this is NOT restricted to FUZZY spaces. - When tried |
| this generated more space errors. |
| */ |
| if (tessedit_use_reject_spaces && |
| prev_word_rejected && |
| (page_res_it.prev_row () == |
| page_res_it.row ()) |
| && (page_res_it.word ()->word->space () == |
| 1)) |
| page_res_it.word ()->reject_spaces = TRUE; |
| page_res_it.word ()->reject_map. |
| rej_word_row_rej(); |
| } |
| prev_word_rejected = rej_word; |
| page_res_it.forward (); |
| } |
| } |
| else { |
| #ifndef SECURE_NAMES |
| if (tessedit_debug_block_rejection) |
| tprintf |
| ("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", |
| row_no, page_res_it.row ()->char_count, |
| page_res_it.row ()->rej_count); |
| #endif |
| while ((page_res_it.word () != NULL) && |
| (page_res_it.row () == current_row)) |
| page_res_it.forward (); |
| } |
| } |
| } |
| } |
| } |
| } |
| } // namespace tesseract |
| |
| |
| /************************************************************************* |
| * reject_whole_page() |
| * Dont believe any of it - set the reject map to 00..00 in all words |
| * |
| *************************************************************************/ |
| |
| void reject_whole_page(PAGE_RES_IT &page_res_it) { |
| page_res_it.restart_page (); |
| while (page_res_it.word () != NULL) { |
| page_res_it.word ()->reject_map.rej_word_doc_rej (); |
| page_res_it.forward (); |
| } |
| //whole page is rejected |
| page_res_it.page_res->rejected = TRUE; |
| } |
| |
| namespace tesseract { |
| void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) { |
| WERD_RES *word; |
| GARBAGE_LEVEL garbage_level; |
| PAGE_RES_IT copy_it; |
| BOOL8 prev_potential_marked = FALSE; |
| BOOL8 found_terrible_word = FALSE; |
| BOOL8 ok_dict_word; |
| |
| page_res_it.restart_page (); |
| while (page_res_it.word () != NULL) { |
| word = page_res_it.word (); |
| |
| if (crunch_early_convert_bad_unlv_chs) |
| convert_bad_unlv_chs(word); |
| |
| if (crunch_early_merge_tess_fails) |
| merge_tess_fails(word); |
| |
| if (word->reject_map.accept_count () != 0) { |
| found_terrible_word = FALSE; |
| //Forget earlier potential crunches |
| prev_potential_marked = FALSE; |
| } |
| else { |
| ok_dict_word = safe_dict_word(*(word->best_choice)); |
| garbage_level = garbage_word (word, ok_dict_word); |
| |
| if ((garbage_level != G_NEVER_CRUNCH) && |
| (terrible_word_crunch (word, garbage_level))) { |
| if (crunch_debug > 0) { |
| tprintf ("T CRUNCHING: \"%s\"\n", |
| word->best_choice->unichar_string().string()); |
| } |
| word->unlv_crunch_mode = CR_KEEP_SPACE; |
| if (prev_potential_marked) { |
| while (copy_it.word () != word) { |
| if (crunch_debug > 0) { |
| tprintf ("P1 CRUNCHING: \"%s\"\n", |
| copy_it.word()->best_choice->unichar_string().string()); |
| } |
| copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE; |
| copy_it.forward (); |
| } |
| prev_potential_marked = FALSE; |
| } |
| found_terrible_word = TRUE; |
| } |
| else if ((garbage_level != G_NEVER_CRUNCH) && |
| (potential_word_crunch (word, |
| garbage_level, ok_dict_word))) { |
| if (found_terrible_word) { |
| if (crunch_debug > 0) { |
| tprintf ("P2 CRUNCHING: \"%s\"\n", |
| word->best_choice->unichar_string().string()); |
| } |
| word->unlv_crunch_mode = CR_KEEP_SPACE; |
| } |
| else if (!prev_potential_marked) { |
| copy_it = page_res_it; |
| prev_potential_marked = TRUE; |
| if (crunch_debug > 1) { |
| tprintf ("P3 CRUNCHING: \"%s\"\n", |
| word->best_choice->unichar_string().string()); |
| } |
| } |
| } |
| else { |
| found_terrible_word = FALSE; |
| //Forget earlier potential crunches |
| prev_potential_marked = FALSE; |
| if (crunch_debug > 2) { |
| tprintf ("NO CRUNCH: \"%s\"\n", |
| word->best_choice->unichar_string().string()); |
| } |
| } |
| } |
| page_res_it.forward (); |
| } |
| } |
| } // namespace tesseract |
| |
| |
| BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) { |
| float rating_per_ch; |
| int adjusted_len; |
| int crunch_mode = 0; |
| |
| if ((word->best_choice->unichar_string().length () == 0) || |
| (strspn (word->best_choice->unichar_string().string(), " ") == |
| word->best_choice->unichar_string().length ())) |
| crunch_mode = 1; |
| else { |
| adjusted_len = word->reject_map.length (); |
| if (adjusted_len > crunch_rating_max) |
| adjusted_len = crunch_rating_max; |
| rating_per_ch = word->best_choice->rating () / adjusted_len; |
| |
| if (rating_per_ch > crunch_terrible_rating) |
| crunch_mode = 2; |
| else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) |
| crunch_mode = 3; |
| else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) && |
| (garbage_level != G_OK)) |
| crunch_mode = 4; |
| else if ((rating_per_ch > crunch_poor_garbage_rate) && |
| (garbage_level != G_OK)) |
| crunch_mode = 5; |
| } |
| if (crunch_mode > 0) { |
| if (crunch_debug > 2) { |
| tprintf ("Terrible_word_crunch (%d) on \"%s\"\n", |
| crunch_mode, word->best_choice->unichar_string().string()); |
| } |
| return TRUE; |
| } |
| else |
| return FALSE; |
| } |
| |
| namespace tesseract { |
| BOOL8 Tesseract::potential_word_crunch(WERD_RES *word, |
| GARBAGE_LEVEL garbage_level, |
| BOOL8 ok_dict_word) { |
| float rating_per_ch; |
| int adjusted_len; |
| const char *str = word->best_choice->unichar_string().string(); |
| const char *lengths = word->best_choice->unichar_lengths().string(); |
| BOOL8 word_crunchable; |
| int poor_indicator_count = 0; |
| |
| word_crunchable = |
| !crunch_leave_accept_strings || |
| (word->reject_map.length () < 3) || |
| ((acceptable_word_string (str, lengths) == AC_UNACCEPTABLE) && |
| !ok_dict_word); |
| |
| adjusted_len = word->reject_map.length (); |
| if (adjusted_len > 10) |
| adjusted_len = 10; |
| rating_per_ch = word->best_choice->rating () / adjusted_len; |
| |
| if (rating_per_ch > crunch_pot_poor_rate) { |
| if (crunch_debug > 2) { |
| tprintf ("Potential poor rating on \"%s\"\n", |
| word->best_choice->unichar_string().string()); |
| } |
| poor_indicator_count++; |
| } |
| |
| if (word_crunchable && |
| (word->best_choice->certainty () < crunch_pot_poor_cert)) { |
| if (crunch_debug > 2) { |
| tprintf ("Potential poor cert on \"%s\"\n", |
| word->best_choice->unichar_string().string()); |
| } |
| poor_indicator_count++; |
| } |
| |
| if (garbage_level != G_OK) { |
| if (crunch_debug > 2) { |
| tprintf ("Potential garbage on \"%s\"\n", |
| word->best_choice->unichar_string().string()); |
| } |
| poor_indicator_count++; |
| } |
| return (poor_indicator_count >= crunch_pot_indicators); |
| } |
| } // namespace tesseract |
| |
| |
| namespace tesseract { |
| void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { |
| WERD_RES *word; |
| PAGE_RES_IT copy_it; |
| BOOL8 deleting_from_bol = FALSE; |
| BOOL8 marked_delete_point = FALSE; |
| inT16 debug_delete_mode; |
| CRUNCH_MODE delete_mode; |
| inT16 x_debug_delete_mode; |
| CRUNCH_MODE x_delete_mode; |
| |
| page_res_it.restart_page (); |
| while (page_res_it.word () != NULL) { |
| word = page_res_it.word (); |
| |
| delete_mode = word_deletable (word, debug_delete_mode); |
| if (delete_mode != CR_NONE) { |
| if (word->word->flag (W_BOL) || deleting_from_bol) { |
| if (crunch_debug > 0) { |
| tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n", |
| debug_delete_mode, |
| word->best_choice->unichar_string().string()); |
| } |
| word->unlv_crunch_mode = delete_mode; |
| deleting_from_bol = TRUE; |
| } |
| else if (word->word->flag (W_EOL)) { |
| if (marked_delete_point) { |
| while (copy_it.word () != word) { |
| x_delete_mode = word_deletable (copy_it.word (), |
| x_debug_delete_mode); |
| if (crunch_debug > 0) { |
| tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", |
| x_debug_delete_mode, |
| copy_it.word()->best_choice->unichar_string().string()); |
| } |
| copy_it.word ()->unlv_crunch_mode = x_delete_mode; |
| copy_it.forward (); |
| } |
| } |
| if (crunch_debug > 0) { |
| tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", |
| debug_delete_mode, |
| word->best_choice->unichar_string().string()); |
| } |
| word->unlv_crunch_mode = delete_mode; |
| deleting_from_bol = FALSE; |
| marked_delete_point = FALSE; |
| } |
| else { |
| if (!marked_delete_point) { |
| copy_it = page_res_it; |
| marked_delete_point = TRUE; |
| } |
| } |
| } |
| else { |
| deleting_from_bol = FALSE; |
| //Forget earlier potential crunches |
| marked_delete_point = FALSE; |
| } |
| /* |
| The following step has been left till now as the tess fails are used to |
| determine if the word is deletable. |
| */ |
| if (!crunch_early_merge_tess_fails) |
| merge_tess_fails(word); |
| page_res_it.forward (); |
| } |
| } |
| |
| |
| void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { |
| int i; |
| UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-"); |
| UNICHAR_ID unichar_space = unicharset.unichar_to_id(" "); |
| UNICHAR_ID unichar_tilde = unicharset.unichar_to_id("~"); |
| UNICHAR_ID unichar_pow = unicharset.unichar_to_id("^"); |
| bool modified = false; |
| for (i = 0; i < word_res->reject_map.length(); ++i) { |
| if (word_res->best_choice->unichar_id(i) == unichar_tilde) { |
| word_res->best_choice->set_unichar_id(unichar_dash, i); |
| modified = true; |
| if (word_res->reject_map[i].accepted ()) |
| word_res->reject_map[i].setrej_unlv_rej (); |
| } |
| if (word_res->best_choice->unichar_id(i) == unichar_pow) { |
| word_res->best_choice->set_unichar_id(unichar_space, i); |
| modified = true; |
| if (word_res->reject_map[i].accepted ()) |
| word_res->reject_map[i].setrej_unlv_rej (); |
| } |
| } |
| if (modified) { |
| word_res->best_choice->populate_unichars(unicharset); |
| } |
| } |
| |
| // Change pairs of tess failures to a single one |
| void Tesseract::merge_tess_fails(WERD_RES *word_res) { |
| PBLOB_IT blob_it; //blobs |
| int len = word_res->best_choice->length(); |
| bool modified = false; |
| |
| ASSERT_HOST (word_res->reject_map.length () == len); |
| ASSERT_HOST (word_res->outword->blob_list ()->length () == len); |
| |
| UNICHAR_ID unichar_space = unicharset.unichar_to_id(" "); |
| blob_it = word_res->outword->blob_list (); |
| int i = 0; |
| while (i < word_res->best_choice->length()-1) { |
| if ((word_res->best_choice->unichar_id(i) == unichar_space) && |
| (word_res->best_choice->unichar_id(i+1) == unichar_space)) { |
| modified = true; |
| word_res->best_choice->remove_unichar_id(i); |
| word_res->reject_map.remove_pos (i); |
| merge_blobs (blob_it.data_relative (1), blob_it.data ()); |
| delete blob_it.extract (); //get rid of spare |
| } else { |
| i++; |
| } |
| blob_it.forward (); |
| } |
| len = word_res->best_choice->length(); |
| ASSERT_HOST (word_res->reject_map.length () == len); |
| ASSERT_HOST (word_res->outword->blob_list ()->length () == len); |
| if (modified) { |
| word_res->best_choice->populate_unichars(unicharset); |
| } |
| } |
| |
| GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { |
| enum STATES |
| { |
| JUNK, |
| FIRST_UPPER, |
| FIRST_LOWER, |
| FIRST_NUM, |
| SUBSEQUENT_UPPER, |
| SUBSEQUENT_LOWER, |
| SUBSEQUENT_NUM |
| }; |
| const char *str = word->best_choice->unichar_string().string(); |
| const char *lengths = word->best_choice->unichar_lengths().string(); |
| STATES state = JUNK; |
| int len = 0; |
| int isolated_digits = 0; |
| int isolated_alphas = 0; |
| int bad_char_count = 0; |
| int tess_rejs = 0; |
| int dodgy_chars = 0; |
| int ok_chars; |
| UNICHAR_ID last_char = -1; |
| int alpha_repetition_count = 0; |
| int longest_alpha_repetition_count = 0; |
| int longest_lower_run_len = 0; |
| int lower_string_count = 0; |
| int longest_upper_run_len = 0; |
| int upper_string_count = 0; |
| int total_alpha_count = 0; |
| int total_digit_count = 0; |
| |
| for (; *str != '\0'; str += *(lengths++)) { |
| len++; |
| if (unicharset.get_isupper (str, *lengths)) { |
| total_alpha_count++; |
| switch (state) { |
| case SUBSEQUENT_UPPER: |
| case FIRST_UPPER: |
| state = SUBSEQUENT_UPPER; |
| upper_string_count++; |
| if (longest_upper_run_len < upper_string_count) |
| longest_upper_run_len = upper_string_count; |
| if (last_char == unicharset.unichar_to_id(str, *lengths)) { |
| alpha_repetition_count++; |
| if (longest_alpha_repetition_count < alpha_repetition_count) { |
| longest_alpha_repetition_count = alpha_repetition_count; |
| } |
| } |
| else { |
| last_char = unicharset.unichar_to_id(str, *lengths); |
| alpha_repetition_count = 1; |
| } |
| break; |
| case FIRST_NUM: |
| isolated_digits++; |
| default: |
| state = FIRST_UPPER; |
| last_char = unicharset.unichar_to_id(str, *lengths); |
| alpha_repetition_count = 1; |
| upper_string_count = 1; |
| break; |
| } |
| } |
| else if (unicharset.get_islower (str, *lengths)) { |
| total_alpha_count++; |
| switch (state) { |
| case SUBSEQUENT_LOWER: |
| case FIRST_LOWER: |
| state = SUBSEQUENT_LOWER; |
| lower_string_count++; |
| if (longest_lower_run_len < lower_string_count) |
| longest_lower_run_len = lower_string_count; |
| if (last_char == unicharset.unichar_to_id(str, *lengths)) { |
| alpha_repetition_count++; |
| if (longest_alpha_repetition_count < alpha_repetition_count) { |
| longest_alpha_repetition_count = alpha_repetition_count; |
| } |
| } |
| else { |
| last_char = unicharset.unichar_to_id(str, *lengths); |
| alpha_repetition_count = 1; |
| } |
| break; |
| case FIRST_NUM: |
| isolated_digits++; |
| default: |
| state = FIRST_LOWER; |
| last_char = unicharset.unichar_to_id(str, *lengths); |
| alpha_repetition_count = 1; |
| lower_string_count = 1; |
| break; |
| } |
| } |
| else if (unicharset.get_isdigit (str, *lengths)) { |
| total_digit_count++; |
| switch (state) { |
| case FIRST_NUM: |
| state = SUBSEQUENT_NUM; |
| case SUBSEQUENT_NUM: |
| break; |
| case FIRST_UPPER: |
| case FIRST_LOWER: |
| isolated_alphas++; |
| default: |
| state = FIRST_NUM; |
| break; |
| } |
| } |
| else { |
| if (*lengths == 1 && *str == ' ') |
| tess_rejs++; |
| else |
| bad_char_count++; |
| switch (state) { |
| case FIRST_NUM: |
| isolated_digits++; |
| break; |
| case FIRST_UPPER: |
| case FIRST_LOWER: |
| isolated_alphas++; |
| default: |
| break; |
| } |
| state = JUNK; |
| } |
| } |
| |
| switch (state) { |
| case FIRST_NUM: |
| isolated_digits++; |
| break; |
| case FIRST_UPPER: |
| case FIRST_LOWER: |
| isolated_alphas++; |
| default: |
| break; |
| } |
| |
| if (crunch_include_numerals) { |
| total_alpha_count += total_digit_count - isolated_digits; |
| } |
| |
| if (crunch_leave_ok_strings && |
| (len >= 4) && |
| (2 * (total_alpha_count - isolated_alphas) > len) && |
| (longest_alpha_repetition_count < crunch_long_repetitions)) { |
| if ((crunch_accept_ok && |
| (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE)) || |
| (longest_lower_run_len > crunch_leave_lc_strings) || |
| (longest_upper_run_len > crunch_leave_uc_strings)) |
| return G_NEVER_CRUNCH; |
| } |
| if ((word->reject_map.length () > 1) && |
| (strpbrk (str, " ") == NULL) && |
| ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || |
| (word->best_choice->permuter () == FREQ_DAWG_PERM) || |
| (word->best_choice->permuter () == USER_DAWG_PERM) || |
| (word->best_choice->permuter () == NUMBER_PERM) || |
| (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE) || ok_dict_word)) |
| return G_OK; |
| |
| ok_chars = len - bad_char_count - isolated_digits - |
| isolated_alphas - tess_rejs; |
| |
| if (crunch_debug > 3) { |
| tprintf ("garbage_word: \"%s\"\n", |
| word->best_choice->unichar_string().string()); |
| tprintf ("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", |
| len, |
| bad_char_count, isolated_digits, isolated_alphas, tess_rejs); |
| } |
| if ((bad_char_count == 0) && |
| (tess_rejs == 0) && |
| ((len > isolated_digits + isolated_alphas) || (len <= 2))) |
| return G_OK; |
| |
| if ((tess_rejs > ok_chars) || |
| ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len))) |
| return G_TERRIBLE; |
| |
| if (len > 4) { |
| dodgy_chars = 2 * tess_rejs + bad_char_count + |
| isolated_digits + isolated_alphas; |
| if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5)) |
| return G_DODGY; |
| else |
| return G_OK; |
| } |
| else { |
| dodgy_chars = 2 * tess_rejs + bad_char_count; |
| if (((len == 4) && (dodgy_chars > 2)) || |
| ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len)) |
| return G_DODGY; |
| else |
| return G_OK; |
| } |
| } |
| } // namespace tesseract |
| |
| |
| /************************************************************************* |
| * word_deletable() |
| * DELETE WERDS AT ENDS OF ROWS IF |
| * Word is crunched && |
| * ( string length = 0 OR |
| * > 50% of chars are "|" (before merging) OR |
| * certainty < -10 OR |
| * rating /char > 60 OR |
| * TOP of word is more than 0.5 xht BELOW baseline OR |
| * BOTTOM of word is more than 0.5 xht ABOVE xht OR |
| * length of word < 3xht OR |
| * height of word < 0.7 xht OR |
| * height of word > 3.0 xht OR |
| * >75% of the outline BBs have longest dimension < 0.5xht |
| *************************************************************************/ |
| |
| CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) { |
| int word_len = word->reject_map.length (); |
| float rating_per_ch; |
| TBOX box; //BB of word |
| |
| if (word->unlv_crunch_mode == CR_NONE) { |
| delete_mode = 0; |
| return CR_NONE; |
| } |
| |
| if (word_len == 0) { |
| delete_mode = 1; |
| return CR_DELETE; |
| } |
| |
| box = word->outword->bounding_box (); |
| if (box.height () < crunch_del_min_ht * bln_x_height) { |
| delete_mode = 4; |
| return CR_DELETE; |
| } |
| |
| if (noise_outlines (word->outword)) { |
| delete_mode = 5; |
| return CR_DELETE; |
| } |
| |
| if ((failure_count (word) * 1.5) > word_len) { |
| delete_mode = 2; |
| return CR_LOOSE_SPACE; |
| } |
| |
| if (word->best_choice->certainty () < crunch_del_cert) { |
| delete_mode = 7; |
| return CR_LOOSE_SPACE; |
| } |
| |
| rating_per_ch = word->best_choice->rating () / word_len; |
| |
| if (rating_per_ch > crunch_del_rating) { |
| delete_mode = 8; |
| return CR_LOOSE_SPACE; |
| } |
| |
| if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) { |
| delete_mode = 9; |
| return CR_LOOSE_SPACE; |
| } |
| |
| if (box.bottom () > |
| bln_baseline_offset + crunch_del_high_word * bln_x_height) { |
| delete_mode = 10; |
| return CR_LOOSE_SPACE; |
| } |
| |
| if (box.height () > crunch_del_max_ht * bln_x_height) { |
| delete_mode = 11; |
| return CR_LOOSE_SPACE; |
| } |
| |
| if (box.width () < crunch_del_min_width * bln_x_height) { |
| delete_mode = 3; |
| return CR_LOOSE_SPACE; |
| } |
| |
| delete_mode = 0; |
| return CR_NONE; |
| } |
| |
| inT16 failure_count(WERD_RES *word) { |
| const char *str = word->best_choice->unichar_string().string(); |
| int tess_rejs = 0; |
| |
| for (; *str != '\0'; str++) { |
| if (*str == ' ') |
| tess_rejs++; |
| } |
| return tess_rejs; |
| } |
| |
| |
| BOOL8 noise_outlines(WERD *word) { |
| PBLOB_IT blob_it; |
| OUTLINE_IT outline_it; |
| TBOX box; //BB of outline |
| inT16 outline_count = 0; |
| inT16 small_outline_count = 0; |
| inT16 max_dimension; |
| float small_limit = bln_x_height * crunch_small_outlines_size; |
| |
| blob_it.set_to_list (word->blob_list ()); |
| for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { |
| outline_it.set_to_list (blob_it.data ()->out_list ()); |
| for (outline_it.mark_cycle_pt (); |
| !outline_it.cycled_list (); outline_it.forward ()) { |
| outline_count++; |
| box = outline_it.data ()->bounding_box (); |
| if (box.height () > box.width ()) |
| max_dimension = box.height (); |
| else |
| max_dimension = box.width (); |
| if (max_dimension < small_limit) |
| small_outline_count++; |
| } |
| } |
| return (small_outline_count >= outline_count); |
| } |
| |
| |
| /************************************************************************* |
| * insert_rej_cblobs() |
| * Put rejected word blobs back into the outword. |
| * NOTE!!! AFTER THIS THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER |
| * OF ELEMENTS. |
| *************************************************************************/ |
| namespace tesseract { |
| void Tesseract::insert_rej_cblobs(WERD_RES *word) { |
| PBLOB_IT blob_it; //blob iterator |
| PBLOB_IT rej_blob_it; |
| const STRING *word_str; |
| const STRING *word_lengths; |
| int old_len; |
| int rej_len; |
| char new_str[512 * UNICHAR_LEN]; |
| char new_lengths[512]; |
| REJMAP new_map; |
| int i = 0; //new_str index |
| int j = 0; //old_str index |
| int i_offset = 0; //new_str offset |
| int j_offset = 0; //old_str offset |
| int new_len; |
| |
| gblob_sort_list (word->outword->rej_blob_list (), TRUE); |
| rej_blob_it.set_to_list (word->outword->rej_blob_list ()); |
| if (rej_blob_it.empty ()) |
| return; |
| rej_len = rej_blob_it.length (); |
| blob_it.set_to_list (word->outword->blob_list ()); |
| word_str = &(word->best_choice->unichar_string()); |
| word_lengths = &(word->best_choice->unichar_lengths()); |
| old_len = word->best_choice->length(); |
| ASSERT_HOST (word->reject_map.length () == old_len); |
| ASSERT_HOST (blob_it.length () == old_len); |
| if ((old_len + rej_len) > 511) |
| return; //Word is garbage anyway prevent abort |
| new_map.initialise (old_len + rej_len); |
| |
| while (!rej_blob_it.empty ()) { |
| if ((j >= old_len) || |
| (rej_blob_it.data ()->bounding_box ().left () <= |
| blob_it.data ()->bounding_box ().left ())) { |
| /* Insert reject blob */ |
| if (j >= old_len) |
| blob_it.add_to_end (rej_blob_it.extract ()); |
| else |
| blob_it.add_before_stay_put (rej_blob_it.extract ()); |
| if (!rej_blob_it.empty ()) |
| rej_blob_it.forward (); |
| new_str[i_offset] = ' '; |
| new_lengths[i] = 1; |
| new_map[i].setrej_rej_cblob (); |
| i_offset += new_lengths[i++]; |
| } |
| else { |
| strncpy(new_str + i_offset, &(*word_str)[j_offset], |
| (*word_lengths)[j]); |
| new_lengths[i] = (*word_lengths)[j]; |
| new_map[i] = word->reject_map[j]; |
| i_offset += new_lengths[i++]; |
| j_offset += (*word_lengths)[j++]; |
| blob_it.forward (); |
| } |
| } |
| /* Add any extra normal blobs to strings */ |
| while (j < word_lengths->length ()) { |
| strncpy(new_str + i_offset, &(*word_str)[j_offset], |
| (*word_lengths)[j]); |
| new_lengths[i] = (*word_lengths)[j]; |
| new_map[i] = word->reject_map[j]; |
| i_offset += new_lengths[i++]; |
| j_offset += (*word_lengths)[j++]; |
| } |
| new_str[i_offset] = '\0'; |
| new_lengths[i] = 0; |
| /* |
| tprintf( |
| "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n", |
| old_len, i, new_str, new_map ); |
| */ |
| ASSERT_HOST (i == blob_it.length ()); |
| ASSERT_HOST (i == old_len + rej_len); |
| word->reject_map = new_map; |
| |
| // Update word->best_choice if needed. |
| if (strcmp(new_str, word->best_choice->unichar_string().string()) != 0 || |
| strcmp(new_lengths, word->best_choice->unichar_lengths().string()) != 0) { |
| WERD_CHOICE *new_choice = |
| new WERD_CHOICE(new_str, new_lengths, |
| word->best_choice->rating(), |
| word->best_choice->certainty(), |
| word->best_choice->permuter(), |
| getDict().getUnicharset()); |
| new_choice->populate_unichars(getDict().getUnicharset()); |
| delete word->best_choice; |
| word->best_choice = new_choice; |
| } |
| new_len = word->best_choice->length(); |
| ASSERT_HOST (word->reject_map.length () == new_len); |
| ASSERT_HOST (word->outword->blob_list ()->length () == new_len); |
| |
| } |
| } // namespace tesseract |