| /********************************************************************** |
| * File: reject.cpp (Formerly reject.c) |
| * Description: Rejection functions used in tessedit |
| * Author: Phil Cheatle |
| * Created: Wed Sep 23 16:50:21 BST 1992 |
| * |
| * (C) Copyright 1992, Hewlett-Packard Ltd. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| * |
| **********************************************************************/ |
| |
| #include "mfcpch.h" |
| #include "tessvars.h" |
| #ifdef __UNIX__ |
| #include <assert.h> |
| #include <errno.h> |
| #endif |
| #include "scanutils.h" |
| #include <ctype.h> |
| #include <string.h> |
| //#include "tessbox.h" |
| #include "memry.h" |
| #include "reject.h" |
| #include "tfacep.h" |
| #include "mainblk.h" |
| #include "charcut.h" |
| #include "imgs.h" |
| #include "scaleimg.h" |
| #include "control.h" |
| #include "docqual.h" |
| #include "secname.h" |
| #include "globals.h" |
| |
| /* #define SECURE_NAMES done in secnames.h when necessary */ |
| |
| //extern "C" { |
| #include "callnet.h" |
| //} |
| #include "tesseractclass.h" |
| #include "notdll.h" |
| |
| CLISTIZEH (STRING) CLISTIZE (STRING) |
| #define EXTERN |
| EXTERN |
| INT_VAR (tessedit_reject_mode, 0, "Rejection algorithm"); |
| EXTERN |
| INT_VAR (tessedit_ok_mode, 5, "Acceptance decision algorithm"); |
| EXTERN |
| BOOL_VAR (tessedit_use_nn, FALSE, ""); |
| EXTERN |
| BOOL_VAR (tessedit_rejection_debug, FALSE, "Adaption debug"); |
| EXTERN |
| BOOL_VAR (tessedit_rejection_stats, FALSE, "Show NN stats"); |
| EXTERN |
| BOOL_VAR (tessedit_flip_0O, TRUE, "Contextual 0O O0 flips"); |
| EXTERN |
| double_VAR (tessedit_lower_flip_hyphen, 1.5, |
| "Aspect ratio dot/hyphen test"); |
| EXTERN |
| double_VAR (tessedit_upper_flip_hyphen, 1.8, |
| "Aspect ratio dot/hyphen test"); |
| |
| EXTERN |
| BOOL_VAR (rej_trust_doc_dawg, FALSE, |
| "Use DOC dawg in 11l conf. detector"); |
| EXTERN |
| BOOL_VAR (rej_1Il_use_dict_word, FALSE, "Use dictword test"); |
| EXTERN |
| BOOL_VAR (rej_1Il_trust_permuter_type, TRUE, "Dont double check"); |
| |
| EXTERN |
| BOOL_VAR (one_ell_conflict_default, TRUE, "one_ell_conflict default"); |
| EXTERN |
| BOOL_VAR (show_char_clipping, FALSE, "Show clip image window?"); |
| EXTERN |
| BOOL_VAR (nn_debug, FALSE, "NN DEBUGGING?"); |
| EXTERN |
| BOOL_VAR (nn_reject_debug, FALSE, "NN DEBUG each char?"); |
| EXTERN |
| BOOL_VAR (nn_lax, FALSE, "Use 2nd rate matches"); |
| EXTERN |
| BOOL_VAR (nn_double_check_dict, FALSE, "Double check"); |
| EXTERN |
| BOOL_VAR (nn_conf_double_check_dict, TRUE, |
| "Double check for confusions"); |
| EXTERN |
| BOOL_VAR (nn_conf_1Il, TRUE, "NN use 1Il conflicts"); |
| EXTERN |
| BOOL_VAR (nn_conf_Ss, TRUE, "NN use Ss conflicts"); |
| EXTERN |
| BOOL_VAR (nn_conf_hyphen, TRUE, "NN hyphen conflicts"); |
| EXTERN |
| BOOL_VAR (nn_conf_test_good_qual, FALSE, "NN dodgy 1Il cross check"); |
| EXTERN |
| BOOL_VAR (nn_conf_test_dict, TRUE, "NN dodgy 1Il cross check"); |
| EXTERN |
| BOOL_VAR (nn_conf_test_sensible, TRUE, "NN dodgy 1Il cross check"); |
| EXTERN |
| BOOL_VAR (nn_conf_strict_on_dodgy_chs, TRUE, |
| "Require stronger NN match"); |
| EXTERN |
| double_VAR (nn_dodgy_char_threshold, 0.99, "min accept score"); |
| EXTERN |
| INT_VAR (nn_conf_accept_level, 4, "NN accept dodgy 1Il matches? "); |
| EXTERN |
| INT_VAR (nn_conf_initial_i_level, 3, |
| "NN accept initial Ii match level "); |
| |
| EXTERN |
| BOOL_VAR (no_unrej_dubious_chars, TRUE, "Dubious chars next to reject?"); |
| EXTERN |
| BOOL_VAR (no_unrej_no_alphanum_wds, TRUE, "Stop unrej of non A/N wds?"); |
| EXTERN |
| BOOL_VAR (no_unrej_1Il, FALSE, "Stop unrej of 1Ilchars?"); |
| EXTERN |
| BOOL_VAR (rej_use_tess_accepted, TRUE, "Individual rejection control"); |
| EXTERN |
| BOOL_VAR (rej_use_tess_blanks, TRUE, "Individual rejection control"); |
| EXTERN |
| BOOL_VAR (rej_use_good_perm, TRUE, "Individual rejection control"); |
| EXTERN |
| BOOL_VAR (rej_use_sensible_wd, FALSE, "Extend permuter check"); |
| EXTERN |
| BOOL_VAR (rej_alphas_in_number_perm, FALSE, "Extend permuter check"); |
| |
| EXTERN |
| double_VAR (rej_whole_of_mostly_reject_word_fract, 0.85, |
| "if >this fract"); |
| EXTERN |
| INT_VAR (rej_mostly_reject_mode, 1, |
| "0-never, 1-afterNN, 2-after new xht"); |
| EXTERN |
| double_VAR (tessed_fullstop_aspect_ratio, 1.2, |
| "if >this fract then reject"); |
| |
| EXTERN |
| INT_VAR (net_image_width, 40, "NN input image width"); |
| EXTERN |
| INT_VAR (net_image_height, 36, "NN input image height"); |
| EXTERN |
| INT_VAR (net_image_x_height, 22, "NN input image x_height"); |
| EXTERN |
| INT_VAR (tessedit_image_border, 2, "Rej blbs near image edge limit"); |
| |
| /* |
| Net input is assumed to have (net_image_width * net_image_height) input |
| units of image pixels, followed by 0, 1, or N units representing the |
| baseline position. 0 implies no baseline information. 1 implies a floating |
| point value. N implies a "guage" of N units. For any char an initial set |
| of these are ON, the remainder OFF to indicate the "level" of the |
| baseline. |
| |
| HOWEVER!!! NOTE THAT EACH NEW INPUT LAYER FORMAT EXPECTS TO BE RUN WITH A |
| DIFFERENT tessed/netmatch/nmatch.c MODULE. - These are classic C modules |
| generated by aspirin with HARD CODED CONSTANTS |
| */ |
| |
| EXTERN |
| INT_VAR (net_bl_nodes, 20, "Number of baseline nodes"); |
| |
| EXTERN |
| double_VAR (nn_reject_threshold, 0.5, "NN min accept score"); |
| EXTERN |
| double_VAR (nn_reject_head_and_shoulders, 0.6, "top scores sep factor"); |
| |
| /* NOTE - ctoh doesn't handle "=" properly, hence \075 */ |
| EXTERN |
| STRING_VAR (ok_single_ch_non_alphanum_wds, "-?\075", |
| "Allow NN to unrej"); |
| EXTERN |
| STRING_VAR (ok_repeated_ch_non_alphanum_wds, "-?*\075", |
| "Allow NN to unrej"); |
| EXTERN |
| STRING_VAR (conflict_set_I_l_1, "Il1[]", "Il1 conflict set"); |
| EXTERN |
| STRING_VAR (conflict_set_S_s, "Ss$", "Ss conflict set"); |
| EXTERN |
| STRING_VAR (conflict_set_hyphen, "-_~", "hyphen conflict set"); |
| EXTERN |
| STRING_VAR (dubious_chars_left_of_reject, "!'+`()-./\\<>;:^_,~\"", |
| "Unreliable chars"); |
| EXTERN |
| STRING_VAR (dubious_chars_right_of_reject, "!'+`()-./\\<>;:^_,~\"", |
| "Unreliable chars"); |
| |
| EXTERN |
| INT_VAR (min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this"); |
| |
| /************************************************************************* |
| * set_done() |
| * |
| * Set the done flag based on the word acceptability criteria |
| *************************************************************************/ |
| |
| namespace tesseract { |
| void Tesseract::set_done( //set done flag |
| WERD_RES *word, |
| inT16 pass) { |
| /* |
| 0: Original heuristic used in Tesseract and Ray's prototype Resaljet |
| */ |
| if (tessedit_ok_mode == 0) { |
| /* NOTE - done even if word contains some or all spaces !!! */ |
| word->done = word->tess_accepted; |
| } |
| /* |
| 1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts |
| */ |
| else if (tessedit_ok_mode == 1) { |
| word->done = word->tess_accepted && |
| (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); |
| |
| if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) |
| word->done = FALSE; |
| } |
| /* |
| 2: as 1 + only accept dict words or numerics in pass 1 |
| */ |
| else if (tessedit_ok_mode == 2) { |
| word->done = word->tess_accepted && |
| (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); |
| |
| if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) |
| word->done = FALSE; |
| |
| if (word->done && |
| (pass == 1) && |
| (word->best_choice->permuter () != SYSTEM_DAWG_PERM) && |
| (word->best_choice->permuter () != FREQ_DAWG_PERM) && |
| (word->best_choice->permuter () != USER_DAWG_PERM) && |
| (word->best_choice->permuter () != NUMBER_PERM)) { |
| #ifndef SECURE_NAMES |
| if (tessedit_rejection_debug) |
| tprintf ("\nVETO Tess accepting poor word \"%s\"\n", |
| word->best_choice->unichar_string().string ()); |
| #endif |
| word->done = FALSE; |
| } |
| } |
| /* |
| 3: as 2 + only accept dict words or numerics in pass 2 as well |
| */ |
| else if (tessedit_ok_mode == 3) { |
| word->done = word->tess_accepted && |
| (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); |
| |
| if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) |
| word->done = FALSE; |
| |
| if (word->done && |
| (word->best_choice->permuter () != SYSTEM_DAWG_PERM) && |
| (word->best_choice->permuter () != FREQ_DAWG_PERM) && |
| (word->best_choice->permuter () != USER_DAWG_PERM) && |
| (word->best_choice->permuter () != NUMBER_PERM)) { |
| #ifndef SECURE_NAMES |
| if (tessedit_rejection_debug) |
| tprintf ("\nVETO Tess accepting poor word \"%s\"\n", |
| word->best_choice->unichar_string().string ()); |
| #endif |
| word->done = FALSE; |
| } |
| } |
| /* |
| 4: as 2 + reject dict ambigs in pass 1 |
| */ |
| else if (tessedit_ok_mode == 4) { |
| word->done = word->tess_accepted && |
| (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); |
| |
| if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) |
| word->done = FALSE; |
| |
| if (word->done && |
| (pass == 1) && |
| (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) && |
| (word->best_choice->permuter () != FREQ_DAWG_PERM) && |
| (word->best_choice->permuter () != USER_DAWG_PERM) && |
| (word->best_choice->permuter () != NUMBER_PERM)) || |
| (test_ambig_word (word)))) { |
| #ifndef SECURE_NAMES |
| if (tessedit_rejection_debug) |
| tprintf ("\nVETO Tess accepting poor word \"%s\"\n", |
| word->best_choice->unichar_string().string ()); |
| #endif |
| word->done = FALSE; |
| } |
| } |
| /* |
| 5: as 3 + reject dict ambigs in both passes |
| */ |
| else if (tessedit_ok_mode == 5) { |
| word->done = word->tess_accepted && |
| (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); |
| |
| if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) |
| word->done = FALSE; |
| |
| if (word->done && |
| (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) && |
| (word->best_choice->permuter () != FREQ_DAWG_PERM) && |
| (word->best_choice->permuter () != USER_DAWG_PERM) && |
| (word->best_choice->permuter () != NUMBER_PERM)) || |
| (test_ambig_word (word)))) { |
| #ifndef SECURE_NAMES |
| if (tessedit_rejection_debug) |
| tprintf ("\nVETO Tess accepting poor word \"%s\"\n", |
| word->best_choice->unichar_string().string ()); |
| #endif |
| word->done = FALSE; |
| } |
| } |
| |
| else { |
| tprintf ("BAD tessedit_ok_mode\n"); |
| err_exit(); |
| } |
| } |
| |
| |
| /************************************************************************* |
| * make_reject_map() |
| * |
| * Sets the done flag to indicate whether the result is acceptable. |
| * |
| * Sets a reject map for the word. |
| *************************************************************************/ |
| void Tesseract::make_reject_map( //make rej map for wd //detailed results |
| WERD_RES *word, |
| BLOB_CHOICE_LIST_CLIST *blob_choices, |
| ROW *row, |
| inT16 pass //1st or 2nd? |
| ) { |
| int i; |
| int offset; |
| |
| flip_0O(word); |
| check_debug_pt (word, -1); //For trap only |
| set_done(word, pass); //Set acceptance |
| word->reject_map.initialise (word->best_choice->unichar_lengths().length ()); |
| reject_blanks(word); |
| /* |
| 0: Rays original heuristic - the baseline |
| */ |
| if (tessedit_reject_mode == 0) { |
| if (!word->done) |
| reject_poor_matches(word, blob_choices); |
| } |
| /* |
| 5: Reject I/1/l from words where there is no strong contextual confirmation; |
| the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); |
| and the whole of any words which are very small |
| */ |
| else if (tessedit_reject_mode == 5) { |
| if (bln_x_height / word->denorm.scale () <= min_sane_x_ht_pixels) |
| word->reject_map.rej_word_small_xht (); |
| else { |
| one_ell_conflict(word, TRUE); |
| /* |
| Originally the code here just used the done flag. Now I have duplicated |
| and unpacked the conditions for setting the done flag so that each |
| mechanism can be turned on or off independently. This works WITHOUT |
| affecting the done flag setting. |
| */ |
| if (rej_use_tess_accepted && !word->tess_accepted) |
| word->reject_map.rej_word_not_tess_accepted (); |
| |
| if (rej_use_tess_blanks && |
| (strchr (word->best_choice->unichar_string().string (), ' ') != NULL)) |
| word->reject_map.rej_word_contains_blanks (); |
| |
| if (rej_use_good_perm) { |
| if (((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || |
| (word->best_choice->permuter () == FREQ_DAWG_PERM) || |
| (word->best_choice->permuter () == USER_DAWG_PERM)) && |
| (!rej_use_sensible_wd || |
| (acceptable_word_string |
| (word->best_choice->unichar_string().string (), |
| word->best_choice->unichar_lengths().string ()) != |
| AC_UNACCEPTABLE))) { |
| //PASSED TEST |
| } |
| else if (word->best_choice->permuter () == NUMBER_PERM) { |
| if (rej_alphas_in_number_perm) { |
| for (i = 0, offset = 0; |
| word->best_choice->unichar_string()[offset] != '\0'; |
| offset += word->best_choice->unichar_lengths()[i++]) { |
| if (word->reject_map[i].accepted () && |
| unicharset.get_isalpha( |
| word->best_choice->unichar_string().string() + offset, |
| word->best_choice->unichar_lengths()[i])) |
| word->reject_map[i].setrej_bad_permuter (); |
| //rej alpha |
| } |
| } |
| } |
| else { |
| word->reject_map.rej_word_bad_permuter (); |
| } |
| } |
| |
| /* Ambig word rejection was here once !!*/ |
| |
| } |
| } |
| else { |
| tprintf ("BAD tessedit_reject_mode\n"); |
| err_exit(); |
| } |
| |
| if (tessedit_image_border > -1) |
| reject_edge_blobs(word); |
| |
| check_debug_pt (word, 10); |
| if (tessedit_rejection_debug) { |
| tprintf ("Permuter Type = %d\n", word->best_choice->permuter ()); |
| tprintf ("Certainty: %f Rating: %f\n", |
| word->best_choice->certainty (), word->best_choice->rating ()); |
| tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); |
| } |
| |
| /* Un-reject any rejected characters if NN permits */ |
| |
| if (tessedit_use_nn && (pass == 2) && |
| word->reject_map.recoverable_rejects ()) |
| nn_recover_rejects(word, row); |
| flip_hyphens(word); |
| check_debug_pt (word, 20); |
| } |
| } // namespace tesseract |
| |
| |
| void reject_blanks(WERD_RES *word) { |
| inT16 i; |
| inT16 offset; |
| |
| for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; |
| offset += word->best_choice->unichar_lengths()[i], i += 1) { |
| if (word->best_choice->unichar_string()[offset] == ' ') |
| //rej unrecognised blobs |
| word->reject_map[i].setrej_tess_failure (); |
| } |
| } |
| |
| |
| void reject_I_1_L(WERD_RES *word) { |
| inT16 i; |
| inT16 offset; |
| |
| for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; |
| offset += word->best_choice->unichar_lengths()[i], i += 1) { |
| if (STRING (conflict_set_I_l_1). |
| contains (word->best_choice->unichar_string()[offset])) { |
| //rej 1Il conflict |
| word->reject_map[i].setrej_1Il_conflict (); |
| } |
| } |
| } |
| |
| |
| void reject_poor_matches( //detailed results |
| WERD_RES *word, |
| BLOB_CHOICE_LIST_CLIST *blob_choices) { |
| float threshold; |
| inT16 i = 0; |
| inT16 offset = 0; |
| //super iterator |
| BLOB_CHOICE_LIST_C_IT list_it = blob_choices; |
| BLOB_CHOICE_IT choice_it; //real iterator |
| |
| #ifndef SECURE_NAMES |
| if (strlen(word->best_choice->unichar_lengths().string()) != |
| list_it.length()) { |
| tprintf |
| ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n", |
| word->best_choice->unichar_string().string(), |
| strlen (word->best_choice->unichar_lengths().string()), list_it.length(), |
| word->outword->blob_list()->length()); |
| } |
| #endif |
| ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) == |
| list_it.length ()); |
| ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ()); |
| threshold = compute_reject_threshold (blob_choices); |
| |
| for (list_it.mark_cycle_pt (); |
| !list_it.cycled_list (); list_it.forward (), i++, |
| offset += word->best_choice->unichar_lengths()[i]) { |
| /* NB - only compares the threshold against the TOP choice char in the |
| choices list for a blob !! - the selected one may be below the threshold |
| */ |
| choice_it.set_to_list (list_it.data ()); |
| if ((word->best_choice->unichar_string()[offset] == ' ') || |
| (choice_it.length () == 0)) |
| //rej unrecognised blobs |
| word->reject_map[i].setrej_tess_failure (); |
| else if (choice_it.data ()->certainty () < threshold) |
| //rej poor score blob |
| word->reject_map[i].setrej_poor_match (); |
| } |
| } |
| |
| |
| /********************************************************************** |
| * compute_reject_threshold |
| * |
| * Set a rejection threshold for this word. |
| * Initially this is a trivial function which looks for the largest |
| * gap in the certainty value. |
| **********************************************************************/ |
| |
| float compute_reject_threshold( //compute threshold //detailed results |
| BLOB_CHOICE_LIST_CLIST *blob_choices) { |
| inT16 index; //to ratings |
| inT16 blob_count; //no of blobs in word |
| inT16 ok_blob_count = 0; //non TESS rej blobs in word |
| float *ratings; //array of confidences |
| float threshold; //rejection threshold |
| float bestgap; //biggest gap |
| float gapstart; //bottom of gap |
| //super iterator |
| BLOB_CHOICE_LIST_C_IT list_it = blob_choices; |
| BLOB_CHOICE_IT choice_it; //real iterator |
| |
| blob_count = blob_choices->length (); |
| ratings = (float *) alloc_mem (blob_count * sizeof (float)); |
| for (list_it.mark_cycle_pt (), index = 0; |
| !list_it.cycled_list (); list_it.forward (), index++) { |
| choice_it.set_to_list (list_it.data ()); |
| if (choice_it.length () > 0) { |
| ratings[ok_blob_count] = choice_it.data ()->certainty (); |
| //get in an array |
| // tprintf("Rating[%d]=%c %g %g\n", |
| // index,choice_it.data()->char_class(), |
| // choice_it.data()->rating(),choice_it.data()->certainty()); |
| ok_blob_count++; |
| } |
| } |
| ASSERT_HOST (index == blob_count); |
| qsort (ratings, ok_blob_count, sizeof (float), sort_floats); |
| //sort them |
| bestgap = 0; |
| gapstart = ratings[0] - 1; //all reject if none better |
| if (ok_blob_count >= 3) { |
| for (index = 0; index < ok_blob_count - 1; index++) { |
| if (ratings[index + 1] - ratings[index] > bestgap) { |
| bestgap = ratings[index + 1] - ratings[index]; |
| //find biggest |
| gapstart = ratings[index]; |
| } |
| } |
| } |
| threshold = gapstart + bestgap / 2; |
| // tprintf("First=%g, last=%g, gap=%g, threshold=%g\n", |
| // ratings[0],ratings[index],bestgap,threshold); |
| |
| free_mem(ratings); |
| return threshold; |
| } |
| |
| |
| /********************************************************************** |
| * sort_floats |
| * |
| * qsort function to sort 2 floats. |
| **********************************************************************/ |
| |
// qsort() comparison function for an array of float, ascending order.
//
// arg1, arg2: pointers to the two floats to compare.
// Returns 1 if *arg1 > *arg2, -1 if *arg1 < *arg2, otherwise 0.
//
// The values are compared directly instead of via their difference, and the
// casts keep the const qualifier that qsort() puts on its arguments.
int sort_floats(const void *arg1, const void *arg2) {
  const float lhs = *static_cast<const float *>(arg1);
  const float rhs = *static_cast<const float *>(arg2);

  if (lhs > rhs)
    return 1;
  if (lhs < rhs)
    return -1;
  return 0;
}
| |
| |
| /************************************************************************* |
| * reject_edge_blobs() |
| * |
| * If the word is perilously close to the edge of the image, reject those blobs |
| * in the word which are too close to the edge as they could be clipped. |
| *************************************************************************/ |
| |
| void reject_edge_blobs(WERD_RES *word) { |
| TBOX word_box = word->word->bounding_box (); |
| TBOX blob_box; |
| PBLOB_IT blob_it = word->outword->blob_list (); |
| //blobs |
| int blobindex = 0; |
| float centre; |
| |
| if ((word_box.left () < tessedit_image_border) || |
| (word_box.bottom () < tessedit_image_border) || |
| (word_box.right () + tessedit_image_border > |
| page_image.get_xsize () - 1) || |
| (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) { |
| ASSERT_HOST (word->reject_map.length () == blob_it.length ()); |
| for (blobindex = 0, blob_it.mark_cycle_pt (); |
| !blob_it.cycled_list (); blobindex++, blob_it.forward ()) { |
| blob_box = blob_it.data ()->bounding_box (); |
| centre = (blob_box.left () + blob_box.right ()) / 2.0; |
| if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) || |
| (word->denorm.y (blob_box.bottom (), centre) < |
| tessedit_image_border) || |
| (word->denorm.x (blob_box.right ()) + tessedit_image_border > |
| page_image.get_xsize () - 1) || |
| (word->denorm.y (blob_box.top (), centre) |
| + tessedit_image_border > page_image.get_ysize () - 1)) { |
| word->reject_map[blobindex].setrej_edge_char (); |
| //close to edge |
| } |
| } |
| } |
| } |
| |
| |
| /********************************************************************** |
| * one_ell_conflict() |
| * |
| * Identify words where there is a potential I/l/1 error. |
| * - A bundle of contextual heuristics! |
| **********************************************************************/ |
| namespace tesseract { |
| BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { |
| const char *word; |
| const char *lengths; |
| inT16 word_len; //its length |
| inT16 first_alphanum_index_; |
| inT16 first_alphanum_offset_; |
| inT16 i; |
| inT16 offset; |
| BOOL8 non_conflict_set_char; //non conf set a/n? |
| BOOL8 conflict = FALSE; |
| BOOL8 allow_1s; |
| ACCEPTABLE_WERD_TYPE word_type; |
| BOOL8 dict_perm_type; |
| BOOL8 dict_word_ok; |
| int dict_word_type; |
| |
| word = word_res->best_choice->unichar_string().string (); |
| lengths = word_res->best_choice->unichar_lengths().string(); |
| word_len = strlen (lengths); |
| /* |
| If there are no occurrences of the conflict set characters then the word |
| is OK. |
| */ |
| if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL) |
| return FALSE; |
| |
| /* |
| There is a conflict if there are NO other (confirmed) alphanumerics apart |
| from those in the conflict set. |
| */ |
| |
| for (i = 0, offset = 0, non_conflict_set_char = FALSE; |
| (i < word_len) && !non_conflict_set_char; offset += lengths[i++]) |
| non_conflict_set_char = |
| (unicharset.get_isalpha(word + offset, lengths[i]) || |
| unicharset.get_isdigit(word + offset, lengths[i])) && |
| !STRING (conflict_set_I_l_1).contains (word[offset]); |
| if (!non_conflict_set_char) { |
| if (update_map) |
| reject_I_1_L(word_res); |
| return TRUE; |
| } |
| |
| /* |
| If the word is accepted by a dawg permuter, and the first alpha character |
| is "I" or "l", check to see if the alternative is also a dawg word. If it |
| is, then there is a potential error otherwise the word is ok. |
| */ |
| |
| dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) || |
| (word_res->best_choice->permuter () == USER_DAWG_PERM) || |
| (rej_trust_doc_dawg && |
| (word_res->best_choice->permuter () == DOC_DAWG_PERM)) || |
| (word_res->best_choice->permuter () == FREQ_DAWG_PERM); |
| dict_word_type = dict_word(*(word_res->best_choice)); |
| dict_word_ok = (dict_word_type > 0) && |
| (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM)); |
| |
| if ((rej_1Il_use_dict_word && dict_word_ok) || |
| (rej_1Il_trust_permuter_type && dict_perm_type) || |
| (dict_perm_type && dict_word_ok)) { |
| first_alphanum_index_ = first_alphanum_index (word, lengths); |
| first_alphanum_offset_ = first_alphanum_offset (word, lengths); |
| if (lengths[first_alphanum_index_] == 1 && |
| word[first_alphanum_offset_] == 'I') { |
| word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; |
| if (safe_dict_word(*(word_res->best_choice)) > 0) { |
| word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; |
| if (update_map) |
| word_res->reject_map[first_alphanum_index_]. |
| setrej_1Il_conflict(); |
| return TRUE; |
| } |
| else { |
| word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; |
| return FALSE; |
| } |
| } |
| |
| if (lengths[first_alphanum_index_] == 1 && |
| word[first_alphanum_offset_] == 'l') { |
| word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; |
| if (safe_dict_word(*(word_res->best_choice)) > 0) { |
| word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; |
| if (update_map) |
| word_res->reject_map[first_alphanum_index_]. |
| setrej_1Il_conflict(); |
| return TRUE; |
| } |
| else { |
| word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; |
| return FALSE; |
| } |
| } |
| return FALSE; |
| } |
| |
| /* |
| NEW 1Il code. The old code relied on permuter types too much. In fact, |
| tess will use TOP_CHOICE permute for good things like "palette". |
| In this code the string is examined independently to see if it looks like |
| a well formed word. |
| */ |
| |
| /* |
| REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a |
| dictionary word. |
| */ |
| first_alphanum_index_ = first_alphanum_index (word, lengths); |
| first_alphanum_offset_ = first_alphanum_offset (word, lengths); |
| if (lengths[first_alphanum_index_] == 1 && |
| word[first_alphanum_offset_] == 'l') { |
| word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; |
| if (safe_dict_word(*(word_res->best_choice)) > 0) |
| return FALSE; |
| else |
| word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; |
| } |
| else if (lengths[first_alphanum_index_] == 1 && |
| word[first_alphanum_offset_] == 'I') { |
| word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; |
| if (safe_dict_word(*(word_res->best_choice)) > 0) |
| return FALSE; |
| else |
| word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; |
| } |
| /* |
| For strings containing digits: |
| If there are no alphas OR the numeric permuter liked the word, |
| reject any non 1 conflict chs |
| Else reject all conflict chs |
| */ |
| if (word_contains_non_1_digit (word, lengths)) { |
| allow_1s = (alpha_count (word, lengths) == 0) || |
| (word_res->best_choice->permuter () == NUMBER_PERM); |
| |
| inT16 offset; |
| conflict = FALSE; |
| for (i = 0, offset = 0; word[offset] != '\0'; |
| offset += word_res->best_choice->unichar_lengths()[i++]) { |
| if ((!allow_1s || (word[offset] != '1')) && |
| STRING (conflict_set_I_l_1).contains (word[offset])) { |
| if (update_map) |
| word_res->reject_map[i].setrej_1Il_conflict (); |
| conflict = TRUE; |
| } |
| } |
| return conflict; |
| } |
| /* |
| For anything else. See if it conforms to an acceptable word type. If so, |
| treat accordingly. |
| */ |
| word_type = acceptable_word_string (word, lengths); |
| if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) { |
| first_alphanum_index_ = first_alphanum_index (word, lengths); |
| first_alphanum_offset_ = first_alphanum_offset (word, lengths); |
| if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) { |
| if (update_map) |
| word_res->reject_map[first_alphanum_index_]. |
| setrej_1Il_conflict (); |
| return TRUE; |
| } |
| else |
| return FALSE; |
| } |
| else if (word_type == AC_UPPER_CASE) { |
| return FALSE; |
| } |
| else { |
| if (update_map) |
| reject_I_1_L(word_res); |
| return TRUE; |
| } |
| } |
| |
| |
| inT16 Tesseract::first_alphanum_index(const char *word, |
| const char *word_lengths) { |
| inT16 i; |
| inT16 offset; |
| |
| for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { |
| if (unicharset.get_isalpha(word + offset, word_lengths[i]) || |
| unicharset.get_isdigit(word + offset, word_lengths[i])) |
| return i; |
| } |
| return -1; |
| } |
| |
| inT16 Tesseract::first_alphanum_offset(const char *word, |
| const char *word_lengths) { |
| inT16 i; |
| inT16 offset; |
| |
| for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { |
| if (unicharset.get_isalpha(word + offset, word_lengths[i]) || |
| unicharset.get_isdigit(word + offset, word_lengths[i])) |
| return offset; |
| } |
| return -1; |
| } |
| |
| inT16 Tesseract::alpha_count(const char *word, |
| const char *word_lengths) { |
| inT16 i; |
| inT16 offset; |
| inT16 count = 0; |
| |
| for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { |
| if (unicharset.get_isalpha (word + offset, word_lengths[i])) |
| count++; |
| } |
| return count; |
| } |
| |
| |
| BOOL8 Tesseract::word_contains_non_1_digit(const char *word, |
| const char *word_lengths) { |
| inT16 i; |
| inT16 offset; |
| |
| for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { |
| if (unicharset.get_isdigit (word + offset, word_lengths[i]) && |
| (word_lengths[i] != 1 || word[offset] != '1')) |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| |
| BOOL8 Tesseract::test_ambig_word( //test for ambiguity |
| WERD_RES *word) { |
| BOOL8 ambig = FALSE; |
| |
| if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || |
| (word->best_choice->permuter () == FREQ_DAWG_PERM) || |
| (word->best_choice->permuter () == USER_DAWG_PERM)) { |
| ambig = !getDict().NoDangerousAmbig( |
| word->best_choice, NULL, false, NULL, NULL); |
| } |
| return ambig; |
| } |
| |
| /************************************************************************* |
| * char_ambiguities() |
| * |
| * Return a pointer to a string containing the full conflict set of characters |
| * which includes the specified character, if there is one. If the specified |
| * character is not a member of a conflict set, return NULL. |
| * (NOTE that a character is assumed to be a member of only ONE conflict set.) |
| *************************************************************************/ |
| const char *Tesseract::char_ambiguities(char c) { |
| static STRING_CLIST conflict_sets; |
| static BOOL8 read_conflict_sets = FALSE; |
| STRING_C_IT cs_it(&conflict_sets); |
| const char *cs; |
| STRING cs_file_name; |
| FILE *cs_file; |
| char buff[1024]; |
| |
| if (!read_conflict_sets) { |
| cs_file_name = datadir + "confsets"; |
| if (!(cs_file = fopen (cs_file_name.string (), "r"))) { |
| CANTOPENFILE.error ("char_ambiguities", EXIT, "%s %d", |
| cs_file_name.string (), errno); |
| } |
| while (fscanf (cs_file, "%s", buff) == 1) { |
| cs_it.add_after_then_move (new STRING (buff)); |
| } |
| read_conflict_sets = TRUE; |
| cs_it.move_to_first (); |
| if (tessedit_rejection_debug) { |
| for (cs_it.mark_cycle_pt (); |
| !cs_it.cycled_list (); cs_it.forward ()) { |
| tprintf ("\"%s\"\n", cs_it.data ()->string ()); |
| } |
| } |
| } |
| |
| cs_it.move_to_first (); |
| for (cs_it.mark_cycle_pt (); !cs_it.cycled_list (); cs_it.forward ()) { |
| cs = cs_it.data ()->string (); |
| if (strchr (cs, c) != NULL) |
| return cs; |
| } |
| return NULL; |
| } |
| |
| /************************************************************************* |
| * nn_recover_rejects() |
| * Generate the nn_reject_map - a copy of the current reject map, but dont |
| * reject previously rejected chars if the NN matcher agrees with the best |
| * choice. |
| *************************************************************************/ |
| |
| void Tesseract::nn_recover_rejects(WERD_RES *word, ROW *row) { |
| //copy for debug |
| REJMAP old_map = word->reject_map; |
| /* |
| NOTE THAT THIS IS RELATIVELY INEFFICIENT AS THE WHOLE OF THE WERD IS |
| MATCHED BY THE NN MATCHER. IF COULD EASILY BE RESTRICTED TO JUST THE |
| REJECT CHARACTERS (Though initial use is when words are total rejects |
| anyway). |
| */ |
| |
| set_global_subsubloc_code(SUBSUBLOC_NN); |
| nn_match_word(word, row); |
| |
| if (no_unrej_1Il) |
| dont_allow_1Il(word); |
| if (no_unrej_dubious_chars) |
| dont_allow_dubious_chars(word); |
| |
| if (rej_mostly_reject_mode == 1) |
| reject_mostly_rejects(word); |
| /* |
| IF there are no unrejected alphanumerics AND |
| The word is not an acceptable single non alphanum char word AND |
| The word is not an acceptable repeated non alphanum char word |
| THEN Reject whole word |
| */ |
| if (no_unrej_no_alphanum_wds && |
| (count_alphanums (word) < 1) && |
| !((word->best_choice->unichar_lengths().length () == 1) && |
| STRING(ok_single_ch_non_alphanum_wds).contains( |
| word->best_choice->unichar_string()[0])) |
| && !repeated_nonalphanum_wd (word, row)) |
| |
| word->reject_map.rej_word_no_alphanums (); |
| |
| #ifndef SECURE_NAMES |
| |
| if (nn_debug) { |
| tprintf ("\nTess: \"%s\" MAP ", |
| word->best_choice->unichar_string().string()); |
| old_map.print (stdout); |
| tprintf ("->"); |
| word->reject_map.print (stdout); |
| tprintf ("\n"); |
| } |
| #endif |
| set_global_subsubloc_code(SUBSUBLOC_OTHER); |
| } |
| |
| void Tesseract::nn_match_word( //Match a word |
| WERD_RES *word, |
| ROW *row) { |
| PIXROW_LIST *pixrow_list; |
| PIXROW_IT pixrow_it; |
| IMAGELINE *imlines; //lines of the image |
| TBOX pix_box; //box of imlines extent |
| #ifndef GRAPHICS_DISABLED |
| ScrollView* win = NULL; |
| #endif |
| IMAGE clip_image; |
| IMAGE scaled_image; |
| float baseline_pos; |
| inT16 net_image_size; |
| inT16 clip_image_size; |
| WERD copy_outword; // copy to denorm |
| inT16 i; |
| |
| const char *word_string; |
| const char *word_string_lengths; |
| BOOL8 word_in_dict; //Tess wd in dict |
| BOOL8 checked_dict_word; //Tess wd definitely in dict |
| BOOL8 sensible_word; //OK char string |
| BOOL8 centre; //Not at word end chs |
| BOOL8 good_quality_word; |
| inT16 char_quality; |
| inT16 accepted_char_quality; |
| |
| inT16 conf_level; //0:REJECT |
| //1:DODGY ACCEPT |
| //2:DICT ACCEPT |
| //3:CLEAR ACCEPT |
| inT16 first_alphanum_index_; |
| inT16 first_alphanum_offset_; |
| |
| word_string = word->best_choice->unichar_string().string(); |
| word_string_lengths = word->best_choice->unichar_lengths().string(); |
| first_alphanum_index_ = first_alphanum_index (word_string, |
| word_string_lengths); |
| first_alphanum_offset_ = first_alphanum_offset (word_string, |
| word_string_lengths); |
| word_in_dict = ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || |
| (word->best_choice->permuter () == FREQ_DAWG_PERM) || |
| (word->best_choice->permuter () == USER_DAWG_PERM)); |
| checked_dict_word = word_in_dict && |
| (safe_dict_word(*(word->best_choice)) > 0); |
| sensible_word = acceptable_word_string (word_string, word_string_lengths) != |
| AC_UNACCEPTABLE; |
| |
| word_char_quality(word, row, &char_quality, &accepted_char_quality); |
| good_quality_word = |
| word->best_choice->unichar_lengths().length () == char_quality; |
| |
| #ifndef SECURE_NAMES |
| if (nn_reject_debug) { |
| tprintf ("Dict: %c Checked Dict: %c Sensible: %c Quality: %c\n", |
| word_in_dict ? 'T' : 'F', |
| checked_dict_word ? 'T' : 'F', |
| sensible_word ? 'T' : 'F', good_quality_word ? 'T' : 'F'); |
| } |
| #endif |
| |
| if (word->best_choice->unichar_lengths().length () != |
| word->outword->blob_list ()->length ()) { |
| #ifndef SECURE_NAMES |
| tprintf ("nn_match_word ASSERT FAIL String:\"%s\"; #Blobs=%d\n", |
| word->best_choice->unichar_string().string (), |
| word->outword->blob_list ()->length ()); |
| #endif |
| err_exit(); |
| } |
| |
| copy_outword = *(word->outword); |
| copy_outword.baseline_denormalise (&word->denorm); |
| /* |
| For each character, generate and match a new image, containing JUST the |
| character we have clipped, centered in the image, on a white background. |
| Note that we MUST have a square image so that we can scale it uniformly in |
| x and y. We base the size on x_height as this can be found fairly reliably. |
| */ |
| net_image_size = (net_image_width > net_image_height) ? |
| net_image_width : net_image_height; |
| clip_image_size = (inT16) floor (0.5 + |
| net_image_size * word->x_height / |
| net_image_x_height); |
| if ((clip_image_size <= 1) || (net_image_size <= 1)) { |
| return; |
| } |
| |
| /* |
| Get the image of the word and the pix positions of each char |
| */ |
| char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box); |
| #ifndef GRAPHICS_DISABLED |
| if (show_char_clipping) { |
| win = display_clip_image (©_outword, page_image, |
| pixrow_list, pix_box); |
| } |
| #endif |
| pixrow_it.set_to_list (pixrow_list); |
| pixrow_it.move_to_first (); |
| for (pixrow_it.mark_cycle_pt (), i = 0; |
| !pixrow_it.cycled_list (); pixrow_it.forward (), i++) { |
| if (pixrow_it.data ()-> |
| bad_box (page_image.get_xsize (), page_image.get_ysize ())) |
| continue; |
| clip_image.create (clip_image_size, clip_image_size, 1); |
| //make bin imge |
| if (!copy_outword.flag (W_INVERSE)) |
| invert_image(&clip_image); //white background for black on white |
| pixrow_it.data ()->char_clip_image (imlines, pix_box, row, |
| clip_image, baseline_pos); |
| if (copy_outword.flag (W_INVERSE)) |
| invert_image(&clip_image); //invert white on black for scaling &NN |
| scaled_image.create (net_image_size, net_image_size, 1); |
| scale_image(clip_image, scaled_image); |
| baseline_pos *= net_image_size / clip_image_size; |
| //scale with im |
| centre = !pixrow_it.at_first () && !pixrow_it.at_last (); |
| |
| conf_level = nn_match_char (scaled_image, baseline_pos, |
| word_in_dict, checked_dict_word, |
| sensible_word, centre, |
| good_quality_word, word_string[i]); |
| if (word->reject_map[i].recoverable ()) { |
| if ((i == first_alphanum_index_) && |
| word_string_lengths[first_alphanum_index_] == 1 && |
| ((word_string[first_alphanum_offset_] == 'I') || |
| (word_string[first_alphanum_offset_] == 'i'))) { |
| if (conf_level >= nn_conf_initial_i_level) |
| word->reject_map[i].setrej_nn_accept (); |
| //un-reject char |
| } |
| else if (conf_level > 0) |
| //un-reject char |
| word->reject_map[i].setrej_nn_accept (); |
| } |
| #ifndef GRAPHICS_DISABLED |
| if (show_char_clipping) |
| display_images(clip_image, scaled_image); |
| #endif |
| clip_image.destroy(); |
| scaled_image.destroy(); |
| } |
| |
| delete[]imlines; // Free array of imlines |
| delete pixrow_list; |
| |
| #ifndef GRAPHICS_DISABLED |
| if (show_char_clipping) { |
| // destroy_window(win); |
| // win->Destroy(); |
| delete win; |
| } |
| #endif |
| } |
| } // namespace tesseract |
| |
| |
| /************************************************************************* |
| * nn_match_char() |
| * Call Neural Net matcher to match a single character, given a scaled, |
| * square image |
| *************************************************************************/ |
| |
| inT16 nn_match_char( //of character |
| IMAGE &scaled_image, |
| float baseline_pos, //rel to scaled_image |
| BOOL8 dict_word, //part of dict wd? |
| BOOL8 checked_dict_word, //part of dict wd? |
| BOOL8 sensible_word, //part acceptable str? |
| BOOL8 centre, //not at word ends? |
| BOOL8 good_quality_word, //initial segmentation |
| char tess_ch //confirm this? |
| ) { |
| inT16 conf_level; //0..2 |
| inT32 row; |
| inT32 col; |
| inT32 y_size = scaled_image.get_ysize (); |
| inT32 start_y = y_size - (y_size - net_image_height) / 2 - 1; |
| inT32 end_y = start_y - net_image_height + 1; |
| IMAGELINE imline; |
| float *input_vector; |
| float *input_vec_ptr; |
| char top; |
| float top_score; |
| char next; |
| float next_score; |
| inT16 input_nodes = (net_image_height * net_image_width) + net_bl_nodes; |
| inT16 j; |
| |
| input_vector = (float *) alloc_mem (input_nodes * sizeof (float)); |
| input_vec_ptr = input_vector; |
| |
| invert_image(&scaled_image); //cos nns work better |
| for (row = start_y; row >= end_y; row--) { |
| scaled_image.fast_get_line (0, row, net_image_width, &imline); |
| for (col = 0; col < net_image_width; col++) |
| *input_vec_ptr++ = imline.pixels[col]; |
| } |
| /* |
| The bit map presented to the net may be shorter than the image, so shift |
| the coord to be relative to the bitmap portion. |
| */ |
| baseline_pos -= (y_size - net_image_height) / 2.0; |
| /* |
| Baseline pos is 0 if below bitmap, 1 if above and in proportion otherwise. |
| This is represented to the net as a set of bl_nodes, an initial proportion |
| of which are set to 1.0, indicating the level of the baseline. The |
| remainder are 0.0 |
| */ |
| |
| if (baseline_pos < 0) |
| baseline_pos = 0; |
| else if (baseline_pos >= net_image_height) |
| baseline_pos = net_image_height + 1; |
| else |
| baseline_pos = baseline_pos + 1; |
| baseline_pos = baseline_pos / (net_image_height + 1); |
| |
| if (net_bl_nodes > 0) { |
| baseline_pos *= 1.7; //Use a wider range |
| if (net_bl_nodes > 1) { |
| /* Multi-node baseline representation */ |
| for (j = 0; j < net_bl_nodes; j++) { |
| if (baseline_pos > ((float) j / net_bl_nodes)) |
| *input_vec_ptr++ = 1.0; |
| else |
| *input_vec_ptr++ = 0.0; |
| } |
| } |
| else { |
| /* Single node baseline */ |
| *input_vec_ptr++ = baseline_pos; |
| } |
| } |
| |
| callnet(input_vector, &top, &top_score, &next, &next_score); |
| conf_level = evaluate_net_match (top, top_score, next, next_score, |
| tess_ch, dict_word, checked_dict_word, |
| sensible_word, centre, good_quality_word); |
| #ifndef SECURE_NAMES |
| if (nn_reject_debug) { |
| tprintf ("top:\"%c\" %4.2f next:\"%c\" %4.2f TESS:\"%c\" Conf: %d\n", |
| top, top_score, next, next_score, tess_ch, conf_level); |
| } |
| #endif |
| free_mem(input_vector); |
| return conf_level; |
| } |
| |
| |
| inT16 evaluate_net_match(char top, |
| float top_score, |
| char next, |
| float next_score, |
| char tess_ch, |
| BOOL8 dict_word, |
| BOOL8 checked_dict_word, |
| BOOL8 sensible_word, |
| BOOL8 centre, |
| BOOL8 good_quality_word) { |
| inT16 accept_level; //0 Very clearly matched |
| //1 Clearly top |
| //2 Top but poor match |
| //3 Next & poor top match |
| //4 Next but good top match |
| //5 No chance |
| BOOL8 good_top_choice; |
| BOOL8 excellent_top_choice; |
| BOOL8 confusion_match = FALSE; |
| BOOL8 dodgy_char = !isalnum (tess_ch); |
| |
| good_top_choice = (top_score > nn_reject_threshold) && |
| (nn_reject_head_and_shoulders * top_score > next_score); |
| |
| excellent_top_choice = good_top_choice && |
| (top_score > nn_dodgy_char_threshold); |
| |
| if (top == tess_ch) { |
| if (excellent_top_choice) |
| accept_level = 0; |
| else if (good_top_choice) |
| accept_level = 1; //Top correct and well matched |
| else |
| accept_level = 2; //Top correct but poor match |
| } |
| else if ((nn_conf_1Il && |
| STRING (conflict_set_I_l_1).contains (tess_ch) && |
| STRING (conflict_set_I_l_1).contains (top)) || |
| (nn_conf_hyphen && |
| STRING (conflict_set_hyphen).contains (tess_ch) && |
| STRING (conflict_set_hyphen).contains (top)) || |
| (nn_conf_Ss && |
| STRING (conflict_set_S_s).contains (tess_ch) && |
| STRING (conflict_set_S_s).contains (top))) { |
| confusion_match = TRUE; |
| if (good_top_choice) |
| accept_level = 1; //Good top confusion |
| else |
| accept_level = 2; //Poor top confusion |
| } |
| else if ((nn_conf_1Il && |
| STRING (conflict_set_I_l_1).contains (tess_ch) && |
| STRING (conflict_set_I_l_1).contains (next)) || |
| (nn_conf_hyphen && |
| STRING (conflict_set_hyphen).contains (tess_ch) && |
| STRING (conflict_set_hyphen).contains (next)) || |
| (nn_conf_Ss && |
| STRING (conflict_set_S_s).contains (tess_ch) && |
| STRING (conflict_set_S_s).contains (next))) { |
| confusion_match = TRUE; |
| if (!good_top_choice) |
| accept_level = 3; //Next confusion and top match dodgy |
| else |
| accept_level = 4; //Next confusion and good top match |
| } |
| else if (next == tess_ch) { |
| if (!good_top_choice) |
| accept_level = 3; //Next match and top match dodgy |
| else |
| accept_level = 4; //Next match and good top match |
| } |
| else |
| accept_level = 5; |
| |
| /* Could allow some match flexibility here sS$ etc */ |
| |
| /* Now set confirmation level according to how much we can believe the tess |
| char. */ |
| |
| if ((accept_level == 0) && !confusion_match) |
| return 3; |
| |
| if ((accept_level <= 1) && |
| (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match) |
| return 3; |
| |
| if ((accept_level == 2) && |
| !confusion_match && !dodgy_char && |
| good_quality_word && |
| dict_word && |
| (checked_dict_word || !nn_double_check_dict) && sensible_word) |
| return 2; |
| |
| if (confusion_match && |
| (accept_level <= nn_conf_accept_level) && |
| (good_quality_word || |
| (!nn_conf_test_good_qual && |
| !STRING (conflict_set_I_l_1).contains (tess_ch))) && |
| (dict_word || !nn_conf_test_dict) && |
| (checked_dict_word || !nn_conf_double_check_dict) && |
| (sensible_word || !nn_conf_test_sensible)) |
| return 1; |
| |
| if (!confusion_match && |
| nn_lax && |
| (accept_level == 3) && |
| (good_quality_word || !nn_conf_test_good_qual) && |
| (dict_word || !nn_conf_test_dict) && |
| (sensible_word || !nn_conf_test_sensible)) |
| return 1; |
| else |
| return 0; |
| } |
| |
| |
| /************************************************************************* |
| * dont_allow_dubious_chars() |
| * Let Rejects "eat" into adjacent "dubious" chars. I.e those prone to be wrong |
| * if adjacent to a reject. |
| *************************************************************************/ |
| void dont_allow_dubious_chars(WERD_RES *word) { |
| int i = 0; |
| int offset = 0; |
| int rej_pos; |
| int word_len = word->reject_map.length (); |
| |
| while (i < word_len) { |
| /* Find next reject */ |
| |
| while ((i < word_len) && (word->reject_map[i].accepted ())) |
| { |
| offset += word->best_choice->unichar_lengths()[i]; |
| i++; |
| } |
| |
| if (i < word_len) { |
| rej_pos = i; |
| |
| /* Reject dubious chars to the left */ |
| i--; |
| offset -= word->best_choice->unichar_lengths()[i]; |
| while ((i >= 0) && |
| STRING(dubious_chars_left_of_reject).contains( |
| word->best_choice->unichar_string()[offset])) { |
| word->reject_map[i--].setrej_dubious (); |
| offset -= word->best_choice->unichar_lengths()[i]; |
| } |
| |
| /* Skip adjacent rejects */ |
| |
| for (i = rej_pos; |
| (i < word_len) && (word->reject_map[i].rejected ()); |
| offset += word->best_choice->unichar_lengths()[i++]); |
| |
| /* Reject dubious chars to the right */ |
| |
| while ((i < word_len) && |
| STRING(dubious_chars_right_of_reject).contains( |
| word->best_choice->unichar_string()[offset])) { |
| offset += word->best_choice->unichar_lengths()[i]; |
| word->reject_map[i++].setrej_dubious (); |
| } |
| } |
| } |
| } |
| |
| |
| /************************************************************************* |
| * dont_allow_1Il() |
| * Dont unreject LONE accepted 1Il conflict set chars |
| *************************************************************************/ |
| namespace tesseract { |
| void Tesseract::dont_allow_1Il(WERD_RES *word) { |
| int i = 0; |
| int offset; |
| int word_len = word->reject_map.length (); |
| const char *s = word->best_choice->unichar_string().string (); |
| const char *lengths = word->best_choice->unichar_lengths().string (); |
| BOOL8 accepted_1Il = FALSE; |
| |
| for (i = 0, offset = 0; i < word_len; |
| offset += word->best_choice->unichar_lengths()[i++]) { |
| if (word->reject_map[i].accepted ()) { |
| if (STRING (conflict_set_I_l_1).contains (s[offset])) |
| accepted_1Il = TRUE; |
| else { |
| if (unicharset.get_isalpha (s + offset, lengths[i]) || |
| unicharset.get_isdigit (s + offset, lengths[i])) |
| return; // >=1 non 1Il ch accepted |
| } |
| } |
| } |
| if (!accepted_1Il) |
| return; //Nothing to worry about |
| |
| for (i = 0, offset = 0; i < word_len; |
| offset += word->best_choice->unichar_lengths()[i++]) { |
| if (STRING (conflict_set_I_l_1).contains (s[offset]) && |
| word->reject_map[i].accepted ()) |
| word->reject_map[i].setrej_postNN_1Il (); |
| } |
| } |
| |
| |
| inT16 Tesseract::count_alphanums( //how many alphanums |
| WERD_RES *word_res) { |
| int count = 0; |
| const WERD_CHOICE *best_choice = word_res->best_choice; |
| for (int i = 0; i < word_res->reject_map.length(); ++i) { |
| if ((word_res->reject_map[i].accepted()) && |
| (unicharset.get_isalpha(best_choice->unichar_id(i)) || |
| unicharset.get_isdigit(best_choice->unichar_id(i)))) { |
| count++; |
| } |
| } |
| return count; |
| } |
| } // namespace tesseract |
| |
| |
| void reject_mostly_rejects( //rej all if most rejectd |
| WERD_RES *word) { |
| /* Reject the whole of the word if the fraction of rejects exceeds a limit */ |
| |
| if ((float) word->reject_map.reject_count () / word->reject_map.length () >= |
| rej_whole_of_mostly_reject_word_fract) |
| word->reject_map.rej_word_mostly_rej (); |
| } |
| |
| |
| namespace tesseract { |
| BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) { |
| inT16 char_quality; |
| inT16 accepted_char_quality; |
| |
| if (word->best_choice->unichar_lengths().length () <= 1) |
| return FALSE; |
| |
| if (!STRING (ok_repeated_ch_non_alphanum_wds). |
| contains (word->best_choice->unichar_string()[0])) |
| return FALSE; |
| |
| if (!repeated_ch_string (word->best_choice->unichar_string().string (), |
| word->best_choice->unichar_lengths().string ())) |
| return FALSE; |
| |
| word_char_quality(word, row, &char_quality, &accepted_char_quality); |
| |
| if ((word->best_choice->unichar_lengths().length () == char_quality) && |
| (char_quality == accepted_char_quality)) |
| return TRUE; |
| else |
| return FALSE; |
| } |
| |
| BOOL8 Tesseract::repeated_ch_string(const char *rep_ch_str, |
| const char *lengths) { |
| UNICHAR_ID c; |
| |
| if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) { |
| return FALSE; |
| } |
| |
| c = unicharset.unichar_to_id(rep_ch_str, *lengths); |
| rep_ch_str += *(lengths++); |
| while (*rep_ch_str != '\0' && |
| unicharset.unichar_to_id(rep_ch_str, *lengths) == c) { |
| rep_ch_str++; |
| } |
| if (*rep_ch_str == '\0') |
| return TRUE; |
| return FALSE; |
| } |
| |
| |
| inT16 Tesseract::safe_dict_word(const WERD_CHOICE &word) { |
| int dict_word_type = dict_word(word); |
| return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type; |
| } |
| |
| |
| void Tesseract::flip_hyphens(WERD_RES *word_res) { |
| WERD_CHOICE *best_choice = word_res->best_choice; |
| int i; |
| PBLOB_IT outword_it; |
| int prev_right = -9999; |
| int next_left; |
| TBOX out_box; |
| float aspect_ratio; |
| |
| if (tessedit_lower_flip_hyphen <= 1) |
| return; |
| |
| outword_it.set_to_list(word_res->outword->blob_list()); |
| UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-"); |
| bool modified = false; |
| for (i = 0, outword_it.mark_cycle_pt(); |
| i < best_choice->length() && !outword_it.cycled_list(); |
| ++i, outword_it.forward()) { |
| out_box = outword_it.data()->bounding_box(); |
| if (outword_it.at_last()) |
| next_left = 9999; |
| else |
| next_left = outword_it.data_relative(1)->bounding_box().left(); |
| // Dont touch small or touching blobs - it is too dangerous. |
| if ((out_box.width() > 8 * word_res->denorm.scale()) && |
| (out_box.left() > prev_right) && (out_box.right() < next_left)) { |
| aspect_ratio = out_box.width() / (float) out_box.height(); |
| if (unicharset.eq(best_choice->unichar_id(i), ".")) { |
| if (aspect_ratio >= tessedit_upper_flip_hyphen && |
| unicharset.contains_unichar_id(unichar_dash) && |
| unicharset.get_enabled(unichar_dash)) { |
| /* Certain HYPHEN */ |
| best_choice->set_unichar_id(unichar_dash, i); |
| modified = true; |
| if (word_res->reject_map[i].rejected()) |
| word_res->reject_map[i].setrej_hyphen_accept(); |
| } |
| if ((aspect_ratio > tessedit_lower_flip_hyphen) && |
| word_res->reject_map[i].accepted()) |
| //Suspected HYPHEN |
| word_res->reject_map[i].setrej_hyphen (); |
| } |
| else if (best_choice->unichar_id(i) == unichar_dash) { |
| if ((aspect_ratio >= tessedit_upper_flip_hyphen) && |
| (word_res->reject_map[i].rejected())) |
| word_res->reject_map[i].setrej_hyphen_accept(); |
| //Certain HYPHEN |
| |
| if ((aspect_ratio <= tessedit_lower_flip_hyphen) && |
| (word_res->reject_map[i].accepted())) |
| //Suspected HYPHEN |
| word_res->reject_map[i].setrej_hyphen(); |
| } |
| } |
| prev_right = out_box.right(); |
| } |
| if (modified) { |
| best_choice->populate_unichars(unicharset); |
| } |
| } |
| |
| void Tesseract::flip_0O(WERD_RES *word_res) { |
| WERD_CHOICE *best_choice = word_res->best_choice; |
| int i; |
| PBLOB_IT outword_it; |
| TBOX out_box; |
| |
| if (!tessedit_flip_0O) |
| return; |
| |
| outword_it.set_to_list(word_res->outword->blob_list ()); |
| |
| for (i = 0, outword_it.mark_cycle_pt (); |
| i < best_choice->length() && !outword_it.cycled_list (); |
| ++i, outword_it.forward ()) { |
| if (unicharset.get_isupper(best_choice->unichar_id(i)) || |
| unicharset.get_isdigit(best_choice->unichar_id(i))) { |
| out_box = outword_it.data()->bounding_box (); |
| if ((out_box.top() < bln_baseline_offset + bln_x_height) || |
| (out_box.bottom() > bln_baseline_offset + bln_x_height / 4)) |
| return; //Beware words with sub/superscripts |
| } |
| } |
| UNICHAR_ID unichar_0 = unicharset.unichar_to_id("0"); |
| UNICHAR_ID unichar_O = unicharset.unichar_to_id("O"); |
| if (unichar_0 == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_0) || |
| unichar_O == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_O)) { |
| return; // 0 or O are not present/enabled in unicharset |
| } |
| bool modified = false; |
| for (i = 1; i < best_choice->length(); ++i, outword_it.forward ()) { |
| if (best_choice->unichar_id(i) == unichar_0 || |
| best_choice->unichar_id(i) == unichar_O) { |
| /* A0A */ |
| if ((i+1) < best_choice->length() && |
| non_O_upper(best_choice->unichar_id(i-1)) && |
| non_O_upper(best_choice->unichar_id(i+1))) { |
| best_choice->set_unichar_id(unichar_O, i); |
| modified = true; |
| } |
| /* A00A */ |
| if (non_O_upper(best_choice->unichar_id(i-1)) && |
| (i+1) < best_choice->length() && |
| (best_choice->unichar_id(i+1) == unichar_0 || |
| best_choice->unichar_id(i+1) == unichar_O) && |
| (i+2) < best_choice->length() && |
| non_O_upper(best_choice->unichar_id(i+2))) { |
| best_choice->set_unichar_id(unichar_O, i); |
| modified = true; |
| i++; |
| } |
| /* AA0<non digit or end of word> */ |
| if ((i > 1) && |
| non_O_upper(best_choice->unichar_id(i-2)) && |
| non_O_upper(best_choice->unichar_id(i-1)) && |
| (((i+1) < best_choice->length() && |
| !unicharset.get_isdigit(best_choice->unichar_id(i+1)) && |
| !unicharset.eq(best_choice->unichar_id(i+1), "l") && |
| !unicharset.eq(best_choice->unichar_id(i+1), "I")) || |
| (i == best_choice->length() - 1))) { |
| best_choice->set_unichar_id(unichar_O, i); |
| modified = true; |
| } |
| /* 9O9 */ |
| if (non_0_digit(best_choice->unichar_id(i-1)) && |
| (i+1) < best_choice->length() && |
| non_0_digit(best_choice->unichar_id(i+1))) { |
| best_choice->set_unichar_id(unichar_0, i); |
| modified = true; |
| } |
| /* 9OOO */ |
| if (non_0_digit(best_choice->unichar_id(i-1)) && |
| (i+2) < best_choice->length() && |
| (best_choice->unichar_id(i+1) == unichar_0 || |
| best_choice->unichar_id(i+1) == unichar_O) && |
| (best_choice->unichar_id(i+2) == unichar_0 || |
| best_choice->unichar_id(i+2) == unichar_O)) { |
| best_choice->set_unichar_id(unichar_0, i); |
| best_choice->set_unichar_id(unichar_0, i+1); |
| best_choice->set_unichar_id(unichar_0, i+2); |
| modified = true; |
| i += 2; |
| } |
| /* 9OO<non upper> */ |
| if (non_0_digit(best_choice->unichar_id(i-1)) && |
| (i+2) < best_choice->length() && |
| (best_choice->unichar_id(i+1) == unichar_0 || |
| best_choice->unichar_id(i+1) == unichar_O) && |
| !unicharset.get_isupper(best_choice->unichar_id(i+2))) { |
| best_choice->set_unichar_id(unichar_0, i); |
| best_choice->set_unichar_id(unichar_0, i+1); |
| modified = true; |
| i++; |
| } |
| /* 9O<non upper> */ |
| if (non_0_digit(best_choice->unichar_id(i-1)) && |
| (i+1) < best_choice->length() && |
| !unicharset.get_isupper(best_choice->unichar_id(i+1))) { |
| best_choice->set_unichar_id(unichar_0, i); |
| } |
| /* 9[.,]OOO.. */ |
| if ((i > 1) && |
| (unicharset.eq(best_choice->unichar_id(i-1), ".") || |
| unicharset.eq(best_choice->unichar_id(i-1), ",")) && |
| (unicharset.get_isdigit(best_choice->unichar_id(i-2)) || |
| best_choice->unichar_id(i-2) == unichar_O)) { |
| if (best_choice->unichar_id(i-2) == unichar_O) { |
| best_choice->set_unichar_id(unichar_0, i-2); |
| modified = true; |
| } |
| while (i < best_choice->length() && |
| (best_choice->unichar_id(i) == unichar_O || |
| best_choice->unichar_id(i) == unichar_0)) { |
| best_choice->set_unichar_id(unichar_0, i); |
| modified = true; |
| i++; |
| } |
| i--; |
| } |
| } |
| } |
| if (modified) { |
| best_choice->populate_unichars(unicharset); |
| } |
| } |
| |
| BOOL8 Tesseract::non_O_upper(UNICHAR_ID unichar_id) { |
| return (unicharset.get_isupper(unichar_id) && |
| (!unicharset.eq(unichar_id, "O"))); |
| } |
| |
| BOOL8 Tesseract::non_0_digit(UNICHAR_ID unichar_id) { |
| return (unicharset.get_isdigit(unichar_id) && |
| (!unicharset.eq(unichar_id, "0"))); |
| } |
| } // namespace tesseract |