| /* -*-C-*- |
| ******************************************************************************** |
| * |
| * File: permute.c (Formerly permute.c) |
| * Description: Choose OCR text given character-probability maps |
| * for sequences of glyph fragments and a dictionary provided as |
| * a Dual Acyclic Word Graph. |
| * In this file, "permute" should be read "combine." |
| * Author: Mark Seaman, OCR Technology |
| * Created: Fri Sep 22 14:05:51 1989 |
| * Modified: Thu Jan 3 16:38:46 1991 (Mark Seaman) marks@hpgrlt |
| * Language: C |
| * Package: N/A |
| * Status: Experimental (Do Not Distribute) |
| * |
| * (c) Copyright 1989, Hewlett-Packard Company. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| * |
| *********************************************************************************/ |
| /*---------------------------------------------------------------------- |
| I n c l u d e s |
| ---------------------------------------------------------------------*/ |
| |
| #include <assert.h> |
| #include <math.h> |
| |
| #ifndef FST_DISABLED |
| #include "fstmodel.h" |
| #endif |
| |
| #include "const.h" |
| |
| #include "permute.h" |
| |
| #include "callcpp.h" |
| #include "choices.h" |
| #include "context.h" |
| #include "conversion.h" |
| #include "debug.h" |
| #include "freelist.h" |
| #include "globals.h" |
| #include "hyphen.h" |
| #include "ndminx.h" |
| #include "permdawg.h" |
| #include "permngram.h" |
| #include "permnum.h" |
| #include "ratngs.h" |
| #include "stopper.h" |
| #include "tordvars.h" |
| #include "tprintf.h" |
| #include "trie.h" |
| #include "varable.h" |
| #include "unicharset.h" |
| #include "dict.h" |
| #include "image.h" |
| #include "ccutil.h" |
| |
| int permutation_count; // Used in metrics.cpp. |
| /*---------------------------------------------------------------------- |
| V a r i a b l e s |
| ----------------------------------------------------------------------*/ |
| // TODO(tkielbus) Choose a value for the MAX_NUM_EDGES constant |
| // (or make it dynamic) |
| #define MAX_NUM_EDGES 2000000 |
| #ifdef DISABLE_DOC_DICT |
| #define MAX_DOC_EDGES 150 |
| #define RESERVED_DOC_EDGES 10 |
| #else |
| #define MAX_DOC_EDGES 250000 |
| #define RESERVED_DOC_EDGES 10000 |
| #endif |
| #define MAX_USER_EDGES 50000 |
| #define USER_RESERVED_EDGES 2000 |
| /* Weights for adjustment */ |
| #define NON_WERD 1.25 |
| #define GARBAGE_STRING 1.5 |
| #define MAX_PERM_LENGTH 128 |
| |
| EDGE_ARRAY pending_words; |
| EDGE_ARRAY document_words; |
| EDGE_ARRAY user_words; |
| EDGE_ARRAY word_dawg; |
| |
| INT_VAR(fragments_debug, 0, "Debug character fragments"); |
| |
| BOOL_VAR(segment_debug, 0, "Debug the whole segmentation process"); |
| |
| BOOL_VAR(segment_adjust_debug, 0, "Segmentation adjustment debug"); |
| |
| double_VAR(segment_penalty_dict_nonword, NON_WERD, |
| "Score multiplier for glyph fragment segmentations which do not " |
| "match a dictionary word (lower is better)."); |
| |
| double_VAR(segment_penalty_garbage, GARBAGE_STRING, |
| "Score multiplier for poorly cased strings that are not in the " |
| "dictionary and generally look like garbage (lower is better)."); |
| |
| BOOL_VAR(save_doc_words, 0, "Save Document Words"); |
| |
| |
| #ifdef DISABLE_DOC_DICT |
| BOOL_VAR(doc_dict_enable, 0, "Enable Document Dictionary "); |
| #else |
| BOOL_VAR(doc_dict_enable, 1, "Enable Document Dictionary "); |
| /* PREV DEFAULT 0 */ |
| #endif |
| |
| BOOL_VAR(ngram_permuter_activated, FALSE, |
| "Activate character-level n-gram-based permuter"); |
| |
| #ifndef FST_DISABLED |
| BOOL_VAR(fst_activated, FALSE, "Activate fst"); |
| #endif |
| |
| int permute_only_top = 0; |
| |
| #if 0 |
| //0x0=. |
| static inT32 bigram_counts[256][3] = { { |
| 0, 0, 0 |
| }, |
| { //0x1=. |
| 0, 0, 0 |
| }, |
| { //0x2=. |
| 0, 0, 0 |
| }, |
| { //0x3=. |
| 0, 0, 0 |
| }, |
| { //0x4=. |
| 0, 0, 0 |
| }, |
| { //0x5=. |
| 0, 0, 0 |
| }, |
| { //0x6=. |
| 0, 0, 0 |
| }, |
| { //0x7=. |
| 0, 0, 0 |
| }, |
| { //0x8=. |
| 0, 0, 0 |
| }, |
| { //0x9=. |
| 0, 0, 0 |
| }, |
| { //0xa=. |
| 93, 28, 0 |
| }, |
| { //0xb=. |
| 0, 0, 0 |
| }, |
| { //0xc=. |
| 0, 0, 0 |
| }, |
| { //0xd=. |
| 0, 0, 0 |
| }, |
| { //0xe=. |
| 0, 0, 0 |
| }, |
| { //0xf=. |
| 0, 0, 0 |
| }, |
| { //0x10=. |
| 0, 0, 0 |
| }, |
| { //0x11=. |
| 0, 0, 0 |
| }, |
| { //0x12=. |
| 0, 0, 0 |
| }, |
| { //0x13=. |
| 0, 0, 0 |
| }, |
| { //0x14=. |
| 0, 0, 0 |
| }, |
| { //0x15=. |
| 0, 0, 0 |
| }, |
| { //0x16=. |
| 0, 0, 0 |
| }, |
| { //0x17=. |
| 0, 0, 0 |
| }, |
| { //0x18=. |
| 0, 0, 0 |
| }, |
| { //0x19=. |
| 0, 0, 0 |
| }, |
| { //0x1a=. |
| 0, 0, 0 |
| }, |
| { //0x1b=. |
| 0, 0, 0 |
| }, |
| { //0x1c=. |
| 0, 0, 0 |
| }, |
| { //0x1d=. |
| 0, 0, 0 |
| }, |
| { //0x1e=. |
| 0, 0, 0 |
| }, |
| { //0x1f=. |
| 0, 0, 0 |
| }, |
| { //0x20= |
| 324, 377, 2 |
| }, |
| { //0x21=! |
| 2, 1, 0 |
| }, |
| { //0x22=" |
| 2, 1, 0 |
| }, |
| { //0x23=# |
| 1, 0, 1 |
| }, |
| { //0x24=$ |
| 2, 1, 0 |
| }, |
| { //0x25=% |
| 2, 0, 0 |
| }, |
| { //0x26=& |
| 2, 1, 0 |
| }, |
| { //0x27=' |
| 1, 21, 8 |
| }, |
| { //0x28=( |
| 2, 1, 0 |
| }, |
| { //0x29=) |
| 19, 0, 0 |
| }, |
| { //0x2a=* |
| 2, 1, 0 |
| }, |
| { //0x2b=+ |
| 1, 0, 0 |
| }, |
| { //0x2c=, |
| 75, 4, 0 |
| }, |
| { //0x2d=- |
| 52, 7, 0 |
| }, |
| { //0x2e=. |
| 190, 16, 3 |
| }, |
| { //0x2f=/ |
| 53, 2, 0 |
| }, |
| { //0x30=0 |
| 399, 0, 0 |
| }, |
| { //0x31=1 |
| 220, 0, 0 |
| }, |
| { //0x32=2 |
| 226, 0, 0 |
| }, |
| { //0x33=3 |
| 128, 0, 0 |
| }, |
| { //0x34=4 |
| 147, 0, 0 |
| }, |
| { //0x35=5 |
| 179, 0, 1 |
| }, |
| { //0x36=6 |
| 173, 0, 0 |
| }, |
| { //0x37=7 |
| 115, 0, 0 |
| }, |
| { //0x38=8 |
| 107, 0, 0 |
| }, |
| { //0x39=9 |
| 934, 0, 1 |
| }, |
| { //0x3a=: |
| 27, 0, 1 |
| }, |
| { //0x3b=; |
| 2, 1, 0 |
| }, |
| { //0x3c=< |
| 2, 1, 0 |
| }, |
| { //0x3d== |
| 2, 1, 0 |
| }, |
| { //0x3e=> |
| 2, 1, 0 |
| }, |
| { //0x3f=? |
| 2, 1, 0 |
| }, |
| { //0x40=@ |
| 2, 1, 0 |
| }, |
| { //0x41=A |
| 3, 1, 0 |
| }, |
| { //0x42=B |
| 1, 73, 0 |
| }, |
| { //0x43=C |
| 1, 6, 0 |
| }, |
| { //0x44=D |
| 1, 24, 0 |
| }, |
| { //0x45=E |
| 1, 2, 0 |
| }, |
| { //0x46=F |
| 1, 19, 0 |
| }, |
| { //0x47=G |
| 1, 2, 0 |
| }, |
| { //0x48=H |
| 3, 2, 1 |
| }, |
| { //0x49=I |
| 0, 68, 0 |
| }, |
| { //0x4a=J |
| 1, 2, 0 |
| }, |
| { //0x4b=K |
| 1, 2, 0 |
| }, |
| { //0x4c=L |
| 1, 82, 0 |
| }, |
| { //0x4d=M |
| 10, 10, 0 |
| }, |
| { //0x4e=N |
| 3, 239, 0 |
| }, |
| { //0x4f=O |
| 1, 10, 0 |
| }, |
| { //0x50=P |
| 0, 1, 3 |
| }, |
| { //0x51=Q |
| 2, 3, 0 |
| }, |
| { //0x52=R |
| 1, 43, 0 |
| }, |
| { //0x53=S |
| 1, 53, 0 |
| }, |
| { //0x54=T |
| 2, 18, 0 |
| }, |
| { //0x55=U |
| 1, 2, 0 |
| }, |
| { //0x56=V |
| 1, 17, 0 |
| }, |
| { //0x57=W |
| 1, 5, 0 |
| }, |
| { //0x58=X |
| 1, 6, 0 |
| }, |
| { //0x59=Y |
| 1, 2, 0 |
| }, |
| { //0x5a=Z |
| 1, 2, 0 |
| }, |
| { //0x5b=[ |
| 2, 1, 0 |
| }, |
| { //0x5c=backslash |
| 2, 1, 0 |
| }, |
| { //0x5d=] |
| 2, 1, 0 |
| }, |
| { //0x5e=^ |
| 2, 1, 0 |
| }, |
| { //0x5f=_ |
| 2, 1, 0 |
| }, |
| { //0x60=` |
| 1, 0, 2 |
| }, |
| { //0x61=a |
| 0, 0, 671 |
| }, |
| { //0x62=b |
| 0, 1, 16 |
| }, |
| { //0x63=c |
| 0, 2, 1 |
| }, |
| { //0x64=d |
| 0, 14, 0 |
| }, |
| { //0x65=e |
| 0, 0, 763 |
| }, |
| { //0x66=f |
| 0, 186, 0 |
| }, |
| { //0x67=g |
| 0, 2, 1 |
| }, |
| { //0x68=h |
| 0, 2, 1 |
| }, |
| { //0x69=i |
| 0, 0, 818 |
| }, |
| { //0x6a=j |
| 0, 2, 1 |
| }, |
| { //0x6b=k |
| 0, 4, 1 |
| }, |
| { //0x6c=l |
| 0, 26, 3 |
| }, |
| { //0x6d=m |
| 0, 69, 0 |
| }, |
| { //0x6e=n |
| 0, 885, 0 |
| }, |
| { //0x6f=o |
| 0, 17, 722 |
| }, |
| { //0x70=p |
| 0, 1, 5 |
| }, |
| { //0x71=q |
| 2, 1, 0 |
| }, |
| { //0x72=r |
| 0, 21, 0 |
| }, |
| { //0x73=s |
| 3, 49, 0 |
| }, |
| { //0x74=t |
| 0, 219, 5 |
| }, |
| { //0x75=u |
| 0, 0, 56 |
| }, |
| { //0x76=v |
| 0, 4, 0 |
| }, |
| { //0x77=w |
| 0, 2, 1 |
| }, |
| { //0x78=x |
| 0, 2, 1 |
| }, |
| { //0x79=y |
| 0, 1, 23 |
| }, |
| { //0x7a=z |
| 0, 2, 1 |
| }, |
| { //0x7b={ |
| 2, 1, 0 |
| }, |
| { //0x7c=| |
| 59, 0, 3 |
| }, |
| { //0x7d=} |
| 2, 1, 0 |
| }, |
| { //0x7e=~ |
| 2, 1, 0 |
| }, |
| { //0x7f=. |
| 0, 0, 0 |
| }, |
| { //0x80=. |
| 0, 0, 0 |
| }, |
| { //0x81=. |
| 0, 0, 0 |
| }, |
| { //0x82=. |
| 0, 0, 0 |
| }, |
| { //0x83=. |
| 0, 0, 0 |
| }, |
| { //0x84=. |
| 0, 0, 0 |
| }, |
| { //0x85=. |
| 0, 0, 0 |
| }, |
| { //0x86=. |
| 0, 0, 0 |
| }, |
| { //0x87=. |
| 0, 0, 0 |
| }, |
| { //0x88=. |
| 0, 0, 0 |
| }, |
| { //0x89=. |
| 0, 0, 0 |
| }, |
| { //0x8a=. |
| 0, 0, 0 |
| }, |
| { //0x8b=. |
| 0, 0, 0 |
| }, |
| { //0x8c=. |
| 0, 0, 0 |
| }, |
| { //0x8d=. |
| 0, 0, 0 |
| }, |
| { //0x8e=. |
| 0, 0, 0 |
| }, |
| { //0x8f=. |
| 0, 0, 0 |
| }, |
| { //0x90=. |
| 0, 0, 0 |
| }, |
| { //0x91=. |
| 0, 0, 0 |
| }, |
| { //0x92=. |
| 0, 0, 0 |
| }, |
| { //0x93=. |
| 0, 0, 0 |
| }, |
| { //0x94=. |
| 0, 0, 0 |
| }, |
| { //0x95=. |
| 0, 0, 0 |
| }, |
| { //0x96=. |
| 0, 0, 0 |
| }, |
| { //0x97=. |
| 0, 0, 0 |
| }, |
| { //0x98=. |
| 0, 0, 0 |
| }, |
| { //0x99=. |
| 0, 0, 0 |
| }, |
| { //0x9a=. |
| 0, 0, 0 |
| }, |
| { //0x9b=. |
| 0, 0, 0 |
| }, |
| { //0x9c=. |
| 0, 0, 0 |
| }, |
| { //0x9d=. |
| 0, 0, 0 |
| }, |
| { //0x9e=. |
| 0, 0, 0 |
| }, |
| { //0x9f=. |
| 0, 0, 0 |
| }, |
| { //0xa0=. |
| 0, 0, 0 |
| }, |
| { //0xa1=. |
| 0, 0, 0 |
| }, |
| { //0xa2=. |
| 0, 0, 0 |
| }, |
| { //0xa3=. |
| 0, 0, 0 |
| }, |
| { //0xa4=. |
| 0, 0, 0 |
| }, |
| { //0xa5=. |
| 0, 0, 0 |
| }, |
| { //0xa6=. |
| 0, 0, 0 |
| }, |
| { //0xa7=. |
| 0, 0, 0 |
| }, |
| { //0xa8=. |
| 0, 0, 0 |
| }, |
| { //0xa9=. |
| 0, 0, 0 |
| }, |
| { //0xaa=. |
| 0, 0, 0 |
| }, |
| { //0xab=. |
| 0, 0, 0 |
| }, |
| { //0xac=. |
| 0, 0, 0 |
| }, |
| { //0xad=. |
| 0, 0, 0 |
| }, |
| { //0xae=. |
| 0, 0, 0 |
| }, |
| { //0xaf=. |
| 0, 0, 0 |
| }, |
| { //0xb0=. |
| 0, 0, 0 |
| }, |
| { //0xb1=. |
| 0, 0, 0 |
| }, |
| { //0xb2=. |
| 0, 0, 0 |
| }, |
| { //0xb3=. |
| 0, 0, 0 |
| }, |
| { //0xb4=. |
| 0, 0, 0 |
| }, |
| { //0xb5=. |
| 0, 0, 0 |
| }, |
| { //0xb6=. |
| 0, 0, 0 |
| }, |
| { //0xb7=. |
| 0, 0, 0 |
| }, |
| { //0xb8=. |
| 0, 0, 0 |
| }, |
| { //0xb9=. |
| 0, 0, 0 |
| }, |
| { //0xba=. |
| 0, 0, 0 |
| }, |
| { //0xbb=. |
| 0, 0, 0 |
| }, |
| { //0xbc=. |
| 0, 0, 0 |
| }, |
| { //0xbd=. |
| 0, 0, 0 |
| }, |
| { //0xbe=. |
| 0, 0, 0 |
| }, |
| { //0xbf=. |
| 0, 0, 0 |
| }, |
| { //0xc0=. |
| 0, 0, 0 |
| }, |
| { //0xc1=. |
| 0, 0, 0 |
| }, |
| { //0xc2=. |
| 0, 0, 0 |
| }, |
| { //0xc3=. |
| 0, 0, 0 |
| }, |
| { //0xc4=. |
| 0, 0, 0 |
| }, |
| { //0xc5=. |
| 0, 0, 0 |
| }, |
| { //0xc6=. |
| 0, 0, 0 |
| }, |
| { //0xc7=. |
| 0, 0, 0 |
| }, |
| { //0xc8=. |
| 0, 0, 0 |
| }, |
| { //0xc9=. |
| 0, 0, 0 |
| }, |
| { //0xca=. |
| 0, 0, 0 |
| }, |
| { //0xcb=. |
| 0, 0, 0 |
| }, |
| { //0xcc=. |
| 0, 0, 0 |
| }, |
| { //0xcd=. |
| 0, 0, 0 |
| }, |
| { //0xce=. |
| 0, 0, 0 |
| }, |
| { //0xcf=. |
| 0, 0, 0 |
| }, |
| { //0xd0=. |
| 0, 0, 0 |
| }, |
| { //0xd1=. |
| 0, 0, 0 |
| }, |
| { //0xd2=. |
| 0, 0, 0 |
| }, |
| { //0xd3=. |
| 0, 0, 0 |
| }, |
| { //0xd4=. |
| 0, 0, 0 |
| }, |
| { //0xd5=. |
| 0, 0, 0 |
| }, |
| { //0xd6=. |
| 0, 0, 0 |
| }, |
| { //0xd7=. |
| 0, 0, 0 |
| }, |
| { //0xd8=. |
| 0, 0, 0 |
| }, |
| { //0xd9=. |
| 0, 0, 0 |
| }, |
| { //0xda=. |
| 0, 0, 0 |
| }, |
| { //0xdb=. |
| 0, 0, 0 |
| }, |
| { //0xdc=. |
| 0, 0, 0 |
| }, |
| { //0xdd=. |
| 0, 0, 0 |
| }, |
| { //0xde=. |
| 0, 0, 0 |
| }, |
| { //0xdf=. |
| 0, 0, 0 |
| }, |
| { //0xe0=. |
| 0, 0, 0 |
| }, |
| { //0xe1=. |
| 0, 0, 0 |
| }, |
| { //0xe2=. |
| 0, 0, 0 |
| }, |
| { //0xe3=. |
| 0, 0, 0 |
| }, |
| { //0xe4=. |
| 0, 0, 0 |
| }, |
| { //0xe5=. |
| 0, 0, 0 |
| }, |
| { //0xe6=. |
| 0, 0, 0 |
| }, |
| { //0xe7=. |
| 0, 0, 0 |
| }, |
| { //0xe8=. |
| 0, 0, 0 |
| }, |
| { //0xe9=. |
| 0, 0, 0 |
| }, |
| { //0xea=. |
| 0, 0, 0 |
| }, |
| { //0xeb=. |
| 0, 0, 0 |
| }, |
| { //0xec=. |
| 0, 0, 0 |
| }, |
| { //0xed=. |
| 0, 0, 0 |
| }, |
| { //0xee=. |
| 0, 0, 0 |
| }, |
| { //0xef=. |
| 0, 0, 0 |
| }, |
| { //0xf0=. |
| 0, 0, 0 |
| }, |
| { //0xf1=. |
| 0, 0, 0 |
| }, |
| { //0xf2=. |
| 0, 0, 0 |
| }, |
| { //0xf3=. |
| 0, 0, 0 |
| }, |
| { //0xf4=. |
| 0, 0, 0 |
| }, |
| { //0xf5=. |
| 0, 0, 0 |
| }, |
| { //0xf6=. |
| 0, 0, 0 |
| }, |
| { //0xf7=. |
| 0, 0, 0 |
| }, |
| { //0xf8=. |
| 0, 0, 0 |
| }, |
| { //0xf9=. |
| 0, 0, 0 |
| }, |
| { //0xfa=. |
| 0, 0, 0 |
| }, |
| { //0xfb=. |
| 0, 0, 0 |
| }, |
| { //0xfc=. |
| 0, 0, 0 |
| }, |
| { //0xfd=. |
| 0, 0, 0 |
| }, |
| { //0xfe=. |
| 0, 0, 0 |
| }, |
| { //0xff=. |
| 0, 0, 0 |
| }, |
| }; |
| #endif |
| |
| //extern "C" double permuter_pending_threshold; |
| |
| /* Similarity matcher values */ |
| #define SIM_CERTAINTY_SCALE -10.0 |
| /* Similarity matcher values */ |
| #define SIM_CERTAINTY_OFFSET -10.0 |
| /* Worst E*L product to stop on */ |
| #define SIMILARITY_FLOOR 100.0 |
| /*---------------------------------------------------------------------- |
| F u n c t i o n s |
| ----------------------------------------------------------------------*/ |
| |
| /********************************************************************** |
| * get_best_delete_other |
| * |
| * Returns the best of two choices and deletes the other (worse) choice. |
| * A choice is better if it has a non-empty string and has a lower |
| * rating than the other choice. If the ratings are the same, |
| * choice2 is preferred over choice1. |
| **********************************************************************/ |
| WERD_CHOICE *get_best_delete_other(WERD_CHOICE *choice1, |
| WERD_CHOICE *choice2) { |
| if (!choice1) return choice2; |
| if (!choice2) return choice1; |
| if (choice1->rating() < choice2->rating() || choice2->length() == 0) { |
| delete choice2; |
| return choice1; |
| } else { |
| delete choice1; |
| return choice2; |
| } |
| } |
| |
| |
| /********************************************************************** |
| * good_choice |
| * |
| * Return TRUE if a good answer is found for the unknown blob rating. |
| **********************************************************************/ |
| int good_choice(const WERD_CHOICE &choice) { |
| register float certainty; |
| if (similarity_enable) { |
| if ((choice.rating() + 1) * choice.certainty() > SIMILARITY_FLOOR) |
| return false; |
| certainty = |
| SIM_CERTAINTY_OFFSET + choice.rating() * SIM_CERTAINTY_SCALE; |
| } else { |
| certainty = choice.certainty(); |
| } |
| |
| return (certainty > certainty_threshold) ? true : false; |
| } |
| |
| |
| /********************************************************************** |
| * add_document_word |
| * |
| * Add a word found on this document to the document specific |
| * dictionary. |
| **********************************************************************/ |
| namespace tesseract { |
| void Dict::add_document_word(const WERD_CHOICE &best_choice) { |
| char filename[CHARS_PER_LINE]; |
| FILE *doc_word_file; |
| const char *string = best_choice.unichar_string().string(); |
| int stringlen = best_choice.length(); |
| |
| if (!doc_dict_enable |
| || valid_word (string) || CurrentWordAmbig () || stringlen < 2) |
| return; |
| |
| if (!good_choice(best_choice) || stringlen == 2) { |
| if (best_choice.certainty() < permuter_pending_threshold) |
| return; |
| if (!word_in_dawg(pending_words, string)) { |
| if (stringlen > 2 || |
| (stringlen == 2 && |
| getUnicharset().get_isupper(best_choice.unichar_id(0)) && |
| getUnicharset().get_isupper(best_choice.unichar_id(1)))) { |
| add_word_to_dawg(pending_words, string, |
| MAX_DOC_EDGES, RESERVED_DOC_EDGES); |
| } |
| return; |
| } |
| } |
| |
| if (save_doc_words) { |
| strcpy(filename, getImage()->getCCUtil()->imagefile.string()); |
| strcat (filename, ".doc"); |
| doc_word_file = open_file (filename, "a"); |
| fprintf (doc_word_file, "%s\n", string); |
| fclose(doc_word_file); |
| } |
| add_word_to_dawg(document_words, string, MAX_DOC_EDGES, RESERVED_DOC_EDGES); |
| } |
| |
| |
| /********************************************************************** |
| * adjust_non_word |
| * |
| * Assign an adjusted value to a string that is a non-word. The value |
| * that this word choice has is based on case and punctuation rules. |
| **********************************************************************/ |
| void Dict::adjust_non_word(const char *word, const char *word_lengths, |
| float rating, float *new_rating, |
| float *adjust_factor) { |
| if (segment_adjust_debug) |
| cprintf("Non-word: %s %4.2f ", word, rating); |
| |
| *new_rating = rating + RATING_PAD; |
| if (case_ok(word, word_lengths) && |
| punctuation_ok(word, word_lengths) != -1) { |
| *new_rating *= segment_penalty_dict_nonword; |
| *adjust_factor = segment_penalty_dict_nonword; |
| if (segment_adjust_debug) |
| cprintf(", %4.2f ", (double)segment_penalty_dict_nonword); |
| } else { |
| *new_rating *= segment_penalty_garbage; |
| *adjust_factor = segment_penalty_garbage; |
| if (segment_adjust_debug) { |
| if (!case_ok(word, word_lengths)) |
| cprintf (", C"); |
| if (punctuation_ok(word, word_lengths) == -1) |
| cprintf (", P"); |
| cprintf (", %4.2f ", (double)segment_penalty_garbage); |
| } |
| } |
| |
| *new_rating -= RATING_PAD; |
| |
| if (segment_adjust_debug) |
| cprintf (" --> %4.2f\n", *new_rating); |
| } |
| |
| |
| /********************************************************************** |
| * init_permute |
| * |
| * Initialize anything that needs to be set up for the permute |
| * functions. |
| **********************************************************************/ |
| void Dict::init_permute() { |
| if (word_dawg != NULL) |
| end_permute(); |
| init_permdawg(); |
| STRING name; |
| name = getImage()->getCCUtil()->language_data_path_prefix; |
| name += "word-dawg"; |
| word_dawg = read_squished_dawg(name.string()); |
| |
| document_words = |
| (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES); |
| initialize_dawg(document_words, MAX_DOC_EDGES); |
| |
| pending_words = |
| (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES); |
| initialize_dawg(pending_words, MAX_DOC_EDGES); |
| |
| user_words = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_USER_EDGES); |
| name = getImage()->getCCUtil()->language_data_path_prefix; |
| name += "user-words"; |
| read_word_list(name.string(), user_words, MAX_USER_EDGES, USER_RESERVED_EDGES); |
| #ifndef FST_DISABLED |
| if (fst_activated) { |
| STRING name; |
| name = getImage()->getCCUtil()->language_data_path_prefix; |
| name += "fst"; |
| LanguageModel::Instance()->InitWithLanguage(name.string()); |
| letter_is_okay_ = &tesseract::Dict::fst_letter_is_okay; |
| } |
| #endif |
| } |
| |
| void Dict::end_permute() { |
| if (word_dawg == NULL) |
| return; // Not safe to call twice. |
| memfree(word_dawg); |
| word_dawg = NULL; |
| memfree(document_words); |
| document_words = NULL; |
| memfree(pending_words); |
| pending_words = NULL; |
| memfree(user_words); |
| user_words = NULL; |
| end_permdawg(); |
| } |
| |
| /********************************************************************** |
| * permute_all |
| * |
| * Permute all the characters together using all of the different types |
| * of permuters/selectors available. Each of the characters must have |
| * a non-NULL choice list. |
| * |
| * Note: order of applying permuters does matter, since the latter |
| * permuter will be recorded if the resulting word ratings are the same. |
| **********************************************************************/ |
| WERD_CHOICE *Dict::permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices, |
| float rating_limit, |
| WERD_CHOICE *raw_choice) { |
| A_CHOICE *a_choice; |
| WERD_CHOICE *result1; |
| WERD_CHOICE *result2 = NULL; |
| BOOL8 any_alpha; |
| int free_index; |
| float top_choice_rating_limit = rating_limit; |
| CHOICES_LIST old_char_choices = NULL; |
| |
| // Initialize result1 from the result of permute_top_choice. |
| result1 = permute_top_choice(char_choices, &top_choice_rating_limit, |
| raw_choice, &any_alpha); |
| |
| // Permute character fragments if necessary. |
| if (result1 == NULL || result1->fragment_mark()) { |
| result2 = top_fragments_permute_and_select(char_choices, |
| top_choice_rating_limit); |
| result1 = get_best_delete_other(result1, result2); |
| } |
| |
| if (ngram_permuter_activated) { |
| old_char_choices = convert_to_choices_list(char_choices, getUnicharset()); |
| A_CHOICE *ngram_choice = |
| ngram_permute_and_select(old_char_choices, rating_limit, word_dawg); |
| free_all_choices(old_char_choices, free_index); |
| if (ngram_choice == NULL) return NULL; |
| result1 = new WERD_CHOICE(); |
| convert_to_word_choice(ngram_choice, getUnicharset(), result1); |
| return result1; |
| } |
| |
| if (result1 == NULL) |
| return (NULL); |
| if (permute_only_top) |
| return result1; |
| |
| old_char_choices = convert_to_choices_list(char_choices, getUnicharset()); |
| |
| if (display_ratings > 1) { |
| int i; |
| cprintf("\nold_char_choices in permute_characters: "); |
| for_each_choice(old_char_choices, i) { |
| print_choices("", (CHOICES) array_index (old_char_choices, i)); |
| cprintf("\n"); |
| } |
| } |
| |
| if (any_alpha && char_choices.length() <= MAX_WERD_LENGTH) { |
| a_choice = permute_words(old_char_choices, rating_limit); |
| result1 = get_best_delete_other(getUnicharset(), result1, a_choice); |
| } |
| |
| a_choice = number_permute_and_select(old_char_choices, rating_limit); |
| result1 = get_best_delete_other(getUnicharset(), result1, a_choice); |
| |
| result2 = permute_compound_words(char_choices, rating_limit); |
| result1 = get_best_delete_other(result1, result2); |
| |
| free_all_choices(old_char_choices, free_index); |
| return (result1); |
| } |
| |
| |
| /********************************************************************** |
| * permute_characters |
| * |
| * Permute these characters together according to each of the different |
| * permuters that are enabled. |
| **********************************************************************/ |
| void Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, |
| float limit, |
| WERD_CHOICE *best_choice, |
| WERD_CHOICE *raw_choice) { |
| float old_raw_choice_rating = raw_choice->rating(); |
| |
| if (display_ratings > 1) { |
| cprintf("\nchar_choices in permute_characters:\n"); |
| for (int i = 0; i < char_choices.length(); ++i) { |
| print_ratings_list("", char_choices.get(i), getUnicharset()); |
| cprintf("\n"); |
| } |
| } |
| |
| permutation_count++; /* Global counter */ |
| |
| WERD_CHOICE *this_choice = permute_all(char_choices, limit, raw_choice); |
| |
| if (raw_choice->rating() < old_raw_choice_rating) { |
| // Populate unichars_ and unichar_lengths_ of raw_choice. This is |
| // needed for various components that still work with unichars rather |
| // than unichar ids (e.g. AdaptToWord). |
| raw_choice->populate_unichars(getUnicharset()); |
| } |
| if (this_choice && this_choice->rating() < best_choice->rating()) { |
| *best_choice = *this_choice; |
| // Populate unichars_ and unichar_lengths_ of best_choice. This is |
| // needed for various components that still work with unichars rather |
| // than unichar ids (dawg, *_ok functions, various hard-coded hacks). |
| best_choice->populate_unichars(getUnicharset()); |
| |
| if (display_ratings) { |
| cprintf("permute_characters: %s\n", |
| best_choice->debug_string(getUnicharset()).string()); |
| } |
| } |
| delete this_choice; |
| } |
| |
| |
| /********************************************************************** |
| * permute_compound_words |
| * |
| * Return the top choice for each character as the choice for the word. |
| **********************************************************************/ |
| WERD_CHOICE *Dict::permute_compound_words( |
| const BLOB_CHOICE_LIST_VECTOR &char_choices, |
| float rating_limit) { |
| BLOB_CHOICE *first_choice; |
| WERD_CHOICE *best_choice = NULL; |
| WERD_CHOICE current_word(MAX_WERD_LENGTH); |
| int first_index = 0; |
| int x; |
| BLOB_CHOICE_IT blob_choice_it; |
| |
| if (char_choices.length() > MAX_WERD_LENGTH) { |
| WERD_CHOICE *bad_word_choice = new WERD_CHOICE(); |
| bad_word_choice->make_bad(); |
| return bad_word_choice; |
| } |
| |
| UNICHAR_ID slash = getUnicharset().unichar_to_id("/"); |
| UNICHAR_ID dash = getUnicharset().unichar_to_id("-"); |
| for (x = 0; x < char_choices.length(); ++x) { |
| blob_choice_it.set_to_list(char_choices.get(x)); |
| first_choice = blob_choice_it.data(); |
| if (first_choice->unichar_id() == slash || |
| first_choice->unichar_id() == dash) { |
| if (x > first_index) { |
| if (segment_debug) |
| cprintf ("Hyphenated word found\n"); |
| permute_subword(char_choices, rating_limit, first_index, |
| x - 1, ¤t_word); |
| if (current_word.rating() > rating_limit) |
| break; |
| } |
| // Append hyphen/slash separator to current_word. |
| current_word.append_unichar_id_space_allocated( |
| first_choice->unichar_id(), 1, |
| first_choice->rating(), first_choice->certainty()); |
| |
| first_index = x + 1; // update first_index |
| } |
| } |
| |
| if (first_index > 0 && first_index < x && |
| current_word.rating() <= rating_limit) { |
| permute_subword(char_choices, rating_limit, first_index, |
| x - 1, ¤t_word); |
| best_choice = new WERD_CHOICE(current_word); |
| best_choice->set_permuter(COMPOUND_PERM); |
| } |
| return (best_choice); |
| } |
| |
| |
| /********************************************************************** |
| * permute_subword |
| * |
| * Permute a part of a compound word this subword is bounded by hyphens |
| * and the start and end of the word. Call the standard word permute |
| * function on a set of choices covering only part of the original |
| * word. When it is done reclaim the memory that was used in the |
| * excercise. |
| **********************************************************************/ |
| void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, |
| float rating_limit, |
| int start, |
| int end, |
| WERD_CHOICE *current_word) { |
| int x; |
| BLOB_CHOICE_LIST_VECTOR subchoices; |
| WERD_CHOICE *best_choice = NULL; |
| WERD_CHOICE raw_choice; |
| raw_choice.make_bad(); |
| |
| DisableChoiceAccum(); |
| |
| for (x = start; x <= end; x++) { |
| if (char_choices.get(x) != NULL) { |
| subchoices += char_choices.get(x); |
| } |
| } |
| |
| if (!subchoices.empty()) { |
| if (segment_debug) |
| segment_dawg_debug.set_value(true); |
| best_choice = permute_all(subchoices, rating_limit, &raw_choice); |
| |
| if (segment_debug) |
| segment_dawg_debug.set_value(false); |
| |
| if (best_choice && best_choice->length() > 0) { |
| *current_word += *best_choice; |
| } else { |
| current_word->set_rating(MAX_FLOAT32); |
| } |
| } else { |
| current_word->set_rating(MAX_FLOAT32); |
| } |
| |
| if (best_choice) |
| delete best_choice; |
| |
| if (segment_debug && current_word->rating() < MAX_FLOAT32) { |
| cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n", |
| current_word->debug_string(getUnicharset()).string(), |
| current_word->rating(), current_word->certainty()); |
| } |
| |
| EnableChoiceAccum(); |
| } |
| |
| |
| /********************************************************************** |
| * permute_top_choice |
| * |
| * Return the top choice for each character as the choice for the word. |
| * In addition a choice is created for the best lower and upper case |
| * non-words. In each character position the best lower (or upper) case |
| * character is substituted for the best overall character. |
| **********************************************************************/ |
| WERD_CHOICE *Dict::permute_top_choice( |
| const BLOB_CHOICE_LIST_VECTOR &char_choices, |
| float* rating_limit, |
| WERD_CHOICE *raw_choice, |
| BOOL8 *any_alpha) { |
| BLOB_CHOICE *first_choice; |
| const char *first_char; //first choice |
| const char *second_char; //second choice |
| const char *third_char; //third choice |
| char prev_char[UNICHAR_LEN + 1]; //prev in word |
| const char *next_char = ""; //next in word |
| const char *next_next_char = ""; //after next next in word |
| |
| WERD_CHOICE word(MAX_PERM_LENGTH); |
| word.set_permuter(TOP_CHOICE_PERM); |
| WERD_CHOICE capital_word(MAX_PERM_LENGTH); |
| capital_word.set_permuter(UPPER_CASE_PERM); |
| WERD_CHOICE lower_word(MAX_PERM_LENGTH); |
| lower_word.set_permuter(LOWER_CASE_PERM); |
| |
| int x; |
| BOOL8 char_alpha; |
| |
| float first_rating = 0; |
| |
| float new_rating; |
| float adjust_factor; |
| |
| float certainties[MAX_PERM_LENGTH + 1]; |
| float lower_certainties[MAX_PERM_LENGTH + 1]; |
| float upper_certainties[MAX_PERM_LENGTH + 1]; |
| |
| BLOB_CHOICE_IT blob_choice_it; |
| UNICHAR_ID temp_id; |
| UNICHAR_ID unichar_id; |
| UNICHAR_ID space = getUnicharset().unichar_to_id(" "); |
| register const char* ch; |
| register inT8 lower_done; |
| register inT8 upper_done; |
| |
| STRING unichar_str; |
| STRING unichar_lengths; |
| |
| prev_char[0] = '\0'; |
| |
| if (any_alpha != NULL) |
| *any_alpha = FALSE; |
| |
| if (char_choices.length() > MAX_PERM_LENGTH) { |
| return (NULL); |
| } |
| |
| for (x = 0; x < char_choices.length(); ++x) { |
| if (x + 1 < char_choices.length()) { |
| blob_choice_it.set_to_list(char_choices.get(x+1)); |
| unichar_id = blob_choice_it.data()->unichar_id(); |
| next_char = unichar_id != INVALID_UNICHAR_ID ? |
| getUnicharset().id_to_unichar(unichar_id) : ""; |
| } else { |
| next_char = ""; |
| } |
| |
| if (x + 2 < char_choices.length()) { |
| blob_choice_it.set_to_list(char_choices.get(x+2)); |
| unichar_id = blob_choice_it.data()->unichar_id(); |
| next_next_char = unichar_id != INVALID_UNICHAR_ID ? |
| getUnicharset().id_to_unichar(unichar_id) : ""; |
| } else { |
| next_next_char = ""; |
| } |
| |
| blob_choice_it.set_to_list(char_choices.get(x)); |
| first_choice = NULL; |
| for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); |
| blob_choice_it.forward()) { // find the best non-fragment char choice |
| temp_id = blob_choice_it.data()->unichar_id(); |
| if (!(getUnicharset().get_fragment(temp_id))) { |
| first_choice = blob_choice_it.data(); |
| break; |
| } else if (char_choices.length() > 1) { |
| word.set_fragment_mark(true); |
| capital_word.set_fragment_mark(true); |
| lower_word.set_fragment_mark(true); |
| } |
| } |
| if (first_choice == NULL) { |
| cprintf("Permuter found only fragments for" |
| " character at position %d; word=%s\n", |
| x, word.debug_string(getUnicharset()).string()); |
| } |
| ASSERT_HOST(first_choice != NULL); |
| |
| unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ? |
| first_choice->unichar_id() : space; |
| first_char = getUnicharset().id_to_unichar(unichar_id); |
| first_rating = first_choice->rating(); |
| word.append_unichar_id_space_allocated( |
| unichar_id, 1, first_choice->rating(), first_choice->certainty()); |
| capital_word.append_unichar_id_space_allocated( |
| unichar_id, 1, first_choice->rating(), first_choice->certainty()); |
| lower_word.append_unichar_id_space_allocated( |
| unichar_id, 1, first_choice->rating(), first_choice->certainty()); |
| |
| certainties[x] = first_choice->certainty(); |
| lower_certainties[x] = first_choice->certainty(); |
| upper_certainties[x] = first_choice->certainty(); |
| |
| lower_done = FALSE; |
| upper_done = FALSE; |
| char_alpha = FALSE; |
| second_char = ""; |
| third_char = ""; |
| for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) { |
| unichar_id = blob_choice_it.data()->unichar_id(); |
| if (getUnicharset().eq(unichar_id, "l") && !blob_choice_it.at_last() && |
| blob_choice_it.data_relative(1)->rating() == first_rating) { |
| blob_choice_it.forward(); |
| temp_id = blob_choice_it.data()->unichar_id(); |
| if (getUnicharset().eq(temp_id, "1") || |
| getUnicharset().eq(temp_id, "I")) { |
| second_char = getUnicharset().id_to_unichar(temp_id); |
| if (!blob_choice_it.at_last() && |
| blob_choice_it.data_relative(1)->rating() == first_rating) { |
| blob_choice_it.forward(); |
| temp_id = blob_choice_it.data()->unichar_id(); |
| if (getUnicharset().eq(temp_id, "1") || |
| getUnicharset().eq(temp_id, "I")) { |
| third_char = getUnicharset().id_to_unichar(temp_id); |
| blob_choice_it.forward(); |
| } |
| } |
| ch = choose_il1 (first_char, second_char, third_char, |
| prev_char, next_char, next_next_char); |
| unichar_id = (ch != NULL && *ch != '\0') ? |
| getUnicharset().unichar_to_id(ch) : INVALID_UNICHAR_ID; |
| if (strcmp(ch, "l") != 0 && |
| getUnicharset().eq(word.unichar_id(x), "l")) { |
| word.set_unichar_id(unichar_id, x); |
| lower_word.set_unichar_id(unichar_id, x); |
| capital_word.set_unichar_id(unichar_id, x); |
| } |
| } |
| } |
| if (unichar_id != INVALID_UNICHAR_ID) { |
| /* Find lower case */ |
| if (!lower_done && |
| (getUnicharset().get_islower(unichar_id) || |
| (getUnicharset().get_isupper(unichar_id) && x == 0))) { |
| lower_word.set_unichar_id(unichar_id, x); |
| lower_word.set_rating(lower_word.rating() - |
| first_choice->rating() + blob_choice_it.data()->rating()); |
| if (blob_choice_it.data()->certainty() < lower_word.certainty()) { |
| lower_word.set_certainty(blob_choice_it.data()->certainty()); |
| } |
| lower_certainties[x] = blob_choice_it.data()->certainty(); |
| lower_done = TRUE; |
| } |
| /* Find upper case */ |
| if (!upper_done && getUnicharset().get_isupper(unichar_id)) { |
| capital_word.set_unichar_id(unichar_id, x); |
| capital_word.set_rating(capital_word.rating() - |
| first_choice->rating() + blob_choice_it.data()->rating()); |
| if (blob_choice_it.data()->certainty() < capital_word.certainty()) { |
| capital_word.set_certainty(blob_choice_it.data()->certainty()); |
| } |
| upper_certainties[x] = blob_choice_it.data()->certainty(); |
| upper_done = TRUE; |
| } |
| if (!char_alpha) { |
| const CHAR_FRAGMENT *fragment = |
| getUnicharset().get_fragment(unichar_id); |
| temp_id = !fragment ? unichar_id : |
| getUnicharset().unichar_to_id(fragment->get_unichar()); |
| if (getUnicharset().get_isalpha(temp_id)) { |
| char_alpha = TRUE; |
| } |
| } |
| if (lower_done && upper_done) |
| break; |
| } |
| } |
| if (char_alpha && any_alpha != NULL) |
| *any_alpha = TRUE; |
| |
| if (word.rating() > *rating_limit) |
| return (NULL); |
| |
| *prev_char = '\0'; |
| temp_id = word.unichar_id(word.length()-1); |
| if (temp_id != INVALID_UNICHAR_ID) { |
| strcpy(prev_char, getUnicharset().id_to_unichar(temp_id)); |
| } |
| } |
| |
| if (word.rating() < raw_choice->rating()) { |
| *raw_choice = word; |
| LogNewRawChoice(*raw_choice, 1.0, certainties); |
| } |
| |
| if (ngram_permuter_activated) |
| return NULL; |
| |
| float rating = word.rating(); |
| word.string_and_lengths(getUnicharset(), &unichar_str, &unichar_lengths); |
| adjust_non_word(unichar_str.string(), unichar_lengths.string(), |
| word.rating(), &new_rating, &adjust_factor); |
| word.set_rating(new_rating); |
| LogNewWordChoice(word, adjust_factor, certainties); |
| |
| float lower_rating = lower_word.rating(); |
| lower_word.string_and_lengths(getUnicharset(), |
| &unichar_str, &unichar_lengths); |
| adjust_non_word(unichar_str.string(), unichar_lengths.string(), |
| lower_word.rating(), &new_rating, &adjust_factor); |
| lower_word.set_rating(new_rating); |
| LogNewWordChoice(lower_word, adjust_factor, lower_certainties); |
| |
| float upper_rating = capital_word.rating(); |
| capital_word.string_and_lengths(getUnicharset(), |
| &unichar_str, &unichar_lengths); |
| adjust_non_word(unichar_str.string(), unichar_lengths.string(), |
| capital_word.rating(), &new_rating, &adjust_factor); |
| capital_word.set_rating(new_rating); |
| LogNewWordChoice(capital_word, adjust_factor, upper_certainties); |
| |
| |
| WERD_CHOICE *best_choice = &word; |
| *rating_limit = rating; |
| if (lower_word.rating() < best_choice->rating()) { |
| best_choice = &lower_word; |
| *rating_limit = lower_rating; |
| } |
| if (capital_word.rating() < best_choice->rating()) { |
| best_choice = &capital_word; |
| *rating_limit = upper_rating; |
| } |
| return new WERD_CHOICE(*best_choice); |
| } |
| |
| |
| /********************************************************************** |
| * choose_il1 |
| * |
| * Choose between the candidate il1 chars. |
| **********************************************************************/ |
| const char* Dict::choose_il1(const char *first_char, //first choice |
| const char *second_char, //second choice |
| const char *third_char, //third choice |
| const char *prev_char, //prev in word |
| const char *next_char, //next in word |
| const char *next_next_char) { //after next next in word |
| inT32 type1; //1/I/l type of first choice |
| inT32 type2; //1/I/l type of second choice |
| inT32 type3; //1/I/l type of third choice |
| |
| int first_char_length = strlen(first_char); |
| int prev_char_length = strlen(prev_char); |
| int next_char_length = strlen(next_char); |
| int next_next_char_length = strlen(next_next_char); |
| |
| if (*first_char == 'l' && *second_char != '\0') { |
| if (*second_char == 'I' |
| && (((prev_char_length != 0 && |
| getUnicharset().get_isupper (prev_char, prev_char_length)) && |
| (next_char_length == 0 || |
| !getUnicharset().get_islower (next_char, next_char_length)) && |
| (next_char_length == 0 || |
| !getUnicharset().get_isdigit (next_char, next_char_length))) || |
| ((next_char_length != 0 && |
| getUnicharset().get_isupper (next_char, next_char_length)) && |
| (prev_char_length == 0 || |
| !getUnicharset().get_islower (prev_char, prev_char_length)) && |
| (prev_char_length == 0 || |
| !getUnicharset().get_isdigit (prev_char, prev_char_length))))) |
| first_char = second_char; //override |
| else if (*second_char == '1' || *third_char == '1') { |
| if ((next_char_length != 0 && |
| getUnicharset().get_isdigit (next_char, next_char_length)) || |
| (prev_char_length != 0 && |
| getUnicharset().get_isdigit (prev_char, prev_char_length)) |
| || (*next_char == 'l' && |
| (next_next_char_length != 0 && |
| getUnicharset().get_isdigit (next_next_char, |
| next_next_char_length)))) { |
| first_char = "1"; |
| first_char_length = 1; |
| } |
| else if ((prev_char_length == 0 || |
| !getUnicharset().get_islower (prev_char, prev_char_length)) && |
| ((next_char_length == 0 || |
| !getUnicharset().get_islower (next_char, next_char_length)) || |
| (*next_char == 's' && |
| *next_next_char == 't'))) { |
| if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0') |
| && ((*next_char != '\'' && *next_char != '`') |
| || *prev_char != '\0')) { |
| first_char = "1"; |
| first_char_length = 1; |
| } |
| } |
| } |
| if (*first_char == 'l' && *next_char != '\0' && |
| (prev_char_length == 0 || |
| !getUnicharset().get_isalpha (prev_char, prev_char_length))) { |
| type1 = 2; |
| |
| if (*second_char == '1') |
| type2 = 0; |
| else if (*second_char == 'I') |
| type2 = 1; |
| else if (*second_char == 'l') |
| type2 = 2; |
| else |
| type2 = type1; |
| |
| if (*third_char == '1') |
| type3 = 0; |
| else if (*third_char == 'I') |
| type3 = 1; |
| else if (*third_char == 'l') |
| type3 = 2; |
| else |
| type3 = type1; |
| |
| #if 0 |
| if (bigram_counts[*next_char][type2] > |
| bigram_counts[*next_char][type1]) { |
| first_char = second_char; |
| type1 = type2; |
| } |
| if (bigram_counts[*next_char][type3] > |
| bigram_counts[*next_char][type1]) { |
| first_char = third_char; |
| } |
| #endif |
| } |
| } |
| return first_char; |
| } |
| |
| /********************************************************************** |
| * permute_words |
| * |
| * Permute all the characters together using the dawg to prune all |
| * but the valid words. |
| **********************************************************************/ |
| A_CHOICE *Dict::permute_words(CHOICES_LIST char_choices, float rating_limit) { |
| A_CHOICE *best_choice; |
| |
| best_choice = new_choice (NULL, NULL, rating_limit, -MAX_FLOAT32, -1, NO_PERM); |
| |
| if (hyphen_base_size() + array_count (char_choices) > MAX_WERD_LENGTH) { |
| class_rating (best_choice) = MAX_FLOAT32; |
| } |
| else { |
| |
| dawg_permute_and_select("system words:", word_dawg, SYSTEM_DAWG_PERM, |
| char_choices, best_choice); |
| |
| dawg_permute_and_select("document_words", document_words, |
| DOC_DAWG_PERM, char_choices, best_choice); |
| |
| dawg_permute_and_select("user words", user_words, USER_DAWG_PERM, |
| char_choices, best_choice); |
| } |
| |
| return (best_choice); |
| } |
| |
| |
| /********************************************************************** |
| * valid_word |
| * |
| * Check all the DAWGs to see if this word is in any of them. |
| **********************************************************************/ |
| int Dict::valid_word(const char *string) { |
| int result = NO_PERM; |
| |
| if (word_in_dawg (word_dawg, string)) |
| result = SYSTEM_DAWG_PERM; |
| else { |
| if (word_in_dawg (document_words, string)) |
| result = DOC_DAWG_PERM; |
| else if (word_in_dawg (user_words, string)) |
| result = USER_DAWG_PERM; |
| } |
| return (result); |
| } |
| |
| /********************************************************************** |
| * fragment_state |
| * |
| * Given the current char choice and information about previously seen |
| * fragments, determines whether adjacent character fragments are |
| * present and whether they can be concatenated. |
| * |
| * The given prev_char_frag_info contains: |
| * -- fragment: if not NULL contains information about immediately |
| * preceeding fragmented character choice |
| * -- num_fragments: number of fragments that have been used so far |
| * to construct a character |
| * -- certainty: certainty of the current choice or minimum |
| * certainty of all fragments concatenated so far |
| * -- rating: rating of the current choice or sum of fragment |
| * ratings concatenated so far |
| * |
| * The output char_frag_info is filled in as follows: |
| * -- character: is set to be NULL if the choice is a non-matching |
| * or non-ending fragment piece; is set to unichar of the given choice |
| * if it represents a regular character or a matching ending fragment |
| * -- fragment,num_fragments,certainty,rating are set as described above |
| * |
| * Returns false if a non-matching fragment is discovered, true otherwise. |
| **********************************************************************/ |
| bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, |
| float curr_rating, float curr_certainty, |
| const CHAR_FRAGMENT_INFO *prev_char_frag_info, |
| const char *debug, int word_ending, |
| CHAR_FRAGMENT_INFO *char_frag_info) { |
| const CHAR_FRAGMENT *this_fragment = |
| getUnicharset().get_fragment(curr_unichar_id); |
| const CHAR_FRAGMENT *prev_fragment = |
| prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL; |
| |
| // Print debug info for fragments. |
| if (debug && (prev_fragment || this_fragment)) { |
| cprintf("%s check fragments: choice=%s word_ending=%d\n", debug, |
| getUnicharset().debug_str(curr_unichar_id).string(), |
| word_ending); |
| if (prev_fragment) { |
| cprintf("prev_fragment %s\n", prev_fragment->to_string().string()); |
| } |
| if (this_fragment) { |
| cprintf("this_fragment %s\n", this_fragment->to_string().string()); |
| } |
| } |
| |
| char_frag_info->unichar_id = curr_unichar_id; |
| char_frag_info->fragment = this_fragment; |
| char_frag_info->rating = curr_rating; |
| char_frag_info->certainty = curr_certainty; |
| char_frag_info->num_fragments = 1; |
| if (prev_fragment && !this_fragment) { |
| if (debug) cprintf("Skip choice with incomplete fragment\n"); |
| return false; |
| } |
| if (this_fragment) { |
| // We are dealing with a fragment. |
| char_frag_info->unichar_id = INVALID_UNICHAR_ID; |
| if (prev_fragment) { |
| if (!this_fragment->is_continuation_of(prev_fragment)) { |
| if (debug) cprintf("Non-matching fragment piece\n"); |
| return false; |
| } |
| if (this_fragment->is_ending()) { |
| char_frag_info->unichar_id = |
| getUnicharset().unichar_to_id(this_fragment->get_unichar()); |
| char_frag_info->fragment = NULL; |
| if (debug) { |
| cprintf("Built character %s from fragments\n", |
| getUnicharset().debug_str( |
| char_frag_info->unichar_id).string()); |
| } |
| } else { |
| if (debug) cprintf("Record fragment continuation\n"); |
| char_frag_info->fragment = this_fragment; |
| } |
| // Update certainty and rating. |
| char_frag_info->rating = |
| prev_char_frag_info->rating + curr_rating; |
| char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1; |
| char_frag_info->certainty = |
| MIN(curr_certainty, prev_char_frag_info->certainty); |
| } else { |
| if (this_fragment->is_beginning()) { |
| if (debug) cprintf("Record fragment beginning\n"); |
| } else { |
| if (debug) { |
| cprintf("Non-starting fragment piece with no prev_fragment\n"); |
| } |
| return false; |
| } |
| } |
| } |
| if (word_ending && char_frag_info->fragment) { |
| if (debug) cprintf("Word can not end with a fragment\n"); |
| return false; |
| } |
| return true; |
| } |
| /********************************************************************** |
| * top_fragments_permute_and_select |
| * |
| * Creates a copy of character choices list that contain only fragments |
| * and the best non-fragmented character choice. |
| * Permutes character in this shortened list, builds characters from |
| * fragments if possible and returns a better choice if found. |
| * |
| * TODO(daria): part of this code is similar to the code in permdag, |
| * permngram and permnum - need to figure out a way to combine them. |
| **********************************************************************/ |
| WERD_CHOICE *Dict::top_fragments_permute_and_select( |
| const BLOB_CHOICE_LIST_VECTOR &char_choices, |
| float rating_limit) { |
| if (char_choices.length() <= 1 || |
| char_choices.length() > MAX_PERM_LENGTH) { |
| return NULL; |
| } |
| // See it would be possible to benefit from permuting fragments. |
| int x; |
| float min_rating = 0.0; |
| BLOB_CHOICE_IT blob_choice_it; |
| for (x = 0; x < char_choices.length(); ++x) { |
| blob_choice_it.set_to_list(char_choices.get(x)); |
| if (blob_choice_it.data()) { |
| min_rating += blob_choice_it.data()->rating(); |
| } |
| if (min_rating >= rating_limit) { |
| return NULL; |
| } |
| } |
| if (fragments_debug > 1) { |
| tprintf("A choice with fragment beats top choice\n"); |
| tprintf("Running fragment permuter...\n"); |
| } |
| |
| // Construct a modified choices list that contains (for each position): |
| // the best choice, all fragments and at least one choice for |
| // a non-fragmented character. |
| BLOB_CHOICE_LIST_VECTOR frag_char_choices(char_choices.length()); |
| for (x = 0; x < char_choices.length(); ++x) { |
| bool need_nonfrag_char = true; |
| BLOB_CHOICE_LIST *frag_choices = new BLOB_CHOICE_LIST(); |
| BLOB_CHOICE_IT frag_choices_it; |
| frag_choices_it.set_to_list(frag_choices); |
| blob_choice_it.set_to_list(char_choices.get(x)); |
| for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); |
| blob_choice_it.forward()) { |
| if (getUnicharset().get_fragment(blob_choice_it.data()->unichar_id())) { |
| frag_choices_it.add_after_then_move( |
| new BLOB_CHOICE(*(blob_choice_it.data()))); |
| } else if (need_nonfrag_char) { |
| frag_choices_it.add_after_then_move( |
| new BLOB_CHOICE(*(blob_choice_it.data()))); |
| need_nonfrag_char = false; |
| } |
| } |
| frag_char_choices += frag_choices; |
| } |
| |
| WERD_CHOICE *best_choice = new WERD_CHOICE(); |
| best_choice->make_bad(); |
| WERD_CHOICE word(MAX_PERM_LENGTH); |
| word.set_permuter(TOP_CHOICE_PERM); |
| float certainties[MAX_PERM_LENGTH]; |
| top_fragments_permute(frag_char_choices, 0, min_rating, NULL, |
| &word, certainties, &rating_limit, best_choice); |
| |
| frag_char_choices.delete_data_pointers(); |
| return best_choice; |
| } |
| |
| /********************************************************************** |
| * top_fragments_permute |
| * |
| * Try to put together fragments that could have a better rating |
| * than non-fragmented choices. |
| * |
| * TODO(daria): this code is very similar to the code in permdag, |
| * permngram and permnum - need to figure out a way to combine them. |
| **********************************************************************/ |
| void Dict::top_fragments_permute( |
| const BLOB_CHOICE_LIST_VECTOR &char_choices, |
| int char_choice_index, |
| float min_rating, |
| const CHAR_FRAGMENT_INFO *prev_char_frag_info, |
| WERD_CHOICE *word, |
| float certainties[], |
| float *limit, |
| WERD_CHOICE *best_choice) { |
| if (fragments_debug > 1) { |
| tprintf("top_fragments_permute: char_choice_index=%d" |
| " limit=%4.2f rating=%4.f, certainty=%4.f word=%s\n", |
| char_choice_index, *limit, |
| word->rating(), word->certainty(), |
| word->debug_string(getUnicharset()).string()); |
| } |
| if (char_choice_index < char_choices.length()) { |
| BLOB_CHOICE_IT blob_choice_it; |
| blob_choice_it.set_to_list(char_choices.get(char_choice_index)); |
| for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); |
| blob_choice_it.forward()) { |
| append_top_fragments_choices(char_choices, *(blob_choice_it.data()), |
| char_choice_index, min_rating, |
| prev_char_frag_info, word, certainties, |
| limit, best_choice); |
| |
| } |
| } |
| } |
| |
| /********************************************************************** |
| * append_top_fragments_choices |
| * |
| * Check to see whether or not the next choice is worth appending to |
| * the string being generated. If so then keep going deeper into the |
| * word. |
| * |
| * TODO(daria): this code is very similar to the code in permdag, |
| * permngram and permnum - need to figure out a way to combine them. |
| **********************************************************************/ |
| void Dict::append_top_fragments_choices( |
| const BLOB_CHOICE_LIST_VECTOR &char_choices, |
| const BLOB_CHOICE &blob_choice, |
| int char_choice_index, |
| float min_rating, |
| const CHAR_FRAGMENT_INFO *prev_char_frag_info, |
| WERD_CHOICE *word, |
| float certainties[], |
| float *limit, |
| WERD_CHOICE *best_choice) { |
| int word_ending = |
| (char_choice_index == char_choices.length() - 1) ? TRUE : FALSE; |
| |
| /* Deal with fragments */ |
| CHAR_FRAGMENT_INFO char_frag_info; |
| if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), |
| blob_choice.certainty(), prev_char_frag_info, |
| (fragments_debug > 1) ? "fragments_debug" : NULL, |
| word_ending, &char_frag_info)) { |
| return; // blob_choice must be an invalid fragment |
| } |
| // Search the next letter if this character is a fragment. |
| if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) { |
| top_fragments_permute(char_choices, char_choice_index + 1, |
| min_rating, &char_frag_info, |
| word, certainties, limit, best_choice); |
| return; |
| } |
| |
| /* Add new character */ |
| word->append_unichar_id_space_allocated( |
| char_frag_info.unichar_id, char_frag_info.num_fragments, |
| char_frag_info.rating, char_frag_info.certainty); |
| certainties[word->length()-1] = char_frag_info.certainty; |
| |
| if (word->rating() < *limit) { |
| if (word_ending) { |
| if (fragments_debug > 1) { |
| tprintf("new choice = %s\n", |
| word->debug_string(getUnicharset()).string()); |
| } |
| *limit = word->rating(); |
| |
| // TODO(daria): modify this when adjust_non_word |
| // is updated to use unichar ids. |
| STRING word_str; |
| STRING word_lengths_str; |
| word->string_and_lengths(getUnicharset(), &word_str, &word_lengths_str); |
| float new_rating; |
| float adjust_factor; |
| adjust_non_word(word_str.string(), word_lengths_str.string(), |
| word->rating(), &new_rating, &adjust_factor); |
| word->set_rating(new_rating); |
| LogNewWordChoice(*word, adjust_factor, certainties); |
| |
| if (word->rating() < best_choice->rating()) { |
| *best_choice = *word; |
| } |
| } else { |
| top_fragments_permute(char_choices, char_choice_index + 1, |
| min_rating, &char_frag_info, word, |
| certainties, limit, best_choice); |
| } |
| } else { |
| if (fragments_debug > 1) { |
| tprintf("pruned word (%s, rating=%4.2f, limit=%4.2f)\n", |
| word->debug_string(getUnicharset()).string(), |
| word->rating(), *limit); |
| } |
| } |
| } |
| } // namespace tesseract |