blob: 7079e97196f5d4babef9c63470f1c03feddf8893 [file] [log] [blame]
/* -*-C-*-
********************************************************************************
*
* File: permute.c (Formerly permute.c)
* Description: Choose OCR text given character-probability maps
* for sequences of glyph fragments and a dictionary provided as
* a Dual Acyclic Word Graph.
* In this file, "permute" should be read "combine."
* Author: Mark Seaman, OCR Technology
* Created: Fri Sep 22 14:05:51 1989
* Modified: Thu Jan 3 16:38:46 1991 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Experimental (Do Not Distribute)
*
* (c) Copyright 1989, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
*********************************************************************************/
/*----------------------------------------------------------------------
I n c l u d e s
---------------------------------------------------------------------*/
#include <assert.h>
#include <math.h>
#ifndef FST_DISABLED
#include "fstmodel.h"
#endif
#include "const.h"
#include "permute.h"
#include "callcpp.h"
#include "choices.h"
#include "context.h"
#include "conversion.h"
#include "debug.h"
#include "freelist.h"
#include "globals.h"
#include "hyphen.h"
#include "ndminx.h"
#include "permdawg.h"
#include "permngram.h"
#include "permnum.h"
#include "ratngs.h"
#include "stopper.h"
#include "tordvars.h"
#include "tprintf.h"
#include "trie.h"
#include "varable.h"
#include "unicharset.h"
#include "dict.h"
#include "image.h"
#include "ccutil.h"
int permutation_count; // Used in metrics.cpp.
/*----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
// TODO(tkielbus) Choose a value for the MAX_NUM_EDGES constant
// (or make it dynamic)
#define MAX_NUM_EDGES 2000000
#ifdef DISABLE_DOC_DICT
#define MAX_DOC_EDGES 150
#define RESERVED_DOC_EDGES 10
#else
#define MAX_DOC_EDGES 250000
#define RESERVED_DOC_EDGES 10000
#endif
#define MAX_USER_EDGES 50000
#define USER_RESERVED_EDGES 2000
/* Weights for adjustment */
#define NON_WERD 1.25
#define GARBAGE_STRING 1.5
#define MAX_PERM_LENGTH 128
EDGE_ARRAY pending_words;
EDGE_ARRAY document_words;
EDGE_ARRAY user_words;
EDGE_ARRAY word_dawg;
INT_VAR(fragments_debug, 0, "Debug character fragments");
BOOL_VAR(segment_debug, 0, "Debug the whole segmentation process");
BOOL_VAR(segment_adjust_debug, 0, "Segmentation adjustment debug");
double_VAR(segment_penalty_dict_nonword, NON_WERD,
"Score multiplier for glyph fragment segmentations which do not "
"match a dictionary word (lower is better).");
double_VAR(segment_penalty_garbage, GARBAGE_STRING,
"Score multiplier for poorly cased strings that are not in the "
"dictionary and generally look like garbage (lower is better).");
BOOL_VAR(save_doc_words, 0, "Save Document Words");
#ifdef DISABLE_DOC_DICT
BOOL_VAR(doc_dict_enable, 0, "Enable Document Dictionary ");
#else
BOOL_VAR(doc_dict_enable, 1, "Enable Document Dictionary ");
/* PREV DEFAULT 0 */
#endif
BOOL_VAR(ngram_permuter_activated, FALSE,
"Activate character-level n-gram-based permuter");
#ifndef FST_DISABLED
BOOL_VAR(fst_activated, FALSE, "Activate fst");
#endif
int permute_only_top = 0;
#if 0
//0x0=.
static inT32 bigram_counts[256][3] = { {
0, 0, 0
},
{ //0x1=.
0, 0, 0
},
{ //0x2=.
0, 0, 0
},
{ //0x3=.
0, 0, 0
},
{ //0x4=.
0, 0, 0
},
{ //0x5=.
0, 0, 0
},
{ //0x6=.
0, 0, 0
},
{ //0x7=.
0, 0, 0
},
{ //0x8=.
0, 0, 0
},
{ //0x9=.
0, 0, 0
},
{ //0xa=.
93, 28, 0
},
{ //0xb=.
0, 0, 0
},
{ //0xc=.
0, 0, 0
},
{ //0xd=.
0, 0, 0
},
{ //0xe=.
0, 0, 0
},
{ //0xf=.
0, 0, 0
},
{ //0x10=.
0, 0, 0
},
{ //0x11=.
0, 0, 0
},
{ //0x12=.
0, 0, 0
},
{ //0x13=.
0, 0, 0
},
{ //0x14=.
0, 0, 0
},
{ //0x15=.
0, 0, 0
},
{ //0x16=.
0, 0, 0
},
{ //0x17=.
0, 0, 0
},
{ //0x18=.
0, 0, 0
},
{ //0x19=.
0, 0, 0
},
{ //0x1a=.
0, 0, 0
},
{ //0x1b=.
0, 0, 0
},
{ //0x1c=.
0, 0, 0
},
{ //0x1d=.
0, 0, 0
},
{ //0x1e=.
0, 0, 0
},
{ //0x1f=.
0, 0, 0
},
{ //0x20=
324, 377, 2
},
{ //0x21=!
2, 1, 0
},
{ //0x22="
2, 1, 0
},
{ //0x23=#
1, 0, 1
},
{ //0x24=$
2, 1, 0
},
{ //0x25=%
2, 0, 0
},
{ //0x26=&
2, 1, 0
},
{ //0x27='
1, 21, 8
},
{ //0x28=(
2, 1, 0
},
{ //0x29=)
19, 0, 0
},
{ //0x2a=*
2, 1, 0
},
{ //0x2b=+
1, 0, 0
},
{ //0x2c=,
75, 4, 0
},
{ //0x2d=-
52, 7, 0
},
{ //0x2e=.
190, 16, 3
},
{ //0x2f=/
53, 2, 0
},
{ //0x30=0
399, 0, 0
},
{ //0x31=1
220, 0, 0
},
{ //0x32=2
226, 0, 0
},
{ //0x33=3
128, 0, 0
},
{ //0x34=4
147, 0, 0
},
{ //0x35=5
179, 0, 1
},
{ //0x36=6
173, 0, 0
},
{ //0x37=7
115, 0, 0
},
{ //0x38=8
107, 0, 0
},
{ //0x39=9
934, 0, 1
},
{ //0x3a=:
27, 0, 1
},
{ //0x3b=;
2, 1, 0
},
{ //0x3c=<
2, 1, 0
},
{ //0x3d==
2, 1, 0
},
{ //0x3e=>
2, 1, 0
},
{ //0x3f=?
2, 1, 0
},
{ //0x40=@
2, 1, 0
},
{ //0x41=A
3, 1, 0
},
{ //0x42=B
1, 73, 0
},
{ //0x43=C
1, 6, 0
},
{ //0x44=D
1, 24, 0
},
{ //0x45=E
1, 2, 0
},
{ //0x46=F
1, 19, 0
},
{ //0x47=G
1, 2, 0
},
{ //0x48=H
3, 2, 1
},
{ //0x49=I
0, 68, 0
},
{ //0x4a=J
1, 2, 0
},
{ //0x4b=K
1, 2, 0
},
{ //0x4c=L
1, 82, 0
},
{ //0x4d=M
10, 10, 0
},
{ //0x4e=N
3, 239, 0
},
{ //0x4f=O
1, 10, 0
},
{ //0x50=P
0, 1, 3
},
{ //0x51=Q
2, 3, 0
},
{ //0x52=R
1, 43, 0
},
{ //0x53=S
1, 53, 0
},
{ //0x54=T
2, 18, 0
},
{ //0x55=U
1, 2, 0
},
{ //0x56=V
1, 17, 0
},
{ //0x57=W
1, 5, 0
},
{ //0x58=X
1, 6, 0
},
{ //0x59=Y
1, 2, 0
},
{ //0x5a=Z
1, 2, 0
},
{ //0x5b=[
2, 1, 0
},
{ //0x5c=backslash
2, 1, 0
},
{ //0x5d=]
2, 1, 0
},
{ //0x5e=^
2, 1, 0
},
{ //0x5f=_
2, 1, 0
},
{ //0x60=`
1, 0, 2
},
{ //0x61=a
0, 0, 671
},
{ //0x62=b
0, 1, 16
},
{ //0x63=c
0, 2, 1
},
{ //0x64=d
0, 14, 0
},
{ //0x65=e
0, 0, 763
},
{ //0x66=f
0, 186, 0
},
{ //0x67=g
0, 2, 1
},
{ //0x68=h
0, 2, 1
},
{ //0x69=i
0, 0, 818
},
{ //0x6a=j
0, 2, 1
},
{ //0x6b=k
0, 4, 1
},
{ //0x6c=l
0, 26, 3
},
{ //0x6d=m
0, 69, 0
},
{ //0x6e=n
0, 885, 0
},
{ //0x6f=o
0, 17, 722
},
{ //0x70=p
0, 1, 5
},
{ //0x71=q
2, 1, 0
},
{ //0x72=r
0, 21, 0
},
{ //0x73=s
3, 49, 0
},
{ //0x74=t
0, 219, 5
},
{ //0x75=u
0, 0, 56
},
{ //0x76=v
0, 4, 0
},
{ //0x77=w
0, 2, 1
},
{ //0x78=x
0, 2, 1
},
{ //0x79=y
0, 1, 23
},
{ //0x7a=z
0, 2, 1
},
{ //0x7b={
2, 1, 0
},
{ //0x7c=|
59, 0, 3
},
{ //0x7d=}
2, 1, 0
},
{ //0x7e=~
2, 1, 0
},
{ //0x7f=.
0, 0, 0
},
{ //0x80=.
0, 0, 0
},
{ //0x81=.
0, 0, 0
},
{ //0x82=.
0, 0, 0
},
{ //0x83=.
0, 0, 0
},
{ //0x84=.
0, 0, 0
},
{ //0x85=.
0, 0, 0
},
{ //0x86=.
0, 0, 0
},
{ //0x87=.
0, 0, 0
},
{ //0x88=.
0, 0, 0
},
{ //0x89=.
0, 0, 0
},
{ //0x8a=.
0, 0, 0
},
{ //0x8b=.
0, 0, 0
},
{ //0x8c=.
0, 0, 0
},
{ //0x8d=.
0, 0, 0
},
{ //0x8e=.
0, 0, 0
},
{ //0x8f=.
0, 0, 0
},
{ //0x90=.
0, 0, 0
},
{ //0x91=.
0, 0, 0
},
{ //0x92=.
0, 0, 0
},
{ //0x93=.
0, 0, 0
},
{ //0x94=.
0, 0, 0
},
{ //0x95=.
0, 0, 0
},
{ //0x96=.
0, 0, 0
},
{ //0x97=.
0, 0, 0
},
{ //0x98=.
0, 0, 0
},
{ //0x99=.
0, 0, 0
},
{ //0x9a=.
0, 0, 0
},
{ //0x9b=.
0, 0, 0
},
{ //0x9c=.
0, 0, 0
},
{ //0x9d=.
0, 0, 0
},
{ //0x9e=.
0, 0, 0
},
{ //0x9f=.
0, 0, 0
},
{ //0xa0=.
0, 0, 0
},
{ //0xa1=.
0, 0, 0
},
{ //0xa2=.
0, 0, 0
},
{ //0xa3=.
0, 0, 0
},
{ //0xa4=.
0, 0, 0
},
{ //0xa5=.
0, 0, 0
},
{ //0xa6=.
0, 0, 0
},
{ //0xa7=.
0, 0, 0
},
{ //0xa8=.
0, 0, 0
},
{ //0xa9=.
0, 0, 0
},
{ //0xaa=.
0, 0, 0
},
{ //0xab=.
0, 0, 0
},
{ //0xac=.
0, 0, 0
},
{ //0xad=.
0, 0, 0
},
{ //0xae=.
0, 0, 0
},
{ //0xaf=.
0, 0, 0
},
{ //0xb0=.
0, 0, 0
},
{ //0xb1=.
0, 0, 0
},
{ //0xb2=.
0, 0, 0
},
{ //0xb3=.
0, 0, 0
},
{ //0xb4=.
0, 0, 0
},
{ //0xb5=.
0, 0, 0
},
{ //0xb6=.
0, 0, 0
},
{ //0xb7=.
0, 0, 0
},
{ //0xb8=.
0, 0, 0
},
{ //0xb9=.
0, 0, 0
},
{ //0xba=.
0, 0, 0
},
{ //0xbb=.
0, 0, 0
},
{ //0xbc=.
0, 0, 0
},
{ //0xbd=.
0, 0, 0
},
{ //0xbe=.
0, 0, 0
},
{ //0xbf=.
0, 0, 0
},
{ //0xc0=.
0, 0, 0
},
{ //0xc1=.
0, 0, 0
},
{ //0xc2=.
0, 0, 0
},
{ //0xc3=.
0, 0, 0
},
{ //0xc4=.
0, 0, 0
},
{ //0xc5=.
0, 0, 0
},
{ //0xc6=.
0, 0, 0
},
{ //0xc7=.
0, 0, 0
},
{ //0xc8=.
0, 0, 0
},
{ //0xc9=.
0, 0, 0
},
{ //0xca=.
0, 0, 0
},
{ //0xcb=.
0, 0, 0
},
{ //0xcc=.
0, 0, 0
},
{ //0xcd=.
0, 0, 0
},
{ //0xce=.
0, 0, 0
},
{ //0xcf=.
0, 0, 0
},
{ //0xd0=.
0, 0, 0
},
{ //0xd1=.
0, 0, 0
},
{ //0xd2=.
0, 0, 0
},
{ //0xd3=.
0, 0, 0
},
{ //0xd4=.
0, 0, 0
},
{ //0xd5=.
0, 0, 0
},
{ //0xd6=.
0, 0, 0
},
{ //0xd7=.
0, 0, 0
},
{ //0xd8=.
0, 0, 0
},
{ //0xd9=.
0, 0, 0
},
{ //0xda=.
0, 0, 0
},
{ //0xdb=.
0, 0, 0
},
{ //0xdc=.
0, 0, 0
},
{ //0xdd=.
0, 0, 0
},
{ //0xde=.
0, 0, 0
},
{ //0xdf=.
0, 0, 0
},
{ //0xe0=.
0, 0, 0
},
{ //0xe1=.
0, 0, 0
},
{ //0xe2=.
0, 0, 0
},
{ //0xe3=.
0, 0, 0
},
{ //0xe4=.
0, 0, 0
},
{ //0xe5=.
0, 0, 0
},
{ //0xe6=.
0, 0, 0
},
{ //0xe7=.
0, 0, 0
},
{ //0xe8=.
0, 0, 0
},
{ //0xe9=.
0, 0, 0
},
{ //0xea=.
0, 0, 0
},
{ //0xeb=.
0, 0, 0
},
{ //0xec=.
0, 0, 0
},
{ //0xed=.
0, 0, 0
},
{ //0xee=.
0, 0, 0
},
{ //0xef=.
0, 0, 0
},
{ //0xf0=.
0, 0, 0
},
{ //0xf1=.
0, 0, 0
},
{ //0xf2=.
0, 0, 0
},
{ //0xf3=.
0, 0, 0
},
{ //0xf4=.
0, 0, 0
},
{ //0xf5=.
0, 0, 0
},
{ //0xf6=.
0, 0, 0
},
{ //0xf7=.
0, 0, 0
},
{ //0xf8=.
0, 0, 0
},
{ //0xf9=.
0, 0, 0
},
{ //0xfa=.
0, 0, 0
},
{ //0xfb=.
0, 0, 0
},
{ //0xfc=.
0, 0, 0
},
{ //0xfd=.
0, 0, 0
},
{ //0xfe=.
0, 0, 0
},
{ //0xff=.
0, 0, 0
},
};
#endif
//extern "C" double permuter_pending_threshold;
/* Similarity matcher values */
#define SIM_CERTAINTY_SCALE -10.0
/* Similarity matcher values */
#define SIM_CERTAINTY_OFFSET -10.0
/* Worst E*L product to stop on */
#define SIMILARITY_FLOOR 100.0
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
/**********************************************************************
* get_best_delete_other
*
* Returns the best of two choices and deletes the other (worse) choice.
* A choice is better if it has a non-empty string and has a lower
* rating than the other choice. If the ratings are the same,
* choice2 is preferred over choice1.
**********************************************************************/
WERD_CHOICE *get_best_delete_other(WERD_CHOICE *choice1,
WERD_CHOICE *choice2) {
if (!choice1) return choice2;
if (!choice2) return choice1;
if (choice1->rating() < choice2->rating() || choice2->length() == 0) {
delete choice2;
return choice1;
} else {
delete choice1;
return choice2;
}
}
/**********************************************************************
* good_choice
*
* Return TRUE if a good answer is found for the unknown blob rating.
**********************************************************************/
int good_choice(const WERD_CHOICE &choice) {
register float certainty;
if (similarity_enable) {
if ((choice.rating() + 1) * choice.certainty() > SIMILARITY_FLOOR)
return false;
certainty =
SIM_CERTAINTY_OFFSET + choice.rating() * SIM_CERTAINTY_SCALE;
} else {
certainty = choice.certainty();
}
return (certainty > certainty_threshold) ? true : false;
}
/**********************************************************************
* add_document_word
*
* Add a word found on this document to the document specific
* dictionary.
**********************************************************************/
namespace tesseract {
void Dict::add_document_word(const WERD_CHOICE &best_choice) {
char filename[CHARS_PER_LINE];
FILE *doc_word_file;
const char *string = best_choice.unichar_string().string();
int stringlen = best_choice.length();
if (!doc_dict_enable
|| valid_word (string) || CurrentWordAmbig () || stringlen < 2)
return;
if (!good_choice(best_choice) || stringlen == 2) {
if (best_choice.certainty() < permuter_pending_threshold)
return;
if (!word_in_dawg(pending_words, string)) {
if (stringlen > 2 ||
(stringlen == 2 &&
getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
add_word_to_dawg(pending_words, string,
MAX_DOC_EDGES, RESERVED_DOC_EDGES);
}
return;
}
}
if (save_doc_words) {
strcpy(filename, getImage()->getCCUtil()->imagefile.string());
strcat (filename, ".doc");
doc_word_file = open_file (filename, "a");
fprintf (doc_word_file, "%s\n", string);
fclose(doc_word_file);
}
add_word_to_dawg(document_words, string, MAX_DOC_EDGES, RESERVED_DOC_EDGES);
}
/**********************************************************************
* adjust_non_word
*
* Assign an adjusted value to a string that is a non-word. The value
* that this word choice has is based on case and punctuation rules.
**********************************************************************/
void Dict::adjust_non_word(const char *word, const char *word_lengths,
float rating, float *new_rating,
float *adjust_factor) {
if (segment_adjust_debug)
cprintf("Non-word: %s %4.2f ", word, rating);
*new_rating = rating + RATING_PAD;
if (case_ok(word, word_lengths) &&
punctuation_ok(word, word_lengths) != -1) {
*new_rating *= segment_penalty_dict_nonword;
*adjust_factor = segment_penalty_dict_nonword;
if (segment_adjust_debug)
cprintf(", %4.2f ", (double)segment_penalty_dict_nonword);
} else {
*new_rating *= segment_penalty_garbage;
*adjust_factor = segment_penalty_garbage;
if (segment_adjust_debug) {
if (!case_ok(word, word_lengths))
cprintf (", C");
if (punctuation_ok(word, word_lengths) == -1)
cprintf (", P");
cprintf (", %4.2f ", (double)segment_penalty_garbage);
}
}
*new_rating -= RATING_PAD;
if (segment_adjust_debug)
cprintf (" --> %4.2f\n", *new_rating);
}
/**********************************************************************
* init_permute
*
* Initialize anything that needs to be set up for the permute
* functions.
**********************************************************************/
void Dict::init_permute() {
if (word_dawg != NULL)
end_permute();
init_permdawg();
STRING name;
name = getImage()->getCCUtil()->language_data_path_prefix;
name += "word-dawg";
word_dawg = read_squished_dawg(name.string());
document_words =
(EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES);
initialize_dawg(document_words, MAX_DOC_EDGES);
pending_words =
(EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES);
initialize_dawg(pending_words, MAX_DOC_EDGES);
user_words = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_USER_EDGES);
name = getImage()->getCCUtil()->language_data_path_prefix;
name += "user-words";
read_word_list(name.string(), user_words, MAX_USER_EDGES, USER_RESERVED_EDGES);
#ifndef FST_DISABLED
if (fst_activated) {
STRING name;
name = getImage()->getCCUtil()->language_data_path_prefix;
name += "fst";
LanguageModel::Instance()->InitWithLanguage(name.string());
letter_is_okay_ = &tesseract::Dict::fst_letter_is_okay;
}
#endif
}
void Dict::end_permute() {
if (word_dawg == NULL)
return; // Not safe to call twice.
memfree(word_dawg);
word_dawg = NULL;
memfree(document_words);
document_words = NULL;
memfree(pending_words);
pending_words = NULL;
memfree(user_words);
user_words = NULL;
end_permdawg();
}
/**********************************************************************
* permute_all
*
* Permute all the characters together using all of the different types
* of permuters/selectors available. Each of the characters must have
* a non-NULL choice list.
*
* Note: order of applying permuters does matter, since the latter
* permuter will be recorded if the resulting word ratings are the same.
**********************************************************************/
WERD_CHOICE *Dict::permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
float rating_limit,
WERD_CHOICE *raw_choice) {
A_CHOICE *a_choice;
WERD_CHOICE *result1;
WERD_CHOICE *result2 = NULL;
BOOL8 any_alpha;
int free_index;
float top_choice_rating_limit = rating_limit;
CHOICES_LIST old_char_choices = NULL;
// Initialize result1 from the result of permute_top_choice.
result1 = permute_top_choice(char_choices, &top_choice_rating_limit,
raw_choice, &any_alpha);
// Permute character fragments if necessary.
if (result1 == NULL || result1->fragment_mark()) {
result2 = top_fragments_permute_and_select(char_choices,
top_choice_rating_limit);
result1 = get_best_delete_other(result1, result2);
}
if (ngram_permuter_activated) {
old_char_choices = convert_to_choices_list(char_choices, getUnicharset());
A_CHOICE *ngram_choice =
ngram_permute_and_select(old_char_choices, rating_limit, word_dawg);
free_all_choices(old_char_choices, free_index);
if (ngram_choice == NULL) return NULL;
result1 = new WERD_CHOICE();
convert_to_word_choice(ngram_choice, getUnicharset(), result1);
return result1;
}
if (result1 == NULL)
return (NULL);
if (permute_only_top)
return result1;
old_char_choices = convert_to_choices_list(char_choices, getUnicharset());
if (display_ratings > 1) {
int i;
cprintf("\nold_char_choices in permute_characters: ");
for_each_choice(old_char_choices, i) {
print_choices("", (CHOICES) array_index (old_char_choices, i));
cprintf("\n");
}
}
if (any_alpha && char_choices.length() <= MAX_WERD_LENGTH) {
a_choice = permute_words(old_char_choices, rating_limit);
result1 = get_best_delete_other(getUnicharset(), result1, a_choice);
}
a_choice = number_permute_and_select(old_char_choices, rating_limit);
result1 = get_best_delete_other(getUnicharset(), result1, a_choice);
result2 = permute_compound_words(char_choices, rating_limit);
result1 = get_best_delete_other(result1, result2);
free_all_choices(old_char_choices, free_index);
return (result1);
}
/**********************************************************************
* permute_characters
*
* Permute these characters together according to each of the different
* permuters that are enabled.
**********************************************************************/
void Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
float limit,
WERD_CHOICE *best_choice,
WERD_CHOICE *raw_choice) {
float old_raw_choice_rating = raw_choice->rating();
if (display_ratings > 1) {
cprintf("\nchar_choices in permute_characters:\n");
for (int i = 0; i < char_choices.length(); ++i) {
print_ratings_list("", char_choices.get(i), getUnicharset());
cprintf("\n");
}
}
permutation_count++; /* Global counter */
WERD_CHOICE *this_choice = permute_all(char_choices, limit, raw_choice);
if (raw_choice->rating() < old_raw_choice_rating) {
// Populate unichars_ and unichar_lengths_ of raw_choice. This is
// needed for various components that still work with unichars rather
// than unichar ids (e.g. AdaptToWord).
raw_choice->populate_unichars(getUnicharset());
}
if (this_choice && this_choice->rating() < best_choice->rating()) {
*best_choice = *this_choice;
// Populate unichars_ and unichar_lengths_ of best_choice. This is
// needed for various components that still work with unichars rather
// than unichar ids (dawg, *_ok functions, various hard-coded hacks).
best_choice->populate_unichars(getUnicharset());
if (display_ratings) {
cprintf("permute_characters: %s\n",
best_choice->debug_string(getUnicharset()).string());
}
}
delete this_choice;
}
/**********************************************************************
* permute_compound_words
*
* Return the top choice for each character as the choice for the word.
**********************************************************************/
WERD_CHOICE *Dict::permute_compound_words(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
float rating_limit) {
BLOB_CHOICE *first_choice;
WERD_CHOICE *best_choice = NULL;
WERD_CHOICE current_word(MAX_WERD_LENGTH);
int first_index = 0;
int x;
BLOB_CHOICE_IT blob_choice_it;
if (char_choices.length() > MAX_WERD_LENGTH) {
WERD_CHOICE *bad_word_choice = new WERD_CHOICE();
bad_word_choice->make_bad();
return bad_word_choice;
}
UNICHAR_ID slash = getUnicharset().unichar_to_id("/");
UNICHAR_ID dash = getUnicharset().unichar_to_id("-");
for (x = 0; x < char_choices.length(); ++x) {
blob_choice_it.set_to_list(char_choices.get(x));
first_choice = blob_choice_it.data();
if (first_choice->unichar_id() == slash ||
first_choice->unichar_id() == dash) {
if (x > first_index) {
if (segment_debug)
cprintf ("Hyphenated word found\n");
permute_subword(char_choices, rating_limit, first_index,
x - 1, &current_word);
if (current_word.rating() > rating_limit)
break;
}
// Append hyphen/slash separator to current_word.
current_word.append_unichar_id_space_allocated(
first_choice->unichar_id(), 1,
first_choice->rating(), first_choice->certainty());
first_index = x + 1; // update first_index
}
}
if (first_index > 0 && first_index < x &&
current_word.rating() <= rating_limit) {
permute_subword(char_choices, rating_limit, first_index,
x - 1, &current_word);
best_choice = new WERD_CHOICE(current_word);
best_choice->set_permuter(COMPOUND_PERM);
}
return (best_choice);
}
/**********************************************************************
* permute_subword
*
* Permute a part of a compound word this subword is bounded by hyphens
* and the start and end of the word. Call the standard word permute
* function on a set of choices covering only part of the original
* word. When it is done reclaim the memory that was used in the
* excercise.
**********************************************************************/
void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
float rating_limit,
int start,
int end,
WERD_CHOICE *current_word) {
int x;
BLOB_CHOICE_LIST_VECTOR subchoices;
WERD_CHOICE *best_choice = NULL;
WERD_CHOICE raw_choice;
raw_choice.make_bad();
DisableChoiceAccum();
for (x = start; x <= end; x++) {
if (char_choices.get(x) != NULL) {
subchoices += char_choices.get(x);
}
}
if (!subchoices.empty()) {
if (segment_debug)
segment_dawg_debug.set_value(true);
best_choice = permute_all(subchoices, rating_limit, &raw_choice);
if (segment_debug)
segment_dawg_debug.set_value(false);
if (best_choice && best_choice->length() > 0) {
*current_word += *best_choice;
} else {
current_word->set_rating(MAX_FLOAT32);
}
} else {
current_word->set_rating(MAX_FLOAT32);
}
if (best_choice)
delete best_choice;
if (segment_debug && current_word->rating() < MAX_FLOAT32) {
cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
current_word->debug_string(getUnicharset()).string(),
current_word->rating(), current_word->certainty());
}
EnableChoiceAccum();
}
/**********************************************************************
* permute_top_choice
*
* Return the top choice for each character as the choice for the word.
* In addition a choice is created for the best lower and upper case
* non-words. In each character position the best lower (or upper) case
* character is substituted for the best overall character.
**********************************************************************/
WERD_CHOICE *Dict::permute_top_choice(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
float* rating_limit,
WERD_CHOICE *raw_choice,
BOOL8 *any_alpha) {
BLOB_CHOICE *first_choice;
const char *first_char; //first choice
const char *second_char; //second choice
const char *third_char; //third choice
char prev_char[UNICHAR_LEN + 1]; //prev in word
const char *next_char = ""; //next in word
const char *next_next_char = ""; //after next next in word
WERD_CHOICE word(MAX_PERM_LENGTH);
word.set_permuter(TOP_CHOICE_PERM);
WERD_CHOICE capital_word(MAX_PERM_LENGTH);
capital_word.set_permuter(UPPER_CASE_PERM);
WERD_CHOICE lower_word(MAX_PERM_LENGTH);
lower_word.set_permuter(LOWER_CASE_PERM);
int x;
BOOL8 char_alpha;
float first_rating = 0;
float new_rating;
float adjust_factor;
float certainties[MAX_PERM_LENGTH + 1];
float lower_certainties[MAX_PERM_LENGTH + 1];
float upper_certainties[MAX_PERM_LENGTH + 1];
BLOB_CHOICE_IT blob_choice_it;
UNICHAR_ID temp_id;
UNICHAR_ID unichar_id;
UNICHAR_ID space = getUnicharset().unichar_to_id(" ");
register const char* ch;
register inT8 lower_done;
register inT8 upper_done;
STRING unichar_str;
STRING unichar_lengths;
prev_char[0] = '\0';
if (any_alpha != NULL)
*any_alpha = FALSE;
if (char_choices.length() > MAX_PERM_LENGTH) {
return (NULL);
}
for (x = 0; x < char_choices.length(); ++x) {
if (x + 1 < char_choices.length()) {
blob_choice_it.set_to_list(char_choices.get(x+1));
unichar_id = blob_choice_it.data()->unichar_id();
next_char = unichar_id != INVALID_UNICHAR_ID ?
getUnicharset().id_to_unichar(unichar_id) : "";
} else {
next_char = "";
}
if (x + 2 < char_choices.length()) {
blob_choice_it.set_to_list(char_choices.get(x+2));
unichar_id = blob_choice_it.data()->unichar_id();
next_next_char = unichar_id != INVALID_UNICHAR_ID ?
getUnicharset().id_to_unichar(unichar_id) : "";
} else {
next_next_char = "";
}
blob_choice_it.set_to_list(char_choices.get(x));
first_choice = NULL;
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
blob_choice_it.forward()) { // find the best non-fragment char choice
temp_id = blob_choice_it.data()->unichar_id();
if (!(getUnicharset().get_fragment(temp_id))) {
first_choice = blob_choice_it.data();
break;
} else if (char_choices.length() > 1) {
word.set_fragment_mark(true);
capital_word.set_fragment_mark(true);
lower_word.set_fragment_mark(true);
}
}
if (first_choice == NULL) {
cprintf("Permuter found only fragments for"
" character at position %d; word=%s\n",
x, word.debug_string(getUnicharset()).string());
}
ASSERT_HOST(first_choice != NULL);
unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ?
first_choice->unichar_id() : space;
first_char = getUnicharset().id_to_unichar(unichar_id);
first_rating = first_choice->rating();
word.append_unichar_id_space_allocated(
unichar_id, 1, first_choice->rating(), first_choice->certainty());
capital_word.append_unichar_id_space_allocated(
unichar_id, 1, first_choice->rating(), first_choice->certainty());
lower_word.append_unichar_id_space_allocated(
unichar_id, 1, first_choice->rating(), first_choice->certainty());
certainties[x] = first_choice->certainty();
lower_certainties[x] = first_choice->certainty();
upper_certainties[x] = first_choice->certainty();
lower_done = FALSE;
upper_done = FALSE;
char_alpha = FALSE;
second_char = "";
third_char = "";
for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
unichar_id = blob_choice_it.data()->unichar_id();
if (getUnicharset().eq(unichar_id, "l") && !blob_choice_it.at_last() &&
blob_choice_it.data_relative(1)->rating() == first_rating) {
blob_choice_it.forward();
temp_id = blob_choice_it.data()->unichar_id();
if (getUnicharset().eq(temp_id, "1") ||
getUnicharset().eq(temp_id, "I")) {
second_char = getUnicharset().id_to_unichar(temp_id);
if (!blob_choice_it.at_last() &&
blob_choice_it.data_relative(1)->rating() == first_rating) {
blob_choice_it.forward();
temp_id = blob_choice_it.data()->unichar_id();
if (getUnicharset().eq(temp_id, "1") ||
getUnicharset().eq(temp_id, "I")) {
third_char = getUnicharset().id_to_unichar(temp_id);
blob_choice_it.forward();
}
}
ch = choose_il1 (first_char, second_char, third_char,
prev_char, next_char, next_next_char);
unichar_id = (ch != NULL && *ch != '\0') ?
getUnicharset().unichar_to_id(ch) : INVALID_UNICHAR_ID;
if (strcmp(ch, "l") != 0 &&
getUnicharset().eq(word.unichar_id(x), "l")) {
word.set_unichar_id(unichar_id, x);
lower_word.set_unichar_id(unichar_id, x);
capital_word.set_unichar_id(unichar_id, x);
}
}
}
if (unichar_id != INVALID_UNICHAR_ID) {
/* Find lower case */
if (!lower_done &&
(getUnicharset().get_islower(unichar_id) ||
(getUnicharset().get_isupper(unichar_id) && x == 0))) {
lower_word.set_unichar_id(unichar_id, x);
lower_word.set_rating(lower_word.rating() -
first_choice->rating() + blob_choice_it.data()->rating());
if (blob_choice_it.data()->certainty() < lower_word.certainty()) {
lower_word.set_certainty(blob_choice_it.data()->certainty());
}
lower_certainties[x] = blob_choice_it.data()->certainty();
lower_done = TRUE;
}
/* Find upper case */
if (!upper_done && getUnicharset().get_isupper(unichar_id)) {
capital_word.set_unichar_id(unichar_id, x);
capital_word.set_rating(capital_word.rating() -
first_choice->rating() + blob_choice_it.data()->rating());
if (blob_choice_it.data()->certainty() < capital_word.certainty()) {
capital_word.set_certainty(blob_choice_it.data()->certainty());
}
upper_certainties[x] = blob_choice_it.data()->certainty();
upper_done = TRUE;
}
if (!char_alpha) {
const CHAR_FRAGMENT *fragment =
getUnicharset().get_fragment(unichar_id);
temp_id = !fragment ? unichar_id :
getUnicharset().unichar_to_id(fragment->get_unichar());
if (getUnicharset().get_isalpha(temp_id)) {
char_alpha = TRUE;
}
}
if (lower_done && upper_done)
break;
}
}
if (char_alpha && any_alpha != NULL)
*any_alpha = TRUE;
if (word.rating() > *rating_limit)
return (NULL);
*prev_char = '\0';
temp_id = word.unichar_id(word.length()-1);
if (temp_id != INVALID_UNICHAR_ID) {
strcpy(prev_char, getUnicharset().id_to_unichar(temp_id));
}
}
if (word.rating() < raw_choice->rating()) {
*raw_choice = word;
LogNewRawChoice(*raw_choice, 1.0, certainties);
}
if (ngram_permuter_activated)
return NULL;
float rating = word.rating();
word.string_and_lengths(getUnicharset(), &unichar_str, &unichar_lengths);
adjust_non_word(unichar_str.string(), unichar_lengths.string(),
word.rating(), &new_rating, &adjust_factor);
word.set_rating(new_rating);
LogNewWordChoice(word, adjust_factor, certainties);
float lower_rating = lower_word.rating();
lower_word.string_and_lengths(getUnicharset(),
&unichar_str, &unichar_lengths);
adjust_non_word(unichar_str.string(), unichar_lengths.string(),
lower_word.rating(), &new_rating, &adjust_factor);
lower_word.set_rating(new_rating);
LogNewWordChoice(lower_word, adjust_factor, lower_certainties);
float upper_rating = capital_word.rating();
capital_word.string_and_lengths(getUnicharset(),
&unichar_str, &unichar_lengths);
adjust_non_word(unichar_str.string(), unichar_lengths.string(),
capital_word.rating(), &new_rating, &adjust_factor);
capital_word.set_rating(new_rating);
LogNewWordChoice(capital_word, adjust_factor, upper_certainties);
WERD_CHOICE *best_choice = &word;
*rating_limit = rating;
if (lower_word.rating() < best_choice->rating()) {
best_choice = &lower_word;
*rating_limit = lower_rating;
}
if (capital_word.rating() < best_choice->rating()) {
best_choice = &capital_word;
*rating_limit = upper_rating;
}
return new WERD_CHOICE(*best_choice);
}
/**********************************************************************
* choose_il1
*
* Choose between the candidate il1 chars.
**********************************************************************/
const char* Dict::choose_il1(const char *first_char, //first choice
const char *second_char, //second choice
const char *third_char, //third choice
const char *prev_char, //prev in word
const char *next_char, //next in word
const char *next_next_char) { //after next next in word
inT32 type1; //1/I/l type of first choice
inT32 type2; //1/I/l type of second choice
inT32 type3; //1/I/l type of third choice
int first_char_length = strlen(first_char);
int prev_char_length = strlen(prev_char);
int next_char_length = strlen(next_char);
int next_next_char_length = strlen(next_next_char);
if (*first_char == 'l' && *second_char != '\0') {
if (*second_char == 'I'
&& (((prev_char_length != 0 &&
getUnicharset().get_isupper (prev_char, prev_char_length)) &&
(next_char_length == 0 ||
!getUnicharset().get_islower (next_char, next_char_length)) &&
(next_char_length == 0 ||
!getUnicharset().get_isdigit (next_char, next_char_length))) ||
((next_char_length != 0 &&
getUnicharset().get_isupper (next_char, next_char_length)) &&
(prev_char_length == 0 ||
!getUnicharset().get_islower (prev_char, prev_char_length)) &&
(prev_char_length == 0 ||
!getUnicharset().get_isdigit (prev_char, prev_char_length)))))
first_char = second_char; //override
else if (*second_char == '1' || *third_char == '1') {
if ((next_char_length != 0 &&
getUnicharset().get_isdigit (next_char, next_char_length)) ||
(prev_char_length != 0 &&
getUnicharset().get_isdigit (prev_char, prev_char_length))
|| (*next_char == 'l' &&
(next_next_char_length != 0 &&
getUnicharset().get_isdigit (next_next_char,
next_next_char_length)))) {
first_char = "1";
first_char_length = 1;
}
else if ((prev_char_length == 0 ||
!getUnicharset().get_islower (prev_char, prev_char_length)) &&
((next_char_length == 0 ||
!getUnicharset().get_islower (next_char, next_char_length)) ||
(*next_char == 's' &&
*next_next_char == 't'))) {
if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0')
&& ((*next_char != '\'' && *next_char != '`')
|| *prev_char != '\0')) {
first_char = "1";
first_char_length = 1;
}
}
}
if (*first_char == 'l' && *next_char != '\0' &&
(prev_char_length == 0 ||
!getUnicharset().get_isalpha (prev_char, prev_char_length))) {
type1 = 2;
if (*second_char == '1')
type2 = 0;
else if (*second_char == 'I')
type2 = 1;
else if (*second_char == 'l')
type2 = 2;
else
type2 = type1;
if (*third_char == '1')
type3 = 0;
else if (*third_char == 'I')
type3 = 1;
else if (*third_char == 'l')
type3 = 2;
else
type3 = type1;
#if 0
if (bigram_counts[*next_char][type2] >
bigram_counts[*next_char][type1]) {
first_char = second_char;
type1 = type2;
}
if (bigram_counts[*next_char][type3] >
bigram_counts[*next_char][type1]) {
first_char = third_char;
}
#endif
}
}
return first_char;
}
/**********************************************************************
* permute_words
*
* Permute all the characters together using the dawg to prune all
* but the valid words.
**********************************************************************/
A_CHOICE *Dict::permute_words(CHOICES_LIST char_choices, float rating_limit) {
A_CHOICE *best_choice;
best_choice = new_choice (NULL, NULL, rating_limit, -MAX_FLOAT32, -1, NO_PERM);
if (hyphen_base_size() + array_count (char_choices) > MAX_WERD_LENGTH) {
class_rating (best_choice) = MAX_FLOAT32;
}
else {
dawg_permute_and_select("system words:", word_dawg, SYSTEM_DAWG_PERM,
char_choices, best_choice);
dawg_permute_and_select("document_words", document_words,
DOC_DAWG_PERM, char_choices, best_choice);
dawg_permute_and_select("user words", user_words, USER_DAWG_PERM,
char_choices, best_choice);
}
return (best_choice);
}
/**********************************************************************
* valid_word
*
* Check all the DAWGs to see if this word is in any of them.
**********************************************************************/
int Dict::valid_word(const char *string) {
int result = NO_PERM;
if (word_in_dawg (word_dawg, string))
result = SYSTEM_DAWG_PERM;
else {
if (word_in_dawg (document_words, string))
result = DOC_DAWG_PERM;
else if (word_in_dawg (user_words, string))
result = USER_DAWG_PERM;
}
return (result);
}
/**********************************************************************
* fragment_state
*
* Given the current char choice and information about previously seen
* fragments, determines whether adjacent character fragments are
* present and whether they can be concatenated.
*
* The given prev_char_frag_info contains:
* -- fragment: if not NULL contains information about immediately
* preceeding fragmented character choice
* -- num_fragments: number of fragments that have been used so far
* to construct a character
* -- certainty: certainty of the current choice or minimum
* certainty of all fragments concatenated so far
* -- rating: rating of the current choice or sum of fragment
* ratings concatenated so far
*
* The output char_frag_info is filled in as follows:
* -- character: is set to be NULL if the choice is a non-matching
* or non-ending fragment piece; is set to unichar of the given choice
* if it represents a regular character or a matching ending fragment
* -- fragment,num_fragments,certainty,rating are set as described above
*
* Returns false if a non-matching fragment is discovered, true otherwise.
**********************************************************************/
bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
float curr_rating, float curr_certainty,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
const char *debug, int word_ending,
CHAR_FRAGMENT_INFO *char_frag_info) {
const CHAR_FRAGMENT *this_fragment =
getUnicharset().get_fragment(curr_unichar_id);
const CHAR_FRAGMENT *prev_fragment =
prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
// Print debug info for fragments.
if (debug && (prev_fragment || this_fragment)) {
cprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
getUnicharset().debug_str(curr_unichar_id).string(),
word_ending);
if (prev_fragment) {
cprintf("prev_fragment %s\n", prev_fragment->to_string().string());
}
if (this_fragment) {
cprintf("this_fragment %s\n", this_fragment->to_string().string());
}
}
char_frag_info->unichar_id = curr_unichar_id;
char_frag_info->fragment = this_fragment;
char_frag_info->rating = curr_rating;
char_frag_info->certainty = curr_certainty;
char_frag_info->num_fragments = 1;
if (prev_fragment && !this_fragment) {
if (debug) cprintf("Skip choice with incomplete fragment\n");
return false;
}
if (this_fragment) {
// We are dealing with a fragment.
char_frag_info->unichar_id = INVALID_UNICHAR_ID;
if (prev_fragment) {
if (!this_fragment->is_continuation_of(prev_fragment)) {
if (debug) cprintf("Non-matching fragment piece\n");
return false;
}
if (this_fragment->is_ending()) {
char_frag_info->unichar_id =
getUnicharset().unichar_to_id(this_fragment->get_unichar());
char_frag_info->fragment = NULL;
if (debug) {
cprintf("Built character %s from fragments\n",
getUnicharset().debug_str(
char_frag_info->unichar_id).string());
}
} else {
if (debug) cprintf("Record fragment continuation\n");
char_frag_info->fragment = this_fragment;
}
// Update certainty and rating.
char_frag_info->rating =
prev_char_frag_info->rating + curr_rating;
char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
char_frag_info->certainty =
MIN(curr_certainty, prev_char_frag_info->certainty);
} else {
if (this_fragment->is_beginning()) {
if (debug) cprintf("Record fragment beginning\n");
} else {
if (debug) {
cprintf("Non-starting fragment piece with no prev_fragment\n");
}
return false;
}
}
}
if (word_ending && char_frag_info->fragment) {
if (debug) cprintf("Word can not end with a fragment\n");
return false;
}
return true;
}
/**********************************************************************
* top_fragments_permute_and_select
*
* Creates a copy of character choices list that contain only fragments
* and the best non-fragmented character choice.
* Permutes character in this shortened list, builds characters from
* fragments if possible and returns a better choice if found.
*
* TODO(daria): part of this code is similar to the code in permdag,
* permngram and permnum - need to figure out a way to combine them.
**********************************************************************/
WERD_CHOICE *Dict::top_fragments_permute_and_select(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
float rating_limit) {
if (char_choices.length() <= 1 ||
char_choices.length() > MAX_PERM_LENGTH) {
return NULL;
}
// See it would be possible to benefit from permuting fragments.
int x;
float min_rating = 0.0;
BLOB_CHOICE_IT blob_choice_it;
for (x = 0; x < char_choices.length(); ++x) {
blob_choice_it.set_to_list(char_choices.get(x));
if (blob_choice_it.data()) {
min_rating += blob_choice_it.data()->rating();
}
if (min_rating >= rating_limit) {
return NULL;
}
}
if (fragments_debug > 1) {
tprintf("A choice with fragment beats top choice\n");
tprintf("Running fragment permuter...\n");
}
// Construct a modified choices list that contains (for each position):
// the best choice, all fragments and at least one choice for
// a non-fragmented character.
BLOB_CHOICE_LIST_VECTOR frag_char_choices(char_choices.length());
for (x = 0; x < char_choices.length(); ++x) {
bool need_nonfrag_char = true;
BLOB_CHOICE_LIST *frag_choices = new BLOB_CHOICE_LIST();
BLOB_CHOICE_IT frag_choices_it;
frag_choices_it.set_to_list(frag_choices);
blob_choice_it.set_to_list(char_choices.get(x));
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
blob_choice_it.forward()) {
if (getUnicharset().get_fragment(blob_choice_it.data()->unichar_id())) {
frag_choices_it.add_after_then_move(
new BLOB_CHOICE(*(blob_choice_it.data())));
} else if (need_nonfrag_char) {
frag_choices_it.add_after_then_move(
new BLOB_CHOICE(*(blob_choice_it.data())));
need_nonfrag_char = false;
}
}
frag_char_choices += frag_choices;
}
WERD_CHOICE *best_choice = new WERD_CHOICE();
best_choice->make_bad();
WERD_CHOICE word(MAX_PERM_LENGTH);
word.set_permuter(TOP_CHOICE_PERM);
float certainties[MAX_PERM_LENGTH];
top_fragments_permute(frag_char_choices, 0, min_rating, NULL,
&word, certainties, &rating_limit, best_choice);
frag_char_choices.delete_data_pointers();
return best_choice;
}
/**********************************************************************
* top_fragments_permute
*
* Try to put together fragments that could have a better rating
* than non-fragmented choices.
*
* TODO(daria): this code is very similar to the code in permdag,
* permngram and permnum - need to figure out a way to combine them.
**********************************************************************/
void Dict::top_fragments_permute(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
int char_choice_index,
float min_rating,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
WERD_CHOICE *word,
float certainties[],
float *limit,
WERD_CHOICE *best_choice) {
if (fragments_debug > 1) {
tprintf("top_fragments_permute: char_choice_index=%d"
" limit=%4.2f rating=%4.f, certainty=%4.f word=%s\n",
char_choice_index, *limit,
word->rating(), word->certainty(),
word->debug_string(getUnicharset()).string());
}
if (char_choice_index < char_choices.length()) {
BLOB_CHOICE_IT blob_choice_it;
blob_choice_it.set_to_list(char_choices.get(char_choice_index));
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
blob_choice_it.forward()) {
append_top_fragments_choices(char_choices, *(blob_choice_it.data()),
char_choice_index, min_rating,
prev_char_frag_info, word, certainties,
limit, best_choice);
}
}
}
/**********************************************************************
* append_top_fragments_choices
*
* Check to see whether or not the next choice is worth appending to
* the string being generated. If so then keep going deeper into the
* word.
*
* TODO(daria): this code is very similar to the code in permdag,
* permngram and permnum - need to figure out a way to combine them.
**********************************************************************/
void Dict::append_top_fragments_choices(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
const BLOB_CHOICE &blob_choice,
int char_choice_index,
float min_rating,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
WERD_CHOICE *word,
float certainties[],
float *limit,
WERD_CHOICE *best_choice) {
int word_ending =
(char_choice_index == char_choices.length() - 1) ? TRUE : FALSE;
/* Deal with fragments */
CHAR_FRAGMENT_INFO char_frag_info;
if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
blob_choice.certainty(), prev_char_frag_info,
(fragments_debug > 1) ? "fragments_debug" : NULL,
word_ending, &char_frag_info)) {
return; // blob_choice must be an invalid fragment
}
// Search the next letter if this character is a fragment.
if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
top_fragments_permute(char_choices, char_choice_index + 1,
min_rating, &char_frag_info,
word, certainties, limit, best_choice);
return;
}
/* Add new character */
word->append_unichar_id_space_allocated(
char_frag_info.unichar_id, char_frag_info.num_fragments,
char_frag_info.rating, char_frag_info.certainty);
certainties[word->length()-1] = char_frag_info.certainty;
if (word->rating() < *limit) {
if (word_ending) {
if (fragments_debug > 1) {
tprintf("new choice = %s\n",
word->debug_string(getUnicharset()).string());
}
*limit = word->rating();
// TODO(daria): modify this when adjust_non_word
// is updated to use unichar ids.
STRING word_str;
STRING word_lengths_str;
word->string_and_lengths(getUnicharset(), &word_str, &word_lengths_str);
float new_rating;
float adjust_factor;
adjust_non_word(word_str.string(), word_lengths_str.string(),
word->rating(), &new_rating, &adjust_factor);
word->set_rating(new_rating);
LogNewWordChoice(*word, adjust_factor, certainties);
if (word->rating() < best_choice->rating()) {
*best_choice = *word;
}
} else {
top_fragments_permute(char_choices, char_choice_index + 1,
min_rating, &char_frag_info, word,
certainties, limit, best_choice);
}
} else {
if (fragments_debug > 1) {
tprintf("pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
word->debug_string(getUnicharset()).string(),
word->rating(), *limit);
}
}
}
} // namespace tesseract