blob: 6791fdb1c7622ecb1c4e52dbe3af16e430dc3ef9 [file] [log] [blame]
///////////////////////////////////////////////////////////////////////
// File: dict.h
// Description: dict class.
// Author: Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_DICT_DICT_H_
#define TESSERACT_DICT_DICT_H_
#include "choices.h"
#include "choicearr.h"
#include "dawg.h"
#include "image.h"
#include "ratngs.h"
#include "stopper.h"
#include "unicharset.h"
typedef LIST AMBIG_TABLE;
// Struct used to hold temprorary information about fragments.
struct CHAR_FRAGMENT_INFO {
UNICHAR_ID unichar_id;
const CHAR_FRAGMENT *fragment;
int num_fragments;
float rating;
float certainty;
};
namespace tesseract {
class Dict
{
public:
Dict(Image* image_ptr);
~Dict();
Image* getImage() {
return image_ptr_;
}
UNICHARSET& getUnicharset() {
return getImage()->getCCUtil()->unicharset;
}
/* permdawg.cpp ************************************************************/
void init_permdawg();
void end_permdawg();
void append_next_choice( /*previous option */
EDGE_ARRAY dawg,
NODE_REF node,
char permuter,
char *word,
char unichar_lengths[],
int unichar_offsets[],
CHOICES_LIST choices,
int char_choice_index,
int word_index,
A_CHOICE *this_choice,
const char *prevchar,
float *limit,
float rating,
float certainty,
float *rating_array,
float *certainty_array,
int word_ending,
int last_word,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
char fragment_lengths[],
CHOICES *result);
int test_freq_words(const char *word);
CHOICES dawg_permute(EDGE_ARRAY dawg,
NODE_REF node,
char permuter,
CHOICES_LIST choices,
int char_choice_index,
int word_index,
float *limit,
char *word,
char unichar_lengths[],
int unichar_offsets[],
float rating,
float certainty,
float *rating_array,
float *certainty_array,
int last_word,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
char fragment_lengths[]);
void dawg_permute_and_select(const char *string,
EDGE_ARRAY dawg,
char permuter,
CHOICES_LIST character_choices,
A_CHOICE *best_choice);
void adjust_word(A_CHOICE *best_choice, float *certainty_array);
/* permute.cpp *************************************************************/
void add_document_word(const WERD_CHOICE &best_choice);
void init_permute();
WERD_CHOICE *permute_top_choice(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
float* rating_limit,
WERD_CHOICE *raw_choice,
BOOL8 *any_alpha);
const char* choose_il1(const char *first_char, //first choice
const char *second_char, //second choice
const char *third_char, //third choice
const char *prev_char, //prev in word
const char *next_char, //next in word
const char *next_next_char); //after next next in word
int valid_word(const char *string);
A_CHOICE *permute_words(CHOICES_LIST char_choices,
float rating_limit);
WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
float rating_limit,
WERD_CHOICE *raw_choice);
void end_permute();
void adjust_non_word(const char *word, const char *word_lengths,
float rating, float *new_rating,
float *adjust_factor);
void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
float rating_limit,
int start,
int end,
WERD_CHOICE *current_word);
void permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
float limit,
WERD_CHOICE *best_choice,
WERD_CHOICE *raw_choice);
WERD_CHOICE *permute_compound_words(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
float rating_limit);
bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
float curr_rating, float curr_certainty,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
const char *debug, int word_ending,
CHAR_FRAGMENT_INFO *char_frag_info);
WERD_CHOICE *top_fragments_permute_and_select(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
float rating_limit);
void top_fragments_permute(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
int char_choice_index,
float min_rating,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
WERD_CHOICE *word,
float certainties[],
float *limit,
WERD_CHOICE *best_choice);
void append_top_fragments_choices(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
const BLOB_CHOICE &blob_choice,
int char_choice_index,
float min_rating,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
WERD_CHOICE *word,
float certainties[],
float *limit,
WERD_CHOICE *best_choice);
/* permnum.cpp *************************************************************/
int pure_number(const char *string, const char *lengths);
void append_number_choices(int state,
char *word,
char unichar_lengths[],
int unichar_offsets[],
CHOICES_LIST choices,
int char_index,
A_CHOICE *this_choice,
float *limit,
float rating,
float certainty,
float *certainty_array,
CHOICES *result);
void append_number_choices(int state,
char *word,
char unichar_lengths[],
int unichar_offsets[],
CHOICES_LIST choices,
int char_choice_index,
int word_index,
A_CHOICE *this_choice,
float *limit,
float rating,
float certainty,
float *certainty_array,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
char fragment_lengths[],
CHOICES *result);
int number_character_type( //current state
const char* ch,
int length,
int state);
int valid_number(const char *string, const char *lengths);
int number_state_change(int state, //current state
const char *word, //current char
const char *lengths); //length of current char
A_CHOICE *number_permute_and_select(CHOICES_LIST char_choices,
float rating_limit);
CHOICES number_permute(int state,
CHOICES_LIST choices,
int char_choice_index,
int word_index,
float *limit,
char *word,
char unichar_lengths[],
int unichar_offsets[],
float rating,
float certainty,
float *certainty_array,
const CHAR_FRAGMENT_INFO
*prev_char_frag_info,
char fragment_lengths[]);
void adjust_number(A_CHOICE *best_choice, float *certainty_array);
/* context.cpp *************************************************************/
int punctuation_ok(const char *word, const char *lengths);
int case_ok(const char *word, const char *lengths);
/* stopper.cpp *************************************************************/
int NoDangerousAmbig(const char *Word,
const char *Word_lengths,
DANGERR *fixpt);
AMBIG_TABLE *FillAmbigTable();
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice,
FLOAT32 AdjustFactor,
const float Certainties[]);
void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
int StringSameAs(const char *String,
const char *String_lengths,
VIABLE_CHOICE ViableChoice);
bool StringSameAs(const WERD_CHOICE &WordChoice,
VIABLE_CHOICE ViableChoice);
int AcceptableChoice(const BLOB_CHOICE_LIST_VECTOR &Choices,
const WERD_CHOICE &BestChoice,
const WERD_CHOICE &RawChoice,
DANGERR *fixpt,
ACCEPTABLE_CHOICE_CALLER caller);
int AcceptableResult(const WERD_CHOICE &BestChoice,
const WERD_CHOICE &RawChoice);
int AmbigsFound(char *Word,
char *CurrentChar,
const char *Tail,
const char *Tail_lengths,
LIST Ambigs,
DANGERR *fixpt);
int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice);
void LogNewRawChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor,
const float Certainties[]);
void LogNewWordChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor,
const float Certainties[]);
void EndDangerousAmbigs();
int CurrentBestChoiceIs(const WERD_CHOICE &WordChoice);
FLOAT32 CurrentBestChoiceAdjustFactor();
int CurrentWordAmbig();
void DebugWordChoices();
void FillViableChoice(const WERD_CHOICE &WordChoice,
FLOAT32 AdjustFactor, const float Certainties[],
bool SameString, VIABLE_CHOICE ViableChoice);
/* choices.cpp *************************************************************/
void print_word_string(const char* str);
/* dawg.cpp ****************************************************************/
int def_letter_is_okay(void* dawg, void* node, int char_index,
char prevchar, const char* word, int word_end);
int new_letter_is_okay(void* dawg, void* node, int char_index,
char prevchar, const char* word, int word_end);
int fst_letter_is_okay(void* dawg, void* node, int char_index,
char prevchar, const char* word, int word_end);
inT32 word_in_dawg(EDGE_ARRAY dawg, const char *string);
inT32 verify_trailing_punct(EDGE_ARRAY dawg, char *word, inT32 char_index);
void print_word_choice(const char *label, A_CHOICE* choice);
void print_choices(const char *label,
CHOICES rating); // List of (A_CHOICE*).
/* permngram.cpp ***********************************************************/
A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
float rating_limit,
EDGE_ARRAY dawg);
/* lookdawg.cpp ************************************************************/
void print_lost_words (EDGE_ARRAY dawg,
char *filename);
void check_for_words (EDGE_ARRAY dawg,
char *filename);
/* trie.cpp ****************************************************************/
void read_word_list(const char *filename,
EDGE_ARRAY dawg,
inT32 max_num_edges,
inT32 reserved_edges);
// pointer to function to allow it to be modified.
int (Dict::*letter_is_okay_)(void* dawg, void* node, int char_index,
char prevchar, const char *word, int word_end);
/* conversion.cpp **********************************************************/
// TODO(daria): remove this declaration when conversion.cpp is deprecated.
void LogNewWordChoice(A_CHOICE *a_choice,
FLOAT32 adjust_factor,
const float certainties[],
const UNICHARSET &unicharset);
private:
Image* image_ptr_;
};
} // namespace tesseract
#endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_