| /* -*-C-*- |
| ******************************************************************************** |
| * |
| * File: dawg.h (Formerly dawg.h) |
| * Description: Definition of a class that represents Directed Accyclic Word |
| * Graph (DAWG), functions to build and manipulate the DAWG. |
| * Author: Mark Seaman, SW Productivity |
| * Created: Fri Oct 16 14:37:00 1987 |
| * Modified: Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt |
| * Language: C |
| * Package: N/A |
| * Status: Reusable Software Component |
| * |
| * (c) Copyright 1987, Hewlett-Packard Company. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| * |
| *********************************************************************************/ |
| |
| #ifndef DAWG_H |
| #define DAWG_H |
| |
| /*---------------------------------------------------------------------- |
| I n c l u d e s |
| ----------------------------------------------------------------------*/ |
| |
| #include "elst.h" |
| #include "general.h" |
| #include "ratngs.h" |
| #include "varable.h" |
| |
| /*---------------------------------------------------------------------- |
| V a r i a b l e s |
| ----------------------------------------------------------------------*/ |
| |
| extern INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info, to" |
| " 2 for more details, to 3 to see all the debug messages"); |
| |
| #ifdef __MSW32__ |
| #define NO_EDGE (inT64) 0xffffffffffffffffi64 |
| #else |
| #define NO_EDGE (inT64) 0xffffffffffffffffll |
| #endif |
| |
| /*---------------------------------------------------------------------- |
| T y p e s |
| ----------------------------------------------------------------------*/ |
| class UNICHARSET; |
| |
| typedef uinT64 EDGE_RECORD; |
| typedef EDGE_RECORD *EDGE_ARRAY; |
| typedef inT64 EDGE_REF; |
| typedef inT64 NODE_REF; |
| typedef EDGE_REF *NODE_MAP; |
| |
| namespace tesseract { |
| |
| struct NodeChild { |
| UNICHAR_ID unichar_id; |
| EDGE_REF edge_ref; |
| NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {} |
| NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {} |
| }; |
| |
| typedef GenericVector<NodeChild> NodeChildVector; |
| typedef GenericVector<int> SuccessorList; |
| typedef GenericVector<SuccessorList *> SuccessorListsVector; |
| |
| enum DawgType { |
| DAWG_TYPE_PUNCTUATION, |
| DAWG_TYPE_PREFIX, |
| DAWG_TYPE_ROOT, |
| DAWG_TYPE_WORD, |
| DAWG_TYPE_SUFFIX, |
| DAWG_TYPE_NUMBER, |
| |
| DAWG_TYPE_COUNT // number of enum entries |
| }; |
| |
| /*---------------------------------------------------------------------- |
| C o n s t a n t s |
| ----------------------------------------------------------------------*/ |
| #define FORWARD_EDGE (inT32) 0 |
| #define BACKWARD_EDGE (inT32) 1 |
| #define MAX_NODE_EDGES_DISPLAY (inT64) 100 |
| #define LAST_FLAG (inT64) 1 |
| #define DIRECTION_FLAG (inT64) 2 |
| #define WERD_END_FLAG (inT64) 4 |
| #define LETTER_START_BIT 0 |
| #define NUM_FLAG_BITS 3 |
| #define REFFORMAT "%lld" |
| |
| // Set kBeginningDawgsType[i] to true if a Dawg of |
| // DawgType i can contain the beginning of a word. |
| static const bool kBeginningDawgsType[] = {1, 1, 0, 1, 0, 1 }; |
| |
| static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = { |
| { 0, 1, 0, 1, 0, 0 }, // for DAWG_TYPE_PUNCTUATION |
| { 0, 0, 1, 1, 0, 0 }, // for DAWG_TYPE_PREFIX |
| { 0, 0, 0, 0, 1, 0 }, // for DAWG_TYPE_ROOT |
| { 1, 0, 0, 0, 0, 0 }, // for DAWG_TYPE_WORD |
| { 1, 0, 0, 0, 0, 0 }, // for DAWG_TYPE_SUFFIX |
| { 0, 0, 0, 0, 0, 0 } // for DAWG_TYPE_NUMBER |
| }; |
| |
| static const char kWildcard[] = "*"; |
| |
| |
| /*---------------------------------------------------------------------- |
| C l a s s e s a n d S t r u c t s |
| ----------------------------------------------------------------------*/ |
| // |
| // Abstract class (an interface) that declares methods needed by the |
| // various tesseract classes to operate on SquishedDawg and Trie objects. |
| // |
| // This class initializes all the edge masks (since their usage by |
| // SquishedDawg and Trie is identical) and implements simple accessors |
| // for each of the fields encoded in an EDGE_RECORD. |
| // This class also implements word_in_dawg() and check_for_words() |
| // (since they use only the public methods of SquishedDawg and Trie |
| // classes that are inherited from the Dawg base class). |
| // |
| class Dawg { |
| public: |
| // Magic number to determine endianness when reading the Dawg from file. |
| static const inT16 kDawgMagicNumber = 42; |
| // A special unichar id that indicates that any appropriate pattern |
| // (e.g.dicitonary word, 0-9 digit, etc) can be inserted instead |
| // Used for expressing patterns in punctuation and number Dawgs. |
| static const UNICHAR_ID kPatternUnicharID = 0; |
| |
| inline DawgType type() const { return type_; } |
| inline const STRING &lang() const { return lang_; } |
| inline PermuterType permuter() const { return perm_; } |
| |
| virtual ~Dawg() {}; |
| |
| // Returns true if the given word is in the Dawg. |
| bool word_in_dawg(const WERD_CHOICE &word) const; |
| |
| // Checks the Dawg for the words that are listed in the requested file. |
| // Returns the number of words in the given file missing from the Dawg. |
| int check_for_words(const char *filename, |
| const UNICHARSET &unicharset, |
| bool enable_wildcard) const; |
| |
| // Pure virtual function that should be implemented by the derived classes. |
| |
| // Returns the edge that corresponds to the letter out of this node. |
| virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, |
| bool word_end) const = 0; |
| |
| // Fills the given NodeChildVector with all the unichar ids (and the |
| // corresponding EDGE_REFs) for which there is an edge out of this node. |
| virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0; |
| |
| // Returns the next node visited by following the edge |
| // indicated by the given EDGE_REF. |
| virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0; |
| |
| // Returns true if the edge indicated by the given EDGE_REF |
| // marks the end of a word. |
| virtual bool end_of_word(EDGE_REF edge_ref) const = 0; |
| |
| // Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. |
| virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0; |
| |
| // Prints the contents of the node indicated by the given NODE_REF. |
| // At most max_num_edges will be printed. |
| virtual void print_node(NODE_REF node, int max_num_edges) const = 0; |
| |
| protected: |
| Dawg() {} |
| |
| // Returns the next node visited by following this edge. |
| inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const { |
| return ((edge_rec & next_node_mask_) >> next_node_start_bit_); |
| } |
| // Returns the direction flag of this edge. |
| inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const { |
| return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? |
| BACKWARD_EDGE : FORWARD_EDGE; |
| } |
| // Returns true if this edge marks the end of a word. |
| inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const { |
| return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0; |
| } |
| // Returns UNICHAR_ID recorded in this edge. |
| inline UNICHAR_ID unichar_id_from_edge_rec( |
| const EDGE_RECORD &edge_rec) const { |
| return ((edge_rec & letter_mask_) >> LETTER_START_BIT); |
| } |
| // Sets the next node link for this edge in the Dawg. |
| inline void set_next_node_in_edge_rec( |
| EDGE_RECORD *edge_rec, EDGE_REF value) { |
| *edge_rec &= (~next_node_mask_); |
| *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_); |
| } |
| // Sets this edge record to be the last one in a sequence of edges. |
| inline void set_last_flag_in_edge_rec(EDGE_RECORD *edge_rec) { |
| *edge_rec |= (LAST_FLAG << flag_start_bit_); |
| } |
| // Sequentially compares the given values of unichar ID, next node |
| // and word end marker with the values in the given EDGE_RECORD. |
| // Returns: 1 if at any step the given input value exceeds |
| // that of edge_rec (and all the values already |
| // checked are the same) |
| // 0 if edge_rec_match() returns true |
| // -1 otherwise |
| inline int given_greater_than_edge_rec(NODE_REF next_node, |
| bool word_end, |
| UNICHAR_ID unichar_id, |
| const EDGE_RECORD &edge_rec) const { |
| UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec); |
| NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec); |
| bool curr_word_end = end_of_word_from_edge_rec(edge_rec); |
| if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node, |
| curr_word_end, curr_unichar_id)) return 0; |
| if (unichar_id > curr_unichar_id) return 1; |
| if (unichar_id == curr_unichar_id) { |
| if (next_node > curr_next_node) return 1; |
| if (next_node == curr_next_node) { |
| if (word_end > curr_word_end) return 1; |
| } |
| } |
| return -1; |
| } |
| // Returns true if all the values are equal (any value matches |
| // next_node if next_node == NO_EDGE, any value matches word_end |
| // if word_end is false). |
| inline bool edge_rec_match(NODE_REF next_node, |
| bool word_end, |
| UNICHAR_ID unichar_id, |
| NODE_REF other_next_node, |
| bool other_word_end, |
| UNICHAR_ID other_unichar_id) const { |
| return ((unichar_id == other_unichar_id) && |
| (next_node == NO_EDGE || next_node == other_next_node) && |
| (!word_end || (word_end == other_word_end))); |
| } |
| |
| // Sets type_, lang_, perm_, unicharset_size_. |
| // Initializes the values of various masks from unicharset_size_. |
| void init(DawgType type, const STRING &lang, |
| PermuterType perm, int unicharset_size); |
| |
| // Matches all of the words that are represented by this string. |
| // If wilcard is set to something other than INVALID_UNICHAR_ID, |
| // the *'s in this string are interpreted as wildcards. |
| // WERD_CHOICE param is not passed by const so that wildcard searches |
| // can modify it and work without having to copy WERD_CHOICEs. |
| bool match_words(WERD_CHOICE *word, inT32 index, |
| NODE_REF node, UNICHAR_ID wildcard) const; |
| |
| // Member Variables. |
| DawgType type_; |
| STRING lang_; |
| // Permuter code that should be used if the word is found in this Dawg. |
| PermuterType perm_; |
| // Variables to construct various edge masks. Formerly: |
| // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64 |
| // #define FLAGS_MASK (inT64) 0x0000000700000000i64 |
| // #define LETTER_MASK (inT64) 0x00000000ffffffffi64 |
| int unicharset_size_; |
| int flag_start_bit_; |
| int next_node_start_bit_; |
| uinT64 next_node_mask_; |
| uinT64 flags_mask_; |
| uinT64 letter_mask_; |
| }; |
| |
| // |
| // DawgInfo struct and DawgInfoVector class are used for |
| // storing information about the current Dawg search state. |
| // |
| struct DawgInfo { |
| DawgInfo() : dawg_index(-1), ref(NO_EDGE) {} |
| DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {} |
| bool operator==(const DawgInfo &other) { |
| return (this->dawg_index == other.dawg_index && |
| this->ref == other.ref); |
| } |
| int dawg_index; |
| EDGE_REF ref; |
| }; |
| class DawgInfoVector : public GenericVector<DawgInfo> { |
| public: |
| // Overload destructor, since clear() does not delete data_[] any more. |
| ~DawgInfoVector() { |
| if (size_reserved_ > 0) { |
| delete[] data_; |
| size_used_ = 0; |
| size_reserved_ = 0; |
| } |
| } |
| // Overload clear() in order to avoid allocating/deallocating memory |
| // when clearing the vector and re-inserting entries into it later. |
| void clear() { size_used_ = 0; } |
| // Adds an entry for the given dawg_index with the given node to the vec. |
| // Returns false if the same entry already exists in the vector, |
| // true otherwise. |
| inline bool add_unique(const DawgInfo &new_info, const char *debug_msg) { |
| for (int i = 0; i < size_used_; ++i) { |
| if (data_[i] == new_info) return false; |
| } |
| push_back(new_info); |
| if (dawg_debug_level) { |
| tprintf("%s[%d, " REFFORMAT "]\n", debug_msg, |
| new_info.dawg_index, new_info.ref); |
| } |
| return true; |
| } |
| // Removes an entry that equals to the given DawgInfo. |
| // This function assumes that the entries in the vector are unique. |
| // Returns true if an entry was found and removed. |
| inline bool remove(const DawgInfo &info) { |
| for (int i = 0; i < size_used_; ++i) { |
| if (data_[i] == info) { |
| for (int j = i + 1; j < size_used_; ++j) { |
| data_[j-1] = data_[j]; |
| } |
| size_used_--; |
| return true; |
| } |
| } |
| return false; |
| } |
| }; |
| |
| // |
| // Concrete class that can operate on a compacted (squished) Dawg (read, |
| // search and write to file). This class is read-only in the sense that |
| // new words can not be added to an instance of SquishedDawg. |
| // The underlying representation of the nodes and edges in SquishedDawg |
| // is stored as a contiguous EDGE_ARRAY (read from file or given as an |
| // argument to the constructor). |
| // |
| class SquishedDawg : public Dawg { |
| public: |
| SquishedDawg(FILE *file, DawgType type, |
| const STRING &lang, PermuterType perm) { |
| read_squished_dawg(file, type, lang, perm); |
| num_forward_edges_in_node0 = num_forward_edges(0); |
| } |
| SquishedDawg(const char* filename, DawgType type, |
| const STRING &lang, PermuterType perm) { |
| FILE *file = fopen(filename, "rb"); |
| if (file == NULL) { |
| tprintf("Failed to open dawg file %s\n", filename); |
| exit(1); |
| } |
| read_squished_dawg(file, type, lang, perm); |
| num_forward_edges_in_node0 = num_forward_edges(0); |
| fclose(file); |
| } |
| SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, |
| const STRING &lang, PermuterType perm, int unicharset_size) : |
| edges_(edges), num_edges_(num_edges) { |
| init(type, lang, perm, unicharset_size); |
| num_forward_edges_in_node0 = num_forward_edges(0); |
| if (dawg_debug_level > 3) print_all("SquishedDawg:"); |
| } |
| ~SquishedDawg(); |
| |
| // Returns the edge that corresponds to the letter out of this node. |
| EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, |
| bool word_end) const; |
| |
| // Fills the given NodeChildVector with all the unichar ids (and the |
| // corresponding EDGE_REFs) for which there is an edge out of this node. |
| void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const { |
| EDGE_REF edge = node; |
| if (!edge_occupied(edge) || edge == NO_EDGE) return; |
| assert(forward_edge(edge)); // we don't expect any backward edges to |
| do { // be present when this funciton is called |
| vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge)); |
| } while (!last_edge(edge++)); |
| } |
| |
| // Returns the next node visited by following the edge |
| // indicated by the given EDGE_REF. |
| NODE_REF next_node(EDGE_REF edge) const { |
| return next_node_from_edge_rec((edges_[edge])); |
| } |
| |
| // Returns true if the edge indicated by the given EDGE_REF |
| // marks the end of a word. |
| bool end_of_word(EDGE_REF edge_ref) const { |
| return end_of_word_from_edge_rec((edges_[edge_ref])); |
| } |
| |
| // Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. |
| UNICHAR_ID edge_letter(EDGE_REF edge_ref) const { |
| return unichar_id_from_edge_rec((edges_[edge_ref])); |
| } |
| |
| // Prints the contents of the node indicated by the given NODE_REF. |
| // At most max_num_edges will be printed. |
| void print_node(NODE_REF node, int max_num_edges) const; |
| |
| // Writes the squished/reduced Dawg to a file. |
| void write_squished_dawg(const char *filename); |
| |
| private: |
| // Sets the next node link for this edge. |
| inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) { |
| set_next_node_in_edge_rec(&(edges_[edge_ref]), value); |
| } |
| // Sets the edge to be empty. |
| inline void set_empty_edge(EDGE_REF edge_ref) { |
| (edges_[edge_ref] = next_node_mask_); |
| } |
| // Goes through all the edges and clears each one out. |
| inline void clear_all_edges() { |
| for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge); |
| } |
| // Clears the last flag of this edge. |
| inline void clear_last_flag(EDGE_REF edge_ref) { |
| (edges_[edge_ref] &= ~(LAST_FLAG << flag_start_bit_)); |
| } |
| // Returns true if this edge is in the forward direction. |
| inline bool forward_edge(EDGE_REF edge_ref) const { |
| return (edge_occupied(edge_ref) && |
| (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); |
| } |
| // Returns true if this edge is in the backward direction. |
| inline bool backward_edge(EDGE_REF edge_ref) const { |
| return (edge_occupied(edge_ref) && |
| (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); |
| } |
| // Returns true if the edge spot in this location is occupied. |
| inline bool edge_occupied(EDGE_REF edge_ref) const { |
| return (edges_[edge_ref] != next_node_mask_); |
| } |
| // Returns true if this edge is the last edge in a sequence. |
| inline bool last_edge(EDGE_REF edge_ref) const { |
| return (edges_[edge_ref] & (LAST_FLAG << flag_start_bit_)) != 0; |
| } |
| |
| // Counts and returns the number of forward edges in this node. |
| inT32 num_forward_edges(NODE_REF node) const; |
| |
| // Reads SquishedDawg from a file. |
| void read_squished_dawg(FILE *file, DawgType type, |
| const STRING &lang, PermuterType perm); |
| |
| // Prints the contents of an edge indicated by the given EDGE_REF. |
| void print_edge(EDGE_REF edge) const; |
| |
| // Prints the contents of the SquishedDawg. |
| void print_all(const char* msg) { |
| tprintf("\n__________________________\n%s\n", msg); |
| for (int i = 0; i < num_edges_; ++i) print_edge(i); |
| tprintf("__________________________\n"); |
| } |
| // Constructs a mapping from the memory node indices to disk node indices. |
| NODE_MAP build_node_map(inT32 *num_nodes) const; |
| |
| |
| // Member variables. |
| EDGE_ARRAY edges_; |
| int num_edges_; |
| int num_forward_edges_in_node0; |
| }; |
| } // namespace tesseract |
| |
| #endif |