| /* -*-C-*- |
| ******************************************************************************** |
| * |
| * File: trie.h (Formerly trie.h) |
| * Description: Functions to build a trie data structure. |
| * Author: Mark Seaman, SW Productivity |
| * Created: Fri Oct 16 14:37:00 1987 |
| * Modified: Fri Jul 26 11:26:34 1991 (Mark Seaman) marks@hpgrlt |
| * Language: C |
| * Package: N/A |
| * Status: Reusable Software Component |
| * |
| * (c) Copyright 1987, Hewlett-Packard Company. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| * |
| *********************************************************************************/ |
| #ifndef TRIE_H |
| #define TRIE_H |
| |
| #include "dawg.h" |
| #include "cutil.h" |
| |
| class UNICHARSET; |
| |
| // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed |
| // max int32, we will need to change GenericVector to use int64 for size |
| // and address indices. This does not seem to be needed immediately, |
| // since currently the largest number of edges limit used by tesseract |
| // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32. |
| typedef inT64 EDGE_INDEX; // index of an edge in a given node |
| typedef bool *NODE_MARKER; |
| typedef GenericVector<EDGE_RECORD> EDGE_VECTOR; |
| |
| struct TRIE_NODE_RECORD { |
| EDGE_VECTOR forward_edges; |
| EDGE_VECTOR backward_edges; |
| }; |
| typedef GenericVector<TRIE_NODE_RECORD *> TRIE_NODES; |
| |
| namespace tesseract { |
| |
| // |
| // Concrete class for Trie data structure that allows to store a list of |
| // words (extends Dawg base class) as well as dynamically add new words. |
| // This class stores a vector of pointers to TRIE_NODE_RECORDs, each of |
| // which has a vector of forward and backward edges. |
| // |
| class Trie : public Dawg { |
| public: |
| // max_num_edges argument allows limiting the amount of memory this |
| // Trie can consume (if a new word insert would cause the Trie to |
| // contain more edges than max_num_edges, all the edges are cleared |
| // so that new inserts can proceed). |
| Trie(DawgType type, const STRING &lang, PermuterType perm, |
| uinT64 max_num_edges, int unicharset_size) { |
| init(type, lang, perm, unicharset_size); |
| num_edges_ = 0; |
| max_num_edges_ = max_num_edges; |
| deref_node_index_mask_ = ~letter_mask_; |
| new_dawg_node(); // need to allocate node 0 |
| } |
| ~Trie() { nodes_.delete_data_pointers(); } |
| |
| // Returns the edge that corresponds to the letter out of this node. |
| EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, |
| bool word_end) const { |
| EDGE_RECORD *edge_ptr; |
| EDGE_INDEX edge_index; |
| if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, |
| &edge_ptr, &edge_index)) return NO_EDGE; |
| return make_edge_ref(node_ref, edge_index); |
| } |
| |
| // Fills the given NodeChildVector with all the unichar ids (and the |
| // corresponding EDGE_REFs) for which there is an edge out of this node. |
| void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const { |
| const EDGE_VECTOR &forward_edges = nodes_[node]->forward_edges; |
| for (int i = 0; i < forward_edges.size(); ++i) { |
| vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]), |
| make_edge_ref(node, i))); |
| } |
| } |
| |
| // Returns the next node visited by following the edge |
| // indicated by the given EDGE_REF. |
| NODE_REF next_node(EDGE_REF edge_ref) const { |
| if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE; |
| return next_node_from_edge_rec(*deref_edge_ref(edge_ref)); |
| } |
| |
| // Returns true if the edge indicated by the given EDGE_REF |
| // marks the end of a word. |
| bool end_of_word(EDGE_REF edge_ref) const { |
| if (edge_ref == NO_EDGE || num_edges_ == 0) return false; |
| return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref)); |
| } |
| |
| // Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. |
| UNICHAR_ID edge_letter(EDGE_REF edge_ref) const { |
| if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID; |
| return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref)); |
| } |
| |
| // Prints the contents of the node indicated by the given NODE_REF. |
| // At most max_num_edges will be printed. |
| void print_node(NODE_REF node, int max_num_edges) const; |
| |
| // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg. |
| // Eliminates redundant edges and returns the pointer to the SquishedDawg. |
| // Note: the caller is responsible for deallocating memory associated |
| // with the returned SquishedDawg pointer. |
| SquishedDawg *trie_to_dawg(); |
| |
| // Inserts the list of words from the given file into the Trie. |
| bool read_word_list(const char *filename, |
| const UNICHARSET &unicharset); |
| |
| // Adds a word to the Trie (creates the necessary nodes and edges). |
| void add_word_to_dawg(const WERD_CHOICE &word); |
| |
| protected: |
| // The structure of an EDGE_REF for Trie edges is as follows: |
| // [LETTER_START_BIT, flag_start_bit_): |
| // edge index in *_edges in a TRIE_NODE_RECORD |
| // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector) |
| // |
| // With this arrangement there are enough bits to represent edge indices |
| // (each node can have at most unicharset_size_ forward edges and |
| // the position of flag_start_bit is set to be log2(unicharset_size_)). |
| // It is also possible to accomodate a maximum number of nodes that is at |
| // least as large as that of the SquishedDawg representation (in SquishedDawg |
| // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent |
| // the next node index). |
| // |
| |
| // Returns the pointer to EDGE_RECORD after decoding the location |
| // of the edge from the information in the given EDGE_REF. |
| // This function assumes that EDGE_REF holds valid node/edge indices. |
| inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const { |
| uinT64 edge_index = (edge_ref & letter_mask_) >> LETTER_START_BIT; |
| uinT64 node_index = |
| (edge_ref & deref_node_index_mask_) >> flag_start_bit_; |
| TRIE_NODE_RECORD *node_rec = nodes_[node_index]; |
| return &(node_rec->forward_edges[edge_index]); |
| } |
| // Constructs EDGE_REF from the given node_index and edge_index. |
| inline EDGE_REF make_edge_ref(NODE_REF node_index, |
| EDGE_INDEX edge_index) const { |
| return ((node_index << flag_start_bit_) | |
| (edge_index << LETTER_START_BIT)); |
| } |
| // Sets up this edge record to the requested values. |
| inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, int direction, |
| bool word_end, UNICHAR_ID unichar_id) { |
| EDGE_RECORD flags = 0; |
| if (word_end) flags |= WERD_END_FLAG; |
| if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG; |
| *edge = ((nxt << next_node_start_bit_) | |
| (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) | |
| (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT)); |
| } |
| // Prints the given EDGE_RECORD. |
| inline void print_edge_rec(const EDGE_RECORD &edge_rec) const { |
| tprintf("|" REFFORMAT "|%s%s|%d|", next_node_from_edge_rec(edge_rec), |
| (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B", |
| end_of_word_from_edge_rec(edge_rec) ? ",E" : "", |
| unichar_id_from_edge_rec(edge_rec)); |
| } |
| // Returns true if the next node in recorded the given EDGE_RECORD |
| // has exactly one forward edge. |
| inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) { |
| NODE_REF node_ref = next_node_from_edge_rec(edge_rec); |
| return (node_ref != NO_EDGE && |
| nodes_[node_ref]->forward_edges.size() == 1); |
| } |
| |
| // Prints the contents of the Trie. |
| // At most max_num_edges will be printed for each node. |
| void print_all(const char* msg, int max_num_edges) { |
| tprintf("\n__________________________\n%s\n", msg); |
| for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges); |
| tprintf("__________________________\n"); |
| } |
| |
| // Finds the edge with the given direction, word_end and unichar_id |
| // in the node indicated by node_ref. Fills in the pointer to the |
| // EDGE_RECORD and the index of the edge with the the values |
| // corresponding to the edge found. Returns true if an edge was found. |
| bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, |
| int direction, bool word_end, UNICHAR_ID unichar_id, |
| EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const; |
| |
| // Adds an single edge linkage between node1 and node2 in the direction |
| // indicated by direction argument. |
| bool add_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, |
| bool word_end, UNICHAR_ID unichar_id); |
| |
| // Adds forward edge linkage from node1 to node2 and the corresponding |
| // backwad edge linkage in the other direction. |
| bool add_new_edge(NODE_REF node1, NODE_REF node2, |
| bool word_end, UNICHAR_ID unichar_id) { |
| return (add_edge_linkage(node1, node2, FORWARD_EDGE, |
| word_end, unichar_id) && |
| add_edge_linkage(node2, node1, BACKWARD_EDGE, |
| word_end, unichar_id)); |
| } |
| |
| // Sets the word ending flags in an already existing edge pair. |
| // Returns true on success. |
| void add_word_ending(EDGE_RECORD *edge, |
| NODE_REF the_next_node, |
| UNICHAR_ID unichar_id); |
| |
| // Allocates space for a new node in the Trie. |
| NODE_REF new_dawg_node(); |
| |
| // Removes a single edge linkage to between node1 and node2 in the |
| // direction indicated by direction argument. |
| void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, |
| bool word_end, UNICHAR_ID unichar_id); |
| |
| // Removes forward edge linkage from node1 to node2 and the corresponding |
| // backward edge linkage in the other direction. |
| void remove_edge(NODE_REF node1, NODE_REF node2, |
| bool word_end, UNICHAR_ID unichar_id) { |
| remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id); |
| remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id); |
| } |
| |
| // Compares edge1 and edge2 in the given node to see if they point to two |
| // next nodes that could be collapsed. If they do, performs the reduction |
| // and returns true. |
| bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, |
| const EDGE_RECORD &edge2); |
| |
| // Assuming that edge_index indicates the first edge in a group of edges |
| // in this node with a particular letter value, looks through these edges |
| // to see if any of them can be collapsed. If so does it. Returns to the |
| // caller when all edges with this letter have been reduced. |
| // Returns true if further reduction is possible with this same letter. |
| bool reduce_lettered_edges(EDGE_INDEX edge_index, |
| UNICHAR_ID unichar_id, |
| NODE_REF node, |
| const EDGE_VECTOR &backward_edges, |
| NODE_MARKER reduced_nodes); |
| |
| // Order num_edges of consequtive EDGE_RECORDS in the given EDGE_VECTOR in |
| // increasing order of unichar ids. This function is normally called |
| // for all edges in a single node, and since number of edges in each node |
| // is usually quite small, selection sort is used. |
| void sort_edges(EDGE_VECTOR *edges); |
| |
| // Eliminates any redundant edges from this node in the Trie. |
| void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes); |
| |
| |
| // Member variables |
| TRIE_NODES nodes_; // vector of nodes in the Trie |
| uinT64 num_edges_; // sum of all edges (forward and backward) |
| uinT64 max_num_edges_; // maximum number of edges allowed |
| uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction |
| uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index |
| }; |
| } // namespace tesseract |
| |
| #endif |