| /* -*-C-*- |
| ******************************************************************************** |
| * |
| * File: dawg.c (Formerly dawg.c) |
| * Description: Use a Directed Accyclic Word Graph |
| * Author: Mark Seaman, OCR Technology |
| * Created: Fri Oct 16 14:37:00 1987 |
| * Modified: Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt |
| * Language: C |
| * Package: N/A |
| * Status: Reusable Software Component |
| * |
| * (c) Copyright 1987, Hewlett-Packard Company. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| * |
| *********************************************************************************/ |
| /*---------------------------------------------------------------------- |
| I n c l u d e s |
| ----------------------------------------------------------------------*/ |
| #ifdef __MSW32__ |
| #include <windows.h> |
| #else |
| #include <netinet/in.h> |
| #endif |
| #include "dawg.h" |
| #include "cutil.h" |
| #include "tprintf.h" |
| #include "freelist.h" |
| #include "context.h" |
| #include "strngs.h" |
| #include "emalloc.h" |
| #include "dict.h" |
| |
| /*---------------------------------------------------------------------- |
| V a r i a b l e s |
| ----------------------------------------------------------------------*/ |
| inT32 debug = 0; |
| inT32 case_sensative = 1; |
| |
| /*---------------------------------------------------------------------- |
| F u n c t i o n s |
| ----------------------------------------------------------------------*/ |
| /********************************************************************** |
| * edge_char_of |
| * |
| * Return the edge that corresponds to the letter out of this node. |
| **********************************************************************/ |
| EDGE_REF edge_char_of(EDGE_ARRAY dawg, |
| NODE_REF node, |
| int character, |
| int word_end) { |
| EDGE_REF edge = node; |
| |
| if (! case_sensative) character = tolower (character); |
| |
| if (edge_occupied (dawg, edge)) { |
| do { |
| #if 0 |
| // TODO(rays) Turn dawg to a class so this can be used properly. |
| // The maximum value of edge is unknown to this function, but could |
| // easily be stored in a dawg class. |
| if ((edge < 0 || edge >= MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE) |
| && edge != NO_EDGE) { |
| tprintf("Impossible return from edge_char_of: node=" REFFORMAT |
| ", ch=%d, edge=" REFFORMAT "\n", *node, |
| static_cast<unsigned char>(dummy_word[char_index]), edge); |
| return NO_EDGE; |
| } |
| #endif |
| if ((edge_letter (dawg, edge) == character) && |
| (! word_end || end_of_word(dawg,edge))) |
| return (edge); |
| |
| } edge_loop (dawg, edge); |
| } |
| |
| return (NO_EDGE); |
| } |
| |
| |
| /********************************************************************** |
| * edges_in_node |
| * |
| * Count the number of edges in this node in the DAWG. This includes |
| * both forward and back links. |
| **********************************************************************/ |
| inT32 edges_in_node(EDGE_ARRAY dawg, NODE_REF node) { |
| EDGE_REF edge = node; |
| |
| if (edge_occupied (dawg, edge)) { |
| edge_loop(dawg, edge); |
| if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) { |
| edge_loop(dawg, edge); |
| return (edge - node); |
| } |
| else { |
| return (edge - node); |
| } |
| } |
| else { |
| return (edge - node); |
| } |
| } |
| |
| |
| /********************************************************************** |
| * def_letter_is_okay |
| * |
| * Default way to check this letter in light of the current state. If |
| * everything is still OK then return TRUE. |
| **********************************************************************/ |
| // TODO(tkielbus) Change the prevchar argument to make it unicode safe. |
| // We might want to get rid of def_letter_is_okay at some point though. |
| namespace tesseract { |
| int Dict::def_letter_is_okay(void* void_dawg, void* void_node, int char_index, |
| char prevchar, const char* word, int word_end) { |
| EDGE_ARRAY dawg = reinterpret_cast<EDGE_ARRAY>(void_dawg); |
| NODE_REF* node = reinterpret_cast<NODE_REF*>(void_node); |
| EDGE_REF edge; |
| STRING dummy_word(word); // Auto-deleting string fixes memory leak. |
| STRING word_single_lengths; //Lengths of single UTF-8 characters of the word. |
| const char *ptr; |
| |
| for (ptr = word; *ptr != '\0';) { |
| int step = UNICHAR::utf8_step(ptr); |
| if (step == 0) |
| return FALSE; |
| word_single_lengths += step; |
| ptr += step; |
| } |
| |
| if (*node == NO_EDGE) { /* Trailing punctuation */ |
| if (trailing_punc(dummy_word[char_index]) && |
| punctuation_ok(dummy_word.string(), word_single_lengths.string()) >= 0) |
| return (TRUE); |
| else |
| return (FALSE); |
| } |
| // rays: removed incorrect code that attempted to enforce leading |
| // punctutation (or nothing) before an alpha character. |
| /* Check the DAWG */ |
| edge = edge_char_of(dawg, *node, |
| static_cast<unsigned char>(dummy_word[char_index]), |
| word_end); |
| |
| if (edge != NO_EDGE) { /* Normal edge in DAWG */ |
| if (case_sensative || case_is_okay (dummy_word, char_index)) { |
| //next_node (dawg, edge); |
| *node = next_node(dawg, edge); |
| if (*node == 0) |
| *node = NO_EDGE; |
| return (TRUE); |
| } else { |
| return (FALSE); |
| } |
| } |
| else { |
| /* Leading punctuation */ |
| if (leading_punc (word [char_index]) && |
| (char_index == 0 || leading_punc (dummy_word [char_index-1]))) { |
| *node = 0; |
| if (punctuation_ok(word, word_single_lengths.string()) >= 0) |
| return (TRUE); |
| else |
| return FALSE; |
| } |
| /* Trailing punctuation */ |
| if (verify_trailing_punct (dawg, &dummy_word[0], char_index)) { |
| *node = NO_EDGE; |
| return (TRUE); |
| } |
| |
| return (FALSE); |
| } |
| } |
| } // namespace tesseract |
| |
| /********************************************************************** |
| * num_forward_edges |
| * |
| * Count and return the number of forward edges for this node. |
| **********************************************************************/ |
| inT32 num_forward_edges(EDGE_ARRAY dawg, NODE_REF node) { |
| EDGE_REF edge = node; |
| inT32 num = 0; |
| |
| if (forward_edge (dawg, edge)) { |
| do { |
| num++; |
| } edge_loop (dawg, edge); |
| } |
| |
| return (num); |
| } |
| |
| |
| /********************************************************************** |
| * print_dawg_node |
| * |
| * Print the contents of one of the nodes in the DAWG. |
| **********************************************************************/ |
| void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node) { |
| EDGE_REF edge = node; |
| const char *forward_string = "FORWARD"; |
| const char *backward_string = " "; |
| |
| const char *last_string = "LAST"; |
| const char *not_last_string = " "; |
| |
| const char *eow_string = "EOW"; |
| const char *not_eow_string = " "; |
| |
| const char *direction; |
| const char *is_last; |
| const char *eow; |
| |
| int ch; |
| |
| if (edge_occupied (dawg, edge)) { |
| do { |
| if (forward_edge (dawg, edge)) direction = forward_string; |
| else direction = backward_string; |
| |
| if (last_edge (dawg, edge)) is_last = last_string; |
| else is_last = not_last_string; |
| |
| if (end_of_word (dawg, edge)) eow = eow_string; |
| else eow = not_eow_string; |
| |
| ch = edge_letter (dawg, edge); |
| cprintf (REFFORMAT " : next = " REFFORMAT ", char = '%c', %s %s %s\n", |
| edge, next_node (dawg, edge), ch, |
| direction, is_last, eow); |
| |
| if (edge - node > MAX_NODE_EDGES_DISPLAY) return; |
| } edge_loop (dawg, edge); |
| |
| if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) { |
| do { |
| if (forward_edge (dawg, edge)) direction = forward_string; |
| else direction = backward_string; |
| |
| if (last_edge (dawg, edge)) is_last = last_string; |
| else is_last = not_last_string; |
| |
| if (end_of_word (dawg, edge)) eow = eow_string; |
| else eow = not_eow_string; |
| |
| ch = edge_letter (dawg, edge); |
| cprintf (REFFORMAT " : next = " REFFORMAT ", char = '%c', %s %s %s\n", |
| edge, next_node (dawg, edge), ch, |
| direction, is_last, eow); |
| |
| if (edge - node > MAX_NODE_EDGES_DISPLAY) return; |
| } edge_loop (dawg, edge); |
| } |
| } |
| else { |
| cprintf (REFFORMAT " : no edges in this node\n", node); |
| } |
| cprintf("\n"); |
| } |
| |
| |
| /********************************************************************** |
| * read_squished_dawg |
| * |
| * Read the DAWG from a file and return it. Must be freed with memfree. |
| **********************************************************************/ |
| EDGE_ARRAY read_squished_dawg(const char *filename) { |
| FILE *file; |
| EDGE_REF edge; |
| inT32 num_edges = 0; |
| inT32 node_count = 0; |
| |
| if (debug) print_string ("read_debug"); |
| |
| #ifdef __UNIX__ |
| file = open_file (filename, "r"); |
| #else |
| file = open_file (filename, "rb"); |
| #endif |
| fread (&num_edges, sizeof (inT32), 1, file); |
| num_edges = ntohl(num_edges); |
| if (num_edges > MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE || num_edges < 0) { |
| tprintf("(ENDIAN)Error: trying to read a DAWG '%s' that contains " |
| "%d edges while the maximum is %d.\n", |
| filename, num_edges, MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE); |
| exit(1); |
| } |
| |
| uinT32 *dawg_32 = (uinT32*) Emalloc(num_edges * sizeof (uinT32)); |
| fread(&dawg_32[0], sizeof (uinT32), num_edges, file); |
| fclose(file); |
| EDGE_ARRAY dawg = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges); |
| |
| for (edge = 0; edge < num_edges; ++edge) |
| dawg[edge] = ntohl(dawg_32[edge]); |
| |
| Efree(dawg_32); |
| |
| for (edge = 0; edge < num_edges; ++edge) |
| if (last_edge (dawg, edge)) node_count++; |
| |
| return dawg; |
| } |
| |
| |
| /********************************************************************** |
| * verify_trailing_punct |
| * |
| * Make sure that there is a valid transition from the word core to a |
| * string of trailing puntuation. TRUE is returned if everything is |
| * OK. |
| **********************************************************************/ |
| namespace tesseract { |
| inT32 Dict::verify_trailing_punct(EDGE_ARRAY dawg, char *word, |
| inT32 char_index) { |
| char last_char; |
| char *first_char; |
| |
| if (trailing_punc (word [char_index])) { |
| |
| last_char = word [char_index]; |
| word [char_index] = (char) 0; |
| |
| for (first_char = word; leading_punc (first_char[0]); first_char++); |
| |
| if (word_in_dawg (dawg, first_char)) { |
| word [char_index] = last_char; |
| return (TRUE); |
| } |
| word [char_index] = last_char; |
| } |
| return (FALSE); |
| } |
| |
| |
| /********************************************************************** |
| * word_in_dawg |
| * |
| * Test to see if the word can be found in the DAWG. |
| **********************************************************************/ |
| inT32 Dict::word_in_dawg(EDGE_ARRAY dawg, const char *string) { |
| NODE_REF node = 0; |
| inT32 i; |
| inT32 length; |
| |
| length=strlen(string); |
| if (length==0) |
| return FALSE; |
| for (i=0; i<length; i++) { |
| if (debug > 1) { |
| print_dawg_node(dawg, node); |
| new_line(); |
| } |
| if (!(this->*letter_is_okay_)(dawg, &node, i, '\0', string, |
| (string[i+1]==0))) { |
| return (FALSE); |
| } |
| } |
| |
| return (TRUE); |
| } |
| } // namespace tesseract |