dict/context.cpp - platform/external/tesseract - Git at Google

 /* -*-C-*-
  ********************************************************************************
  *
  * File:        context.c  (Formerly context.c)
  * Description:  Context checking functions
  * Author:       Mark Seaman, OCR Technology
  * Created:      Thu Feb 15 11:18:24 1990
  * Modified:     Tue Jul  9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
  * Language:     C
  * Package:      N/A
  * Status:       Experimental (Do Not Distribute)
  *
  * (c) Copyright 1990, Hewlett-Packard Company.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  *********************************************************************************/
 #include "context.h"
 #include "tordvars.h"
 #include "callcpp.h"
 #include "globals.h"
 #include "dict.h"
 #include "image.h"
 #include "ccutil.h"

 #include <stdio.h>
 #include <ctype.h>
 #include <string.h>
 #include <math.h>

 // Initialize probability_in_context to point to a default implementation (a
 // main program can override this).
 PROBABILITY_IN_CONTEXT_FUNCTION probability_in_context = &def_probability_in_context;

 double def_probability_in_context(const char* context,
                                   int context_bytes,
                                   const char* character,
                                   int character_bytes) {
   (void) context;
   (void) context_bytes;
   (void) character;
   (void) character_bytes;
   return 0.0;
 }

 /*----------------------------------------------------------------------
               V a r i a b l e s
 ----------------------------------------------------------------------*/
 static FILE *choice_file = NULL; /* File to save choices */

 /*----------------------------------------------------------------------
               F u n c t i o n s
 ----------------------------------------------------------------------*/
 /**********************************************************************
  * close_choices
  *
  * Close the choices file.
  **********************************************************************/
 void close_choices() {
   if (choice_file)
     fclose(choice_file);
 }


 /**********************************************************************
  * fix_quotes
  *
  * Fix up two single quote to make them two double quotes.
  **********************************************************************/
 void fix_quotes(char *str) {
   int i;
   for (i = 0; i < strlen (str); i++) {

     if (((str[i] == '\'') || (str[i] == '`')) &&
     ((str[i + 1] == '\'') || (str[i + 1] == '`'))) {
       str[i] = '\"';
       strcpy (str + i + 1, str + i + 2);
     }
   }
 }


 /**********************************************************************
  * punctuation_ok
  *
  * Check a string to see if it matches a set of punctuation rules.
  **********************************************************************/
 namespace tesseract {
 int Dict::punctuation_ok(const char *word, const char *lengths) {
   int punctuation_types[5];
   int trailing = 0;
   int num_puncts = 0;
   register int x;
   int offset;
   UNICHAR_ID ch_id;

   for (x = 0; x < 5; x++)
     punctuation_types[x] = 0;

   // check for un-supported symbols
   for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {
     // a un-supported symbol
     if (!getUnicharset().contains_unichar (word + offset,
                                                                lengths[x])) {
       return -1;
     }
   }

   for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {
     if (getUnicharset().get_isalpha (word + offset, lengths[x])) {
       if (trailing &&
         !(getUnicharset().get_isalpha(
             word + offset - lengths[x - 1], lengths[x - 1])
 #if 0
           ||
         (word[x - 1] == '\'' &&
         (word[x] == 's' || word[x] == 'd' || word[x] == 'l')) ||
         (word[x - 1] == '-')
 #endif
           ))
         return (-1);
       trailing = 1;
     }
     else {
       ch_id = getUnicharset().unichar_to_id(word + offset, lengths[x]);

       if (getUnicharset().eq(ch_id, ".") && trailing) {
         if (punctuation_types[0])
           return (-1);
         (punctuation_types[0])++;
       }

       else if (((getUnicharset().eq(ch_id, "{")) ||
                 (getUnicharset().eq(ch_id, "[")) ||
                 (getUnicharset().eq(ch_id, "("))) && !trailing) {
         if (punctuation_types[1])
           return (-1);
         (punctuation_types[1])++;
       }

       else if (((getUnicharset().eq(ch_id, "}")) ||
                 (getUnicharset().eq(ch_id, "]")) ||
                 (getUnicharset().eq(ch_id, ")"))) && trailing) {
         if (punctuation_types[2])
           return (-1);
         (punctuation_types[2])++;
       }

       else if (((getUnicharset().eq(ch_id, ":")) ||
                 (getUnicharset().eq(ch_id, ";")) ||
                 (getUnicharset().eq(ch_id, "!")) ||
                 (getUnicharset().eq(ch_id, "-")) ||
                 (getUnicharset().eq(ch_id, ",")) ||
                 (getUnicharset().eq(ch_id, "?"))) && trailing) {
         if (punctuation_types[3])
           return (-1);
         (punctuation_types[3])++;
         if (getUnicharset().eq(ch_id, "-"))
           punctuation_types[3] = 0;
       }

       else if (x < strlen(lengths) - 1 &&
                ((getUnicharset().eq(ch_id, "`")) ||
                (getUnicharset().eq(ch_id, "\"")) ||
                (getUnicharset().eq(ch_id, "\'")))) {
         UNICHAR_ID ch_id2 = getUnicharset().unichar_to_id(word + offset + lengths[x],
                                                      lengths[x + 1]);
         if ((getUnicharset().eq(ch_id2, "`")) ||
             (getUnicharset().eq(ch_id2, "\'"))) {
           offset += lengths[x++];
         }
         (punctuation_types[4])++;
         if (punctuation_types[4] > 2)
           return (-1);
       }

       else if (!getUnicharset().get_isdigit (ch_id))
         return (-1);
     }
   }

   for (x = 0; x < 5; x++) {
     if (punctuation_types[x])
       num_puncts++;
   }

   return (num_puncts);
 }


 /**********************************************************************
  * case_ok
  *
  * Check a string to see if it matches a set of lexical rules.
  **********************************************************************/
 int Dict::case_ok(const char *word, const char *lengths) {
   static int case_state_table[6][4] = { {
                                  /*  0. Begining of word         */
     /*    P   U   L   D                                     */
     /* -1. Error on case            */
       0, 1, 5, 4
     },
     {                            /*  1. After initial capital    */
       0, 3, 2, 4
     },
     {                            /*  2. After lower case         */
       0, -1, 2, -1
     },
     {                            /*  3. After upper case         */
       0, 3, -1, 4
     },
     {                            /*  4. After a digit            */
       0, -1, -1, 4
     },
     {                            /*  5. After initial lower case */
       5, -1, 2, -1
     },
   };

   register int last_state = 0;
   register int state = 0;
   register int x;
   int offset;
   UNICHAR_ID ch_id;

   for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {

     ch_id = getUnicharset().unichar_to_id(word + offset, lengths[x]);
     if (getUnicharset().get_isupper(ch_id))
       state = case_state_table[state][1];
     else if (getUnicharset().get_isalpha(ch_id))
       state = case_state_table[state][2];
     else if (getUnicharset().get_isdigit(ch_id))
       state = case_state_table[state][3];
     else
       state = case_state_table[state][0];

     if (debug_3)
       cprintf ("Case state = %d, char = %s\n", state,
                getUnicharset().id_to_unichar(ch_id));
     if (state == -1) {
                                  /* Handle ACCRONYMs */
 #if 0
       if (word[x] == 's' &&
         !isalpha (word[x + 1]) && !isdigit (word[x + 1]))
         state = last_state;
       else
 #endif
         return (FALSE);
     }

     last_state = state;
   }
   return state != 5;             /*single lower is bad */
 }
 }  // namespace tesseract


 /**********************************************************************
  * write_choice_line
  *
  * Write a blank line to the choices file.  This will indicate that
  * there is a new word that is following.
  **********************************************************************/
 void write_choice_line() {
   if (choice_file) {
     fprintf (choice_file, "\n");
     fflush(choice_file);
   }
 }
	/* --C--
	********************************************************************************
	*
	* File: context.c (Formerly context.c)
	* Description: Context checking functions
	* Author: Mark Seaman, OCR Technology
	* Created: Thu Feb 15 11:18:24 1990
	* Modified: Tue Jul 9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
	* Language: C
	* Package: N/A
	* Status: Experimental (Do Not Distribute)
	*
	* (c) Copyright 1990, Hewlett-Packard Company.
	** Licensed under the Apache License, Version 2.0 (the "License");
	** you may not use this file except in compliance with the License.
	** You may obtain a copy of the License at
	** http://www.apache.org/licenses/LICENSE-2.0
	** Unless required by applicable law or agreed to in writing, software
	** distributed under the License is distributed on an "AS IS" BASIS,
	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	** See the License for the specific language governing permissions and
	** limitations under the License.
	*
	*********************************************************************************/
	#include "context.h"
	#include "tordvars.h"
	#include "callcpp.h"
	#include "globals.h"
	#include "dict.h"
	#include "image.h"
	#include "ccutil.h"

	#include <stdio.h>
	#include <ctype.h>
	#include <string.h>
	#include <math.h>

	// Initialize probability_in_context to point to a default implementation (a
	// main program can override this).
	PROBABILITY_IN_CONTEXT_FUNCTION probability_in_context = &def_probability_in_context;

	double def_probability_in_context(const char* context,
	int context_bytes,
	const char* character,
	int character_bytes) {
	(void) context;
	(void) context_bytes;
	(void) character;
	(void) character_bytes;
	return 0.0;
	}

	/*----------------------------------------------------------------------
	V a r i a b l e s
	----------------------------------------------------------------------*/
	static FILE choice_file = NULL; / File to save choices */

	/*----------------------------------------------------------------------
	F u n c t i o n s
	----------------------------------------------------------------------*/
	/**********************************************************************
	* close_choices
	*
	* Close the choices file.
	**********************************************************************/
	void close_choices() {
	if (choice_file)
	fclose(choice_file);
	}


	/**********************************************************************
	* fix_quotes
	*
	* Fix up two single quote to make them two double quotes.
	**********************************************************************/
	void fix_quotes(char *str) {
	int i;
	for (i = 0; i < strlen (str); i++) {

	if (((str[i] == '\'') \|\| (str[i] == '`')) &&
	((str[i + 1] == '\'') \|\| (str[i + 1] == '`'))) {
	str[i] = '\"';
	strcpy (str + i + 1, str + i + 2);
	}
	}
	}


	/**********************************************************************
	* punctuation_ok
	*
	* Check a string to see if it matches a set of punctuation rules.
	**********************************************************************/
	namespace tesseract {
	int Dict::punctuation_ok(const char word, const char lengths) {
	int punctuation_types[5];
	int trailing = 0;
	int num_puncts = 0;
	register int x;
	int offset;
	UNICHAR_ID ch_id;

	for (x = 0; x < 5; x++)
	punctuation_types[x] = 0;

	// check for un-supported symbols
	for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {
	// a un-supported symbol
	if (!getUnicharset().contains_unichar (word + offset,
	lengths[x])) {
	return -1;
	}
	}

	for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {
	if (getUnicharset().get_isalpha (word + offset, lengths[x])) {
	if (trailing &&
	!(getUnicharset().get_isalpha(
	word + offset - lengths[x - 1], lengths[x - 1])
	#if 0
	\|\|
	(word[x - 1] == '\'' &&
	(word[x] == 's' \|\| word[x] == 'd' \|\| word[x] == 'l')) \|\|
	(word[x - 1] == '-')
	#endif
	))
	return (-1);
	trailing = 1;
	}
	else {
	ch_id = getUnicharset().unichar_to_id(word + offset, lengths[x]);

	if (getUnicharset().eq(ch_id, ".") && trailing) {
	if (punctuation_types[0])
	return (-1);
	(punctuation_types[0])++;
	}

	else if (((getUnicharset().eq(ch_id, "{")) \|\|
	(getUnicharset().eq(ch_id, "[")) \|\|
	(getUnicharset().eq(ch_id, "("))) && !trailing) {
	if (punctuation_types[1])
	return (-1);
	(punctuation_types[1])++;
	}

	else if (((getUnicharset().eq(ch_id, "}")) \|\|
	(getUnicharset().eq(ch_id, "]")) \|\|
	(getUnicharset().eq(ch_id, ")"))) && trailing) {
	if (punctuation_types[2])
	return (-1);
	(punctuation_types[2])++;
	}

	else if (((getUnicharset().eq(ch_id, ":")) \|\|
	(getUnicharset().eq(ch_id, ";")) \|\|
	(getUnicharset().eq(ch_id, "!")) \|\|
	(getUnicharset().eq(ch_id, "-")) \|\|
	(getUnicharset().eq(ch_id, ",")) \|\|
	(getUnicharset().eq(ch_id, "?"))) && trailing) {
	if (punctuation_types[3])
	return (-1);
	(punctuation_types[3])++;
	if (getUnicharset().eq(ch_id, "-"))
	punctuation_types[3] = 0;
	}

	else if (x < strlen(lengths) - 1 &&
	((getUnicharset().eq(ch_id, "`")) \|\|
	(getUnicharset().eq(ch_id, "\"")) \|\|
	(getUnicharset().eq(ch_id, "\'")))) {
	UNICHAR_ID ch_id2 = getUnicharset().unichar_to_id(word + offset + lengths[x],
	lengths[x + 1]);
	if ((getUnicharset().eq(ch_id2, "`")) \|\|
	(getUnicharset().eq(ch_id2, "\'"))) {
	offset += lengths[x++];
	}
	(punctuation_types[4])++;
	if (punctuation_types[4] > 2)
	return (-1);
	}

	else if (!getUnicharset().get_isdigit (ch_id))
	return (-1);
	}
	}

	for (x = 0; x < 5; x++) {
	if (punctuation_types[x])
	num_puncts++;
	}

	return (num_puncts);
	}


	/**********************************************************************
	* case_ok
	*
	* Check a string to see if it matches a set of lexical rules.
	**********************************************************************/
	int Dict::case_ok(const char word, const char lengths) {
	static int case_state_table[6][4] = { {
	/* 0. Begining of word */
	/* P U L D */
	/* -1. Error on case */
	0, 1, 5, 4
	},
	{ /* 1. After initial capital */
	0, 3, 2, 4
	},
	{ /* 2. After lower case */
	0, -1, 2, -1
	},
	{ /* 3. After upper case */
	0, 3, -1, 4
	},
	{ /* 4. After a digit */
	0, -1, -1, 4
	},
	{ /* 5. After initial lower case */
	5, -1, 2, -1
	},
	};

	register int last_state = 0;
	register int state = 0;
	register int x;
	int offset;
	UNICHAR_ID ch_id;

	for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {

	ch_id = getUnicharset().unichar_to_id(word + offset, lengths[x]);
	if (getUnicharset().get_isupper(ch_id))
	state = case_state_table[state][1];
	else if (getUnicharset().get_isalpha(ch_id))
	state = case_state_table[state][2];
	else if (getUnicharset().get_isdigit(ch_id))
	state = case_state_table[state][3];
	else
	state = case_state_table[state][0];

	if (debug_3)
	cprintf ("Case state = %d, char = %s\n", state,
	getUnicharset().id_to_unichar(ch_id));
	if (state == -1) {
	/* Handle ACCRONYMs */
	#if 0
	if (word[x] == 's' &&
	!isalpha (word[x + 1]) && !isdigit (word[x + 1]))
	state = last_state;
	else
	#endif
	return (FALSE);
	}

	last_state = state;
	}
	return state != 5; /single lower is bad /
	}
	} // namespace tesseract


	/**********************************************************************
	* write_choice_line
	*
	* Write a blank line to the choices file. This will indicate that
	* there is a new word that is following.
	**********************************************************************/
	void write_choice_line() {
	if (choice_file) {
	fprintf (choice_file, "\n");
	fflush(choice_file);
	}
	}