// dict/context.cpp - platform/external/tesseract - Git at Google

 /* -*-C-*-
  ********************************************************************************
  *
  * File:        context.c  (Formerly context.c)
  * Description:  Context checking functions
  * Author:       Mark Seaman, OCR Technology
  * Created:      Thu Feb 15 11:18:24 1990
  * Modified:     Tue Jul  9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
  * Language:     C
  * Package:      N/A
  * Status:       Experimental (Do Not Distribute)
  *
  * (c) Copyright 1990, Hewlett-Packard Company.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  *********************************************************************************/
 #include "context.h"

 #include "callcpp.h"
 #include "ccutil.h"
 #include "dict.h"
 #include "globals.h"
 #include "image.h"
 #include "ratngs.h"
 #include "tordvars.h"
 #include "unicharset.h"

 #include <stdio.h>
 #include <ctype.h>
 #include <string.h>
 #include <math.h>

 // Global function pointer for the probability-in-context callback. It is
 // initialized to def_probability_in_context, the default implementation
 // defined in this file, which ignores its arguments and always returns 0.0.
 // A main program can override this by assigning a different function.
 PROBABILITY_IN_CONTEXT_FUNCTION probability_in_context = &def_probability_in_context;

 // Default probability-in-context implementation.
 //
 // context / context_bytes:     ignored.
 // character / character_bytes: ignored.
 //
 // Always returns 0.0, regardless of the arguments.
 double def_probability_in_context(const char* context,
                                   int context_bytes,
                                   const char* character,
                                   int character_bytes) {
   const double default_probability = 0.0;

   // None of the parameters are consulted; reference them once so that
   // unused-parameter warnings stay quiet.
   (void) character_bytes;
   (void) character;
   (void) context_bytes;
   (void) context;

   return default_probability;
 }

 /*----------------------------------------------------------------------
               V a r i a b l e s
 ----------------------------------------------------------------------*/
 /* Stream that choice output is written to. It starts out NULL, and
  * close_choices() and write_choice_line() both check for NULL before
  * touching it. */
 static FILE *choice_file = NULL; /* File to save choices */

 /*----------------------------------------------------------------------
               F u n c t i o n s
 ----------------------------------------------------------------------*/
 /**********************************************************************
  * close_choices
  *
  * Close the choices file if it is open. The handle is reset to NULL
  * afterwards so that a repeated call, or a later write_choice_line(),
  * does not touch a stream that has already been closed.
  **********************************************************************/
 void close_choices() {
   if (choice_file) {
     fclose(choice_file);
     choice_file = NULL;  /* Using a FILE* after fclose is undefined. */
   }
 }

 namespace tesseract {

 /**********************************************************************
  * case_ok
  *
  * Check a word to see if its capitalization matches a set of lexical
  * rules. Every unichar of the word is classified as upper case, lower
  * case, digit, or "other", and that class drives the state machine
  * encoded in case_state_table.
  *
  * Returns FALSE as soon as a transition lands on the error state (-1).
  * Otherwise returns zero if the word ends in state 5 (only a single
  * initial lower case letter was seen) and non-zero in every other
  * case, including an empty word.
  **********************************************************************/
 int Context::case_ok(const WERD_CHOICE &word,
                      const UNICHARSET &unicharset) {
   // Transition table indexed as [current state][character class].
   // Character class columns:
   //   0 = P: anything that is not upper case, lower case or a digit
   //          (presumably punctuation)
   //   1 = U: upper case
   //   2 = L: lower case
   //   3 = D: digit
   // An entry of -1 means "error on case".
   static const int case_state_table[6][4] = {
     /*                                    P   U   L   D */
     /* 0. Beginning of word         */ {  0,  1,  5,  4 },
     /* 1. After initial capital     */ {  0,  3,  2,  4 },
     /* 2. After lower case          */ {  0, -1,  2, -1 },
     /* 3. After upper case          */ {  0,  3, -1,  4 },
     /* 4. After a digit             */ {  0, -1, -1,  4 },
     /* 5. After initial lower case  */ {  5, -1,  2, -1 },
   };

   int state = 0;

   for (int x = 0; x < word.length(); ++x) {
     UNICHAR_ID ch_id = word.unichar_id(x);
     if (unicharset.get_isupper(ch_id))
       state = case_state_table[state][1];
     else if (unicharset.get_islower(ch_id))
       state = case_state_table[state][2];
     else if (unicharset.get_isdigit(ch_id))
       state = case_state_table[state][3];
     else
       state = case_state_table[state][0];

     if (tord_debug_3)
       tprintf("Case state = %d, char = %s\n", state,
               unicharset.id_to_unichar(ch_id));

     // Stop at the first illegal transition; -1 must never be used as a
     // row index into the table.
     if (state == -1)
       return (FALSE);
   }
   return state != 5;             /* single lower is bad */
 }
 }  // namespace tesseract


 /**********************************************************************
  * write_choice_line
  *
  * Emit an empty line into the choices file, which marks that a new
  * word follows, and flush the stream. Nothing happens when no choices
  * file is open.
  **********************************************************************/
 void write_choice_line() {
   if (!choice_file)
     return;

   fputs("\n", choice_file);
   fflush(choice_file);
 }
	/* -*-C-*-
	********************************************************************************
	*
	* File: context.c (Formerly context.c)
	* Description: Context checking functions
	* Author: Mark Seaman, OCR Technology
	* Created: Thu Feb 15 11:18:24 1990
	* Modified: Tue Jul 9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
	* Language: C
	* Package: N/A
	* Status: Experimental (Do Not Distribute)
	*
	* (c) Copyright 1990, Hewlett-Packard Company.
	** Licensed under the Apache License, Version 2.0 (the "License");
	** you may not use this file except in compliance with the License.
	** You may obtain a copy of the License at
	** http://www.apache.org/licenses/LICENSE-2.0
	** Unless required by applicable law or agreed to in writing, software
	** distributed under the License is distributed on an "AS IS" BASIS,
	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	** See the License for the specific language governing permissions and
	** limitations under the License.
	*
	*********************************************************************************/
	#include "context.h"

	#include "callcpp.h"
	#include "ccutil.h"
	#include "dict.h"
	#include "globals.h"
	#include "image.h"
	#include "ratngs.h"
	#include "tordvars.h"
	#include "unicharset.h"

	#include <stdio.h>
	#include <ctype.h>
	#include <string.h>
	#include <math.h>

	// Global function pointer for the probability-in-context callback. It is
	// initialized to def_probability_in_context, the default implementation
	// defined in this file, which ignores its arguments and always returns 0.0.
	// A main program can override this by assigning a different function.
	PROBABILITY_IN_CONTEXT_FUNCTION probability_in_context = &def_probability_in_context;

	// Default probability-in-context implementation.
	//
	// context / context_bytes:     ignored.
	// character / character_bytes: ignored.
	//
	// Always returns 0.0, regardless of the arguments.
	double def_probability_in_context(const char* context,
	                                  int context_bytes,
	                                  const char* character,
	                                  int character_bytes) {
	  const double default_probability = 0.0;

	  // None of the parameters are consulted; reference them once so that
	  // unused-parameter warnings stay quiet.
	  (void) character_bytes;
	  (void) character;
	  (void) context_bytes;
	  (void) context;

	  return default_probability;
	}

	/*----------------------------------------------------------------------
	V a r i a b l e s
	----------------------------------------------------------------------*/
	/* Stream that choice output is written to. It must be a pointer: a FILE
	 * object cannot be initialized from NULL. It starts out NULL, and
	 * close_choices() and write_choice_line() both check for NULL before
	 * touching it. */
	static FILE *choice_file = NULL; /* File to save choices */

	/*----------------------------------------------------------------------
	F u n c t i o n s
	----------------------------------------------------------------------*/
	/**********************************************************************
	 * close_choices
	 *
	 * Close the choices file if it is open. The handle is reset to NULL
	 * afterwards so that a repeated call, or a later write_choice_line(),
	 * does not touch a stream that has already been closed.
	 **********************************************************************/
	void close_choices() {
	  if (choice_file) {
	    fclose(choice_file);
	    choice_file = NULL;  /* Using a FILE* after fclose is undefined. */
	  }
	}

	namespace tesseract {

	/**********************************************************************
	 * case_ok
	 *
	 * Check a word to see if its capitalization matches a set of lexical
	 * rules. Every unichar of the word is classified as upper case, lower
	 * case, digit, or "other", and that class drives the state machine
	 * encoded in case_state_table.
	 *
	 * Returns FALSE as soon as a transition lands on the error state (-1).
	 * Otherwise returns zero if the word ends in state 5 (only a single
	 * initial lower case letter was seen) and non-zero in every other
	 * case, including an empty word.
	 **********************************************************************/
	int Context::case_ok(const WERD_CHOICE &word,
	                     const UNICHARSET &unicharset) {
	  // Transition table indexed as [current state][character class].
	  // Character class columns:
	  //   0 = P: anything that is not upper case, lower case or a digit
	  //          (presumably punctuation)
	  //   1 = U: upper case
	  //   2 = L: lower case
	  //   3 = D: digit
	  // An entry of -1 means "error on case".
	  static const int case_state_table[6][4] = {
	    /*                                    P   U   L   D */
	    /* 0. Beginning of word         */ {  0,  1,  5,  4 },
	    /* 1. After initial capital     */ {  0,  3,  2,  4 },
	    /* 2. After lower case          */ {  0, -1,  2, -1 },
	    /* 3. After upper case          */ {  0,  3, -1,  4 },
	    /* 4. After a digit             */ {  0, -1, -1,  4 },
	    /* 5. After initial lower case  */ {  5, -1,  2, -1 },
	  };

	  int state = 0;

	  for (int x = 0; x < word.length(); ++x) {
	    UNICHAR_ID ch_id = word.unichar_id(x);
	    if (unicharset.get_isupper(ch_id))
	      state = case_state_table[state][1];
	    else if (unicharset.get_islower(ch_id))
	      state = case_state_table[state][2];
	    else if (unicharset.get_isdigit(ch_id))
	      state = case_state_table[state][3];
	    else
	      state = case_state_table[state][0];

	    if (tord_debug_3)
	      tprintf("Case state = %d, char = %s\n", state,
	              unicharset.id_to_unichar(ch_id));

	    // Stop at the first illegal transition; -1 must never be used as a
	    // row index into the table.
	    if (state == -1)
	      return (FALSE);
	  }
	  return state != 5;             /* single lower is bad */
	}
	}  // namespace tesseract


	/**********************************************************************
	 * write_choice_line
	 *
	 * Emit an empty line into the choices file, which marks that a new
	 * word follows, and flush the stream. Nothing happens when no choices
	 * file is open.
	 **********************************************************************/
	void write_choice_line() {
	  if (!choice_file)
	    return;

	  fputs("\n", choice_file);
	  fflush(choice_file);
	}