ccutil/ambigs.h - platform/external/tesseract - Git at Google

 ///////////////////////////////////////////////////////////////////////
 // File:        ambigs.h
 // Description: Constants, flags, functions for dealing with
 //              ambiguities (training and recognition).
 // Author:      Daria Antonova
 // Created:     Mon Aug 23 11:26:43 PDT 2008
 //
 // (C) Copyright 2008, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 ///////////////////////////////////////////////////////////////////////

 #ifndef TESSERACT_CCUTIL_AMBIGS_H_
 #define TESSERACT_CCUTIL_AMBIGS_H_

 #include "elst.h"
 #include "tprintf.h"
 #include "unichar.h"
 #include "unicharset.h"
 #include "genericvector.h"

 // Size bound for ambiguity ngrams: AmbigSpec::wrong_ngram and
 // AmbigSpec::correct_fragments are declared with MAX_AMBIG_SIZE + 1 elements.
 // NOTE(review): the extra element is presumably room for the
 // INVALID_UNICHAR_ID terminator - confirm against the code that fills them.
 #define MAX_AMBIG_SIZE    10

 // Declarations of the ambiguity-related parameters. INT_VAR_H and BOOL_VAR_H
 // are macros defined outside this file; the string is the parameter's
 // description.
 extern INT_VAR_H(global_ambigs_debug_level, 0,
                  "Debug level for unichar ambiguities");
 extern BOOL_VAR_H(use_definite_ambigs_for_classifier, 0,
                   "Use definite ambiguities when running character classifier");

 namespace tesseract {

 // Buffer size for unigram ambiguities; not referenced within this header.
 static const int kUnigramAmbigsBufferSize = 1000;
 // Ngram separator string: a single space followed by the terminating NUL.
 static const char kAmbigNgramSeparator[] = " ";
 // Delimiter characters: tab and space.
 static const char kAmbigDelimiters[] = "\t ";
 // printf-style message templates: kIllegalMsg expects a line number (%d),
 // kIllegalUnicharMsg expects a unichar string (%s).
 static const char kIllegalMsg[] =
     "Illegal ambiguity specification on line %d\n";
 static const char kIllegalUnicharMsg[] =
     "Illegal unichar %s in ambiguity specification\n";

 // Classification of an (ocred ngram, correct ngram) pair.
 enum AmbigType {
   // The ngram pair is not ambiguous.
   NOT_AMBIG = 0,
   // The ocred ngram should always be substituted with the correct one.
   REPLACE_AMBIG,
   // Add the correct ngram to the classifier results (1-1).
   DEFINITE_AMBIG,
   // Use the pairwise classifier for the ocred/correct pair (1-1).
   SIMILAR_AMBIG,
   // This is a case ambiguity (1-1).
   CASE_AMBIG,

   // Number of enum entries; keep this enumerator last.
   AMBIG_TYPE_COUNT
 };

 // Static helpers that operate on arrays of UNICHAR_IDs whose end is marked
 // by an INVALID_UNICHAR_ID element.
 class UnicharIdArrayUtils {
  public:
   // Lexicographically compares two INVALID_UNICHAR_ID-terminated arrays.
   // Returns -1 if array1 orders first (it holds the smaller id at the first
   // position where the arrays differ, or it is a proper prefix of array2),
   // 0 if both arrays hold the same ids, and 1 otherwise.
   static inline int compare(const UNICHAR_ID array1[],
                             const UNICHAR_ID array2[]) {
     int i = 0;
     for (; array1[i] != INVALID_UNICHAR_ID &&
            array2[i] != INVALID_UNICHAR_ID; ++i) {
       if (array1[i] < array2[i]) return -1;
       if (array1[i] > array2[i]) return 1;
     }
     // At least one array has ended here; the shorter one orders first.
     const bool first_ended = array1[i] == INVALID_UNICHAR_ID;
     const bool second_ended = array2[i] == INVALID_UNICHAR_ID;
     if (first_ended && second_ended) return 0;
     return first_ended ? -1 : 1;
   }

   // Copies the ids from src into dst, including the INVALID_UNICHAR_ID
   // terminator. Returns the number of ids copied, not counting the
   // terminator. The caller must make sure dst has room for every element of
   // src plus the terminator.
   static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
     int count = 0;
     while (src[count] != INVALID_UNICHAR_ID) {
       dst[count] = src[count];
       ++count;
     }
     dst[count] = INVALID_UNICHAR_ID;
     return count;
   }

   // Prints the unichars for the ids in array (or "[Empty]" when the array
   // holds no ids), followed by the numeric ids in parentheses and a newline.
   // array must be terminated by INVALID_UNICHAR_ID.
   static inline void print(const UNICHAR_ID array[],
                            const UNICHARSET &unicharset) {
     if (array[0] == INVALID_UNICHAR_ID) tprintf("[Empty]");
     for (int i = 0; array[i] != INVALID_UNICHAR_ID; ++i) {
       tprintf("%s ", unicharset.id_to_unichar(array[i]));
     }
     tprintf("( ");
     for (int j = 0; array[j] != INVALID_UNICHAR_ID; ++j) {
       tprintf("%d ", array[j]);
     }
     tprintf(")\n");
   }
 };

 // A single ambiguity specification, linkable into an AmbigSpec_LIST. Per the
 // original note, such a list stores dangerous ambigs that start with the
 // same unichar (e.g. r->t rn->m rr1->m).
 class AmbigSpec : public ELIST_LINK {
  public:
   AmbigSpec();
   ~AmbigSpec() {}

   // Comparator for sorting AmbigSpec_LISTs by their wrong_ngram arrays.
   // spec1 and spec2 each point to an AmbigSpec pointer. The result is that
   // of UnicharIdArrayUtils::compare on the two wrong_ngram arrays, so a
   // sorted list could hold wrong_ngrams in the order
   // [9 1 3], [9 3 4], [9 8], [9 8 1].
   static int compare_ambig_specs(const void *spec1, const void *spec2) {
     const AmbigSpec *const *lhs =
       static_cast<const AmbigSpec *const *>(spec1);
     const AmbigSpec *const *rhs =
       static_cast<const AmbigSpec *const *>(spec2);
     return UnicharIdArrayUtils::compare((*lhs)->wrong_ngram,
                                         (*rhs)->wrong_ngram);
   }

   // Sort key used by compare_ambig_specs.
   UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
   UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
   UNICHAR_ID correct_ngram_id;
   AmbigType type;
   // NOTE(review): presumably the number of ids stored in wrong_ngram -
   // confirm against the code that fills this struct.
   int wrong_ngram_size;
 };
 ELISTIZEH(AmbigSpec);

 // Ambiguity table type: element i points to the list of ambiguities whose
 // wrong ngram starts with unichar id i.
 typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
 // A vector of unichar ids.
 typedef GenericVector<UNICHAR_ID> UnicharIdVector;

 // Holds the ambiguity tables filled in by LoadUnicharAmbigs().
 class UnicharAmbigs {
  public:
   UnicharAmbigs() {}
   // Calls delete_data_pointers() on each of the three tables.
   ~UnicharAmbigs() {
     replace_ambigs_.delete_data_pointers();
     dang_ambigs_.delete_data_pointers();
     one_to_one_definite_ambigs_.delete_data_pointers();
   }

   // Read-only access to the dangerous and the replaceable ambiguity tables.
   const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
   const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }

   // Fills in the two ambiguity tables (replaceable and dangerous) with the
   // information read from the ambigs file. An ambiguity table is an array of
   // lists indexed by class id; each entry lists the potential ambiguities
   // that can start with the corresponding character. For example, the
   // ambiguity "rn -> m" is stored in the table at index
   // unicharset.unichar_to_id('r').
   // 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
   // one_to_one_definite_ambigs_. That vector is also indexed by the class id
   // of the wrong part of the ambiguity, and each entry holds a vector of the
   // unichar ids that are ambiguous to it.
   void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset,
                          UNICHARSET *unicharset);

   // Returns the definite 1-1 ambigs recorded for unichar_id, or NULL when no
   // 1-1 ambigs have been recorded at all. unichar_id is used directly as an
   // index into the vector.
   const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
     if (!one_to_one_definite_ambigs_.empty()) {
       return one_to_one_definite_ambigs_[unichar_id];
     }
     return NULL;
   }

  private:

   // Helpers declared here and defined elsewhere.
   // NOTE(review): presumably used by LoadUnicharAmbigs() to parse one line
   // of the ambigs file and to store the result - confirm in the .cpp file.
   bool ParseAmbiguityLine(int line_num, int version,
                           const UNICHARSET &unicharset, char *buffer,
                           int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
                           int *ReplacementAmbigPartSize,
                           char *ReplacementString, int *type);
   void InsertIntoTable(UnicharAmbigsVector &table,
                        int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
                        int ReplacementAmbigPartSize,
                        const char *ReplacementString, int type,
                        AmbigSpec *ambig_spec, UNICHARSET *unicharset);

   // Tables indexed by the class id of the first unichar of the wrong ngram.
   UnicharAmbigsVector dang_ambigs_;
   UnicharAmbigsVector replace_ambigs_;
   // 1-1 definite ambigs, indexed by the class id of the wrong unichar.
   GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
 };

 }  // namespace tesseract

 #endif  // TESSERACT_CCUTIL_AMBIGS_H_
	///////////////////////////////////////////////////////////////////////
	// File: ambigs.h
	// Description: Constants, flags, functions for dealing with
	// ambiguities (training and recognition).
	// Author: Daria Antonova
	// Created: Mon Aug 23 11:26:43 PDT 2008
	//
	// (C) Copyright 2008, Google Inc.
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	// http://www.apache.org/licenses/LICENSE-2.0
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	///////////////////////////////////////////////////////////////////////

	#ifndef TESSERACT_CCUTIL_AMBIGS_H_
	#define TESSERACT_CCUTIL_AMBIGS_H_

	#include "elst.h"
	#include "tprintf.h"
	#include "unichar.h"
	#include "unicharset.h"
	#include "genericvector.h"

	// Size bound for ambiguity ngrams: AmbigSpec::wrong_ngram and
	// AmbigSpec::correct_fragments are declared with MAX_AMBIG_SIZE + 1 elements.
	// NOTE(review): the extra element is presumably room for the
	// INVALID_UNICHAR_ID terminator - confirm against the code that fills them.
	#define MAX_AMBIG_SIZE 10

	// Declarations of the ambiguity-related parameters. INT_VAR_H and BOOL_VAR_H
	// are macros defined outside this file; the string is the parameter's
	// description.
	extern INT_VAR_H(global_ambigs_debug_level, 0,
	"Debug level for unichar ambiguities");
	extern BOOL_VAR_H(use_definite_ambigs_for_classifier, 0,
	"Use definite ambiguities when running character classifier");

	namespace tesseract {

	// Buffer size for unigram ambiguities; not referenced within this header.
	static const int kUnigramAmbigsBufferSize = 1000;
	// Ngram separator string: a single space followed by the terminating NUL.
	static const char kAmbigNgramSeparator[] = " ";
	// Delimiter characters: tab and space.
	static const char kAmbigDelimiters[] = "\t ";
	// printf-style message templates: kIllegalMsg expects a line number (%d),
	// kIllegalUnicharMsg expects a unichar string (%s).
	static const char kIllegalMsg[] =
	    "Illegal ambiguity specification on line %d\n";
	static const char kIllegalUnicharMsg[] =
	    "Illegal unichar %s in ambiguity specification\n";

	// Classification of an (ocred ngram, correct ngram) pair.
	enum AmbigType {
	  // The ngram pair is not ambiguous.
	  NOT_AMBIG = 0,
	  // The ocred ngram should always be substituted with the correct one.
	  REPLACE_AMBIG,
	  // Add the correct ngram to the classifier results (1-1).
	  DEFINITE_AMBIG,
	  // Use the pairwise classifier for the ocred/correct pair (1-1).
	  SIMILAR_AMBIG,
	  // This is a case ambiguity (1-1).
	  CASE_AMBIG,

	  // Number of enum entries; keep this enumerator last.
	  AMBIG_TYPE_COUNT
	};

	// Static helpers that operate on arrays of UNICHAR_IDs whose end is marked
	// by an INVALID_UNICHAR_ID element.
	class UnicharIdArrayUtils {
	 public:
	  // Lexicographically compares two INVALID_UNICHAR_ID-terminated arrays.
	  // Returns -1 if array1 orders first (it holds the smaller id at the first
	  // position where the arrays differ, or it is a proper prefix of array2),
	  // 0 if both arrays hold the same ids, and 1 otherwise.
	  static inline int compare(const UNICHAR_ID array1[],
	                            const UNICHAR_ID array2[]) {
	    const UNICHAR_ID *ptr1 = array1;
	    const UNICHAR_ID *ptr2 = array2;
	    // Compare the ids the pointers refer to, not the pointers themselves.
	    while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
	      if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
	      ++ptr1;
	      ++ptr2;
	    }
	    if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
	    // Exactly one array has ended; the shorter one orders first.
	    return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
	  }

	  // Copies the ids from src into dst, including the INVALID_UNICHAR_ID
	  // terminator. Returns the number of ids copied, not counting the
	  // terminator. The caller must make sure dst has room for every element of
	  // src plus the terminator.
	  static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
	    int i = 0;
	    do {
	      dst[i] = src[i];
	    } while (dst[i++] != INVALID_UNICHAR_ID);
	    // i was also advanced past the terminator, so exclude it from the count.
	    return i - 1;
	  }

	  // Prints the unichars for the ids in array (or "[Empty]" when the array
	  // holds no ids), followed by the numeric ids in parentheses and a newline.
	  // array must be terminated by INVALID_UNICHAR_ID.
	  static inline void print(const UNICHAR_ID array[],
	                           const UNICHARSET &unicharset) {
	    const UNICHAR_ID *ptr = array;
	    if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
	    while (*ptr != INVALID_UNICHAR_ID) {
	      tprintf("%s ", unicharset.id_to_unichar(*ptr++));
	    }
	    tprintf("( ");
	    ptr = array;
	    // Print the id values; %d needs the dereferenced id, not the pointer.
	    while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
	    tprintf(")\n");
	  }
	};

	// A single ambiguity specification, linkable into an AmbigSpec_LIST. Per the
	// original note, such a list stores dangerous ambigs that start with the
	// same unichar (e.g. r->t rn->m rr1->m).
	class AmbigSpec : public ELIST_LINK {
	 public:
	  AmbigSpec();
	  ~AmbigSpec() {}

	  // Comparator for sorting AmbigSpec_LISTs by their wrong_ngram arrays.
	  // spec1 and spec2 each point to an AmbigSpec pointer, so they are cast to
	  // pointer-to-pointer and dereferenced once. The result is that of
	  // UnicharIdArrayUtils::compare on the two wrong_ngram arrays, so a sorted
	  // list could hold wrong_ngrams in the order
	  // [9 1 3], [9 3 4], [9 8], [9 8 1].
	  static int compare_ambig_specs(const void *spec1, const void *spec2) {
	    const AmbigSpec *s1 =
	      *reinterpret_cast<const AmbigSpec * const *>(spec1);
	    const AmbigSpec *s2 =
	      *reinterpret_cast<const AmbigSpec * const *>(spec2);
	    return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
	  }

	  // Sort key used by compare_ambig_specs.
	  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
	  UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
	  UNICHAR_ID correct_ngram_id;
	  AmbigType type;
	  // NOTE(review): presumably the number of ids stored in wrong_ngram -
	  // confirm against the code that fills this struct.
	  int wrong_ngram_size;
	};
	ELISTIZEH(AmbigSpec);

	// Ambiguity table type: element i points to the list of ambiguities whose
	// wrong ngram starts with unichar id i.
	typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
	// A vector of unichar ids.
	typedef GenericVector<UNICHAR_ID> UnicharIdVector;

	// Holds the ambiguity tables filled in by LoadUnicharAmbigs().
	class UnicharAmbigs {
	 public:
	  UnicharAmbigs() {}
	  // Calls delete_data_pointers() on each of the three tables.
	  ~UnicharAmbigs() {
	    replace_ambigs_.delete_data_pointers();
	    dang_ambigs_.delete_data_pointers();
	    one_to_one_definite_ambigs_.delete_data_pointers();
	  }

	  // Read-only access to the dangerous and the replaceable ambiguity tables.
	  const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
	  const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }

	  // Fills in the two ambiguity tables (replaceable and dangerous) with the
	  // information read from the ambigs file. An ambiguity table is an array of
	  // lists indexed by class id; each entry lists the potential ambiguities
	  // that can start with the corresponding character. For example, the
	  // ambiguity "rn -> m" is stored in the table at index
	  // unicharset.unichar_to_id('r').
	  // 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
	  // one_to_one_definite_ambigs_. That vector is also indexed by the class id
	  // of the wrong part of the ambiguity, and each entry holds a vector of the
	  // unichar ids that are ambiguous to it.
	  void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset,
	                         UNICHARSET *unicharset);

	  // Returns the definite 1-1 ambigs recorded for unichar_id, or NULL when no
	  // 1-1 ambigs have been recorded at all. unichar_id is used directly as an
	  // index into the vector.
	  const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
	    if (one_to_one_definite_ambigs_.empty()) return NULL;
	    return one_to_one_definite_ambigs_[unichar_id];
	  }

	 private:

	  // Helpers declared here and defined elsewhere. The size, id, string and
	  // type arguments of ParseAmbiguityLine are pointers so that the callee can
	  // write through them; InsertIntoTable takes the spec and the unicharset by
	  // pointer rather than by value.
	  // NOTE(review): presumably used by LoadUnicharAmbigs() to parse one line
	  // of the ambigs file and to store the result - confirm in the .cpp file.
	  bool ParseAmbiguityLine(int line_num, int version,
	                          const UNICHARSET &unicharset, char *buffer,
	                          int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
	                          int *ReplacementAmbigPartSize,
	                          char *ReplacementString, int *type);
	  void InsertIntoTable(UnicharAmbigsVector &table,
	                       int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
	                       int ReplacementAmbigPartSize,
	                       const char *ReplacementString, int type,
	                       AmbigSpec *ambig_spec, UNICHARSET *unicharset);

	  // Tables indexed by the class id of the first unichar of the wrong ngram.
	  UnicharAmbigsVector dang_ambigs_;
	  UnicharAmbigsVector replace_ambigs_;
	  // 1-1 definite ambigs, indexed by the class id of the wrong unichar.
	  GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
	};

	} // namespace tesseract

	#endif // TESSERACT_CCUTIL_AMBIGS_H_