// ccutil/unicharset.h - platform/external/tesseract - Git at Google

 ///////////////////////////////////////////////////////////////////////
 // File:        unicharset.h
 // Description: Unicode character/ligature set class.
 // Author:      Thomas Kielbus
 // Created:     Wed Jun 28 17:05:01 PDT 2006
 //
 // (C) Copyright 2006, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 ///////////////////////////////////////////////////////////////////////

 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
 #define TESSERACT_CCUTIL_UNICHARSET_H__

 #include "assert.h"
 #include "strngs.h"
 #include "unichar.h"
 #include "unicharmap.h"
 #include "varable.h"

 class CHAR_FRAGMENT {
  public:
   // Minimum number of characters used for fragment representation.
   static const int kMinLen = 6;
   // Maximum number of characters used for fragment representation.
   static const int kMaxLen = 3 + UNICHAR_LEN + 2;
   // Special character used in representing character fragments.
   static const char kSeparator = '|';
   // Maximum number of fragments per character.
   static const int kMaxChunks = 3;

   // Setters and Getters.
   inline void set_all(const char *unichar, int pos, int total) {
     this->set_unichar(unichar);
     this->set_pos(pos);
     this->set_total(total);
   }
   inline void set_unichar(const char *uch) {
     strncpy(this->unichar, uch, UNICHAR_LEN);
     this->unichar[UNICHAR_LEN] = '\0';
   }
   inline void set_pos(int p) { this->pos = p; }
   inline void set_total(int t) { this->total = t; }
   inline const char* get_unichar() const { return this->unichar; }
   inline int get_pos() const { return this->pos; }
   inline int get_total() const { return this->total; }

   // Returns the string that represents a fragment
   // with the given unichar, pos and total.
   static STRING to_string(const char *unichar, int pos, int total) {
     STRING result = "";
     result += kSeparator;
     result += unichar;
     char buffer[kMaxLen];
     snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total);
     result += buffer;
     return result;
   }
   // Returns the string that represents this fragment.
   STRING to_string() const {
     return to_string(this->unichar, this->pos, this->total);
   }

   // Checks whether a fragment has the same unichar,
   // position and total as the given inputs.
   inline bool equals(const char *other_unichar,
                      int other_pos, int other_total) const {
     return (strcmp(this->unichar, other_unichar) == 0 &&
             this->pos == other_pos && this->total == other_total);
   }
   inline bool equals(const CHAR_FRAGMENT *other) const {
     return this->equals(other->get_unichar(),
                         other->get_pos(),
                         other->get_total());
   }

   // Checks whether a given fragment is a continuation of this fragment.
   // Assumes that the given fragment pointer is not NULL.
   inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
     return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
             this->total == fragment->get_total() &&
             this->pos == fragment->get_pos() + 1);
   }

   // Returns true if this fragment is a beginning fragment.
   inline bool is_beginning() const { return this->pos == 0; }

   // Returns true if this fragment is an ending fragment.
   inline bool is_ending() const { return this->pos == this->total-1; }

   // Parses the string to see whether it represents a character fragment
   // (rather than a regular character). If so, allocates memory for a new
   // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
   // information. Fragments are of the form:
   // |m|1|2, meaning chunk 1 of 2 of character m.
   //
   // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
   // instance, otherwise (if the string does not represent a fragment or it
   // looks like it does, but parsing it as a fragment fails) returns NULL.
   //
   // Note: The caller is responsible for deallocating memory
   // associated with the returned pointer.
   static CHAR_FRAGMENT *parse_from_string(const char *str);

  private:
   char unichar[UNICHAR_LEN + 1];
   inT16 pos;    // fragment position in the character
   inT16 total;  // total number of fragments in the character
 };

 // The UNICHARSET class is an utility class for Tesseract that holds the
 // set of characters that are used by the engine. Each character is identified
 // by a unique number, from 0 to (size - 1).
 // Holds the set of unichars (characters/ligatures) known to the engine.
 // Every unichar is identified by a UNICHAR_ID in the range 0..size()-1 and
 // carries a set of properties (alpha/lower/upper/digit/punctuation/ngram,
 // enabled flag, script id, other-case id and optional fragment info).
 //
 // Unless stated otherwise, accessors that take a UNICHAR_ID index the
 // internal array directly without bounds checking; callers must pass an id
 // for which contains_unichar_id() is true.
 class UNICHARSET {
  public:
   // Create an empty UNICHARSET
   UNICHARSET();

   ~UNICHARSET();

   // Return the UNICHAR_ID of a given unichar representation within the
   // UNICHARSET.
   const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;

   // Return the UNICHAR_ID of a given unichar representation within the
   // UNICHARSET. Only the first length characters from unichar_repr are used.
   const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
                                  int length) const;

   // Return the minimum number of bytes that matches a legal UNICHAR_ID,
   // while leaving a legal UNICHAR_ID afterwards. In other words, if there
   // is both a short and a long match to the string, return the length that
   // ensures there is a legal match after it.
   int step(const char* str) const;

   // Return the unichar representation corresponding to the given UNICHAR_ID
   // within the UNICHARSET.
   const char* const id_to_unichar(UNICHAR_ID id) const;

   // Return a STRING that reformats the utf8 str into the str followed
   // by its hex unicodes.
   static STRING debug_utf8_str(const char* str);

   // Return a STRING containing debug information on the unichar, including
   // the id_to_unichar, its hex unicodes and the properties.
   STRING debug_str(UNICHAR_ID id) const;
   STRING debug_str(const char * unichar_repr) const {
     return debug_str(unichar_to_id(unichar_repr));
   }

   // Add a unichar representation to the set.
   void unichar_insert(const char* const unichar_repr);

   // Return true if the given unichar id exists within the set.
   // Relies on the fact that unichar ids are contiguous in the unicharset,
   // i.e. valid ids are exactly 0..size_used-1. Negative ids are rejected
   // explicitly so that the result can safely be used to guard array access.
   bool contains_unichar_id(UNICHAR_ID unichar_id) const {
     return unichar_id != INVALID_UNICHAR_ID &&
            unichar_id >= 0 && unichar_id < size_used;
   }

   // Return true if the given unichar representation exists within the set.
   bool contains_unichar(const char* const unichar_repr) const;
   bool contains_unichar(const char* const unichar_repr, int length) const;

   // Return true if the given unichar representation corresponds to the given
   // UNICHAR_ID within the set.
   bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;

   // Delete CHAR_FRAGMENTs stored in properties of unichars array.
   // Each freed pointer is reset to NULL so a repeated call is harmless.
   void delete_pointers_in_unichars() {
     for (int i = 0; i < size_used; ++i) {
       if (unichars[i].properties.fragment != NULL) {
         delete unichars[i].properties.fragment;
         unichars[i].properties.fragment = NULL;
       }
     }
   }

   // Clear the UNICHARSET (all the previous data is lost).
   //
   // The script table and the unichar array are released independently:
   // scripts can be registered through add_script() before any unichar
   // storage has been reserved, so the script table must not be kept alive
   // just because size_reserved is still 0.
   // NOTE(review): relies on the constructor initialising script_table and
   // unichars to NULL - confirm against the implementation file.
   void clear() {
     if (script_table != NULL) {
       for (int i = 0; i < script_table_size_used; ++i)
         delete[] script_table[i];
       delete[] script_table;
       script_table = NULL;
     }
     script_table_size_reserved = 0;
     script_table_size_used = 0;

     if (unichars != NULL) {
       // Fragments are owned by the slots; free them before the array.
       delete_pointers_in_unichars();
       delete[] unichars;
       unichars = NULL;
     }
     size_reserved = 0;
     size_used = 0;

     ids.clear();
   }

   // Return the size of the set (the number of different UNICHAR it holds).
   int size() const {
     return size_used;
   }

   // Reserve enough memory space for the given number of UNICHARS
   void reserve(int unichars_number);

   // Opens the file indicated by filename and saves unicharset to that file.
   // Returns true if the operation is successful, i.e. the file could be
   // opened, the content was written and the file was closed without error.
   bool save_to_file(const char * const filename) const {
     FILE* file = fopen(filename, "w+");
     if (file == NULL) return false;
     bool result = save_to_file(file);
     // Buffered data is flushed by fclose; a failure there means the file
     // on disk may be incomplete, so it must be reported as a failed save.
     if (fclose(file) != 0) result = false;
     return result;
   }

   // Saves the content of the UNICHARSET to the given file.
   // Returns true if the operation is successful.
   bool save_to_file(FILE *file) const;

   // Opens the file indicated by filename and loads the UNICHARSET
   // from the given file. The previous data is lost.
   // Returns true if the operation is successful.
   bool load_from_file(const char* const filename) {
     FILE* file = fopen(filename, "r");
     if (file == NULL) return false;
     bool result = load_from_file(file);
     fclose(file);
     return result;
   }

   // Loads the UNICHARSET from the given file. The previous data is lost.
   // Returns true if the operation is successful.
   bool load_from_file(FILE *file);

   // Set a whitelist and/or blacklist of characters to recognize.
   // An empty or NULL whitelist enables everything (minus any blacklist).
   // An empty or NULL blacklist disables nothing.
   // The blacklist overrides the whitelist.
   // Each list is a string of utf8 character strings. Boundaries between
   // unicharset units are worked out automatically, and characters not in
   // the unicharset are silently ignored.
   void set_black_and_whitelist(const char* blacklist, const char* whitelist);

   // Set the isalpha property of the given unichar to the given value.
   void set_isalpha(UNICHAR_ID unichar_id, bool value) {
     unichars[unichar_id].properties.isalpha = value;
   }

   // Set the islower property of the given unichar to the given value.
   void set_islower(UNICHAR_ID unichar_id, bool value) {
     unichars[unichar_id].properties.islower = value;
   }

   // Set the isupper property of the given unichar to the given value.
   void set_isupper(UNICHAR_ID unichar_id, bool value) {
     unichars[unichar_id].properties.isupper = value;
   }

   // Set the isdigit property of the given unichar to the given value.
   void set_isdigit(UNICHAR_ID unichar_id, bool value) {
     unichars[unichar_id].properties.isdigit = value;
   }

   // Set the ispunctuation property of the given unichar to the given value.
   void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
     unichars[unichar_id].properties.ispunctuation = value;
   }

   // Set the isngram property of the given unichar to the given value.
   void set_isngram(UNICHAR_ID unichar_id, bool value) {
     unichars[unichar_id].properties.isngram = value;
   }

   // Set the script of the given unichar from a script name. The name is
   // registered through add_script() and the resulting id is stored.
   // Value is copied and thus can be a temporary;
   void set_script(UNICHAR_ID unichar_id, const char* value) {
     unichars[unichar_id].properties.script_id = add_script(value);
   }

   // Set other_case unichar id in the properties for the given unichar id.
   void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
     unichars[unichar_id].properties.other_case = other_case;
   }

   // Return the isalpha property of the given unichar.
   bool get_isalpha(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.isalpha;
   }

   // Return the islower property of the given unichar.
   bool get_islower(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.islower;
   }

   // Return the isupper property of the given unichar.
   bool get_isupper(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.isupper;
   }

   // Return the isdigit property of the given unichar.
   bool get_isdigit(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.isdigit;
   }

   // Return the ispunctuation property of the given unichar.
   bool get_ispunctuation(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.ispunctuation;
   }

   // Return the isngram property of the given unichar.
   bool get_isngram(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.isngram;
   }

   // Return the script id of the given unichar. The id can be turned back
   // into the script name with get_script_from_script_id().
   int get_script(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.script_id;
   }

   // Get other_case unichar id in the properties for the given unichar id.
   UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.other_case;
   }

   // Returns UNICHAR_ID of the corresponding lower-case unichar: the id
   // itself if it is already flagged lower-case, otherwise its other_case.
   UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
     if (unichars[unichar_id].properties.islower) return unichar_id;
     return unichars[unichar_id].properties.other_case;
   }

   // Returns UNICHAR_ID of the corresponding upper-case unichar: the id
   // itself if it is already flagged upper-case, otherwise its other_case.
   UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
     if (unichars[unichar_id].properties.isupper) return unichar_id;
     return unichars[unichar_id].properties.other_case;
   }

   // Return a pointer to the CHAR_FRAGMENT class if the given
   // unichar id represents a character fragment.
   // The fragment stays owned by the UNICHARSET.
   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.fragment;
   }

   // The lookups by representation below forward the result of
   // unichar_to_id() straight to the id-based accessors; the representation
   // is expected to be present in the set (see contains_unichar()).

   // Return the isalpha property of the given unichar representation.
   bool get_isalpha(const char* const unichar_repr) const {
     return get_isalpha(unichar_to_id(unichar_repr));
   }

   // Return the islower property of the given unichar representation.
   bool get_islower(const char* const unichar_repr) const {
     return get_islower(unichar_to_id(unichar_repr));
   }

   // Return the isupper property of the given unichar representation.
   bool get_isupper(const char* const unichar_repr) const {
     return get_isupper(unichar_to_id(unichar_repr));
   }

   // Return the isdigit property of the given unichar representation.
   bool get_isdigit(const char* const unichar_repr) const {
     return get_isdigit(unichar_to_id(unichar_repr));
   }

   // Return the ispunctuation property of the given unichar representation.
   bool get_ispunctuation(const char* const unichar_repr) const {
     return get_ispunctuation(unichar_to_id(unichar_repr));
   }

   // Return the script id of the given unichar representation.
   int get_script(const char* const unichar_repr) const {
     return get_script(unichar_to_id(unichar_repr));
   }

   // Return a pointer to the CHAR_FRAGMENT class struct if the given
   // unichar representation represents a character fragment.
   // Returns NULL for a NULL or empty representation and for one that is
   // not part of the set.
   const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
     if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
         !ids.contains(unichar_repr)) {
       return NULL;
     }
     return get_fragment(unichar_to_id(unichar_repr));
   }

   // Return the isalpha property of the given unichar representation.
   // Only the first length characters from unichar_repr are used.
   bool get_isalpha(const char* const unichar_repr,
                int length) const {
     return get_isalpha(unichar_to_id(unichar_repr, length));
   }

   // Return the islower property of the given unichar representation.
   // Only the first length characters from unichar_repr are used.
   bool get_islower(const char* const unichar_repr,
                int length) const {
     return get_islower(unichar_to_id(unichar_repr, length));
   }

   // Return the isupper property of the given unichar representation.
   // Only the first length characters from unichar_repr are used.
   bool get_isupper(const char* const unichar_repr,
                int length) const {
     return get_isupper(unichar_to_id(unichar_repr, length));
   }

   // Return the isdigit property of the given unichar representation.
   // Only the first length characters from unichar_repr are used.
   bool get_isdigit(const char* const unichar_repr,
                int length) const {
     return get_isdigit(unichar_to_id(unichar_repr, length));
   }

   // Return the ispunctuation property of the given unichar representation.
   // Only the first length characters from unichar_repr are used.
   bool get_ispunctuation(const char* const unichar_repr,
                           int length) const {
     return get_ispunctuation(unichar_to_id(unichar_repr, length));
   }

   // Return the script id of the given unichar representation.
   // Only the first length characters from unichar_repr are used.
   int get_script(const char* const unichar_repr,
                int length) const {
     return get_script(unichar_to_id(unichar_repr, length));
   }

   // Return the (current) number of scripts in the script table
   int get_script_table_size() const {
     return script_table_size_used;
   }

   // Return the script string from its id. An id outside the script table
   // yields the null script. The returned string is managed by the
   // unicharset and thus MUST NOT be deleted.
   const char* get_script_from_script_id(int id) const {
     if (id >= script_table_size_used || id < 0)
       return null_script;
     return script_table[id];
   }

   // Returns the id from the name of the script, or 0 if script is not found.
   // Note that this is an expensive operation since it involves iteratively
   // comparing strings in the script table.  To avoid dependency on STL, we
   // won't use a hash.  Instead, the calling function can use this to lookup
   // and save the ID for relevant scripts for fast comparisons later.
   int get_script_id_from_name(const char* script_name) const;

   // Return true if the given script is the null script.
   // This is a pointer comparison, not a string comparison.
   bool is_null_script(const char* script) const {
     return script == null_script;
   }

   // Register the given script name and return its script id.
   // The script parameter is copied and thus can be a temporary.
   // NOTE(review): presumably two names a and b with strcmp(a, b) == 0 get
   // the same id - confirm against the implementation file.
   int add_script(const char* script);

   // Return the enabled property of the given unichar.
   bool get_enabled(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.enabled;
   }

   // Cached script ids of a few commonly used scripts.
   int null_sid() const { return null_sid_; }
   int common_sid() const { return common_sid_; }
   int latin_sid() const { return latin_sid_; }
   int cyrillic_sid() const { return cyrillic_sid_; }
   int greek_sid() const { return greek_sid_; }
   int han_sid() const { return han_sid_; }

  private:

   // Per-unichar properties.
   struct UNICHAR_PROPERTIES {
     bool  isalpha;
     bool  islower;
     bool  isupper;
     bool  isdigit;
     bool  ispunctuation;
     bool  isngram;
     bool  enabled;
     int   script_id;        // id of the unichar's script (see add_script)
     UNICHAR_ID other_case;  // id of the corresponding upper/lower case unichar

     // Contains meta information about the fragment if a unichar represents
     // a fragment of a character, otherwise should be set to NULL.
     // It is assumed that character fragments are added to the unicharset
     // after the corresponding 'base' characters.
     CHAR_FRAGMENT *fragment;
   };

   // One entry of the unichars array: the representation and its properties.
   struct UNICHAR_SLOT {
     char representation[UNICHAR_LEN + 1];
     UNICHAR_PROPERTIES properties;
   };

   UNICHAR_SLOT* unichars;          // array of slots, indexed by UNICHAR_ID
   UNICHARMAP ids;                  // representation lookup map
   int size_used;                   // number of slots in use
   int size_reserved;               // number of slots allocated
   char** script_table;             // script names, indexed by script id
   int script_table_size_used;      // number of script names in use
   int script_table_size_reserved;  // number of script entries allocated
   const char* null_script;         // name returned for unknown script ids

   // A few convenient script name-to-id mapping without using hash.
   // These are initialized when unicharset file is loaded.  Anything
   // missing from this list can be looked up using get_script_id_from_name.
   int null_sid_;
   int common_sid_;
   int latin_sid_;
   int cyrillic_sid_;
   int greek_sid_;
   int han_sid_;
 };

 #endif  // TESSERACT_CCUTIL_UNICHARSET_H__
	///////////////////////////////////////////////////////////////////////
	// File: unicharset.h
	// Description: Unicode character/ligature set class.
	// Author: Thomas Kielbus
	// Created: Wed Jun 28 17:05:01 PDT 2006
	//
	// (C) Copyright 2006, Google Inc.
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	// http://www.apache.org/licenses/LICENSE-2.0
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	///////////////////////////////////////////////////////////////////////

	#ifndef TESSERACT_CCUTIL_UNICHARSET_H__
	#define TESSERACT_CCUTIL_UNICHARSET_H__

	#include "assert.h"
	#include "strngs.h"
	#include "unichar.h"
	#include "unicharmap.h"
	#include "varable.h"

	class CHAR_FRAGMENT {
	 public:
	  // Minimum number of characters used for fragment representation.
	  static const int kMinLen = 6;
	  // Maximum number of characters used for fragment representation.
	  static const int kMaxLen = 3 + UNICHAR_LEN + 2;
	  // Special character used in representing character fragments.
	  static const char kSeparator = '|';
	  // Maximum number of fragments per character.
	  static const int kMaxChunks = 3;

	  // Setters and Getters.
	  inline void set_all(const char *unichar, int pos, int total) {
	    this->set_unichar(unichar);
	    this->set_pos(pos);
	    this->set_total(total);
	  }
	  // Copies at most UNICHAR_LEN bytes of uch; the copy is always
	  // NUL-terminated because the last byte of the buffer is set explicitly.
	  inline void set_unichar(const char *uch) {
	    strncpy(this->unichar, uch, UNICHAR_LEN);
	    this->unichar[UNICHAR_LEN] = '\0';
	  }
	  inline void set_pos(int p) { this->pos = p; }
	  inline void set_total(int t) { this->total = t; }
	  inline const char* get_unichar() const { return this->unichar; }
	  inline int get_pos() const { return this->pos; }
	  inline int get_total() const { return this->total; }

	  // Returns the string that represents a fragment
	  // with the given unichar, pos and total.
	  static STRING to_string(const char *unichar, int pos, int total) {
	    STRING result = "";
	    result += kSeparator;
	    result += unichar;
	    // The numeric tail "|pos|total" is formatted separately and appended.
	    char buffer[kMaxLen];
	    snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total);
	    result += buffer;
	    return result;
	  }
	  // Returns the string that represents this fragment.
	  STRING to_string() const {
	    return to_string(this->unichar, this->pos, this->total);
	  }

	  // Checks whether a fragment has the same unichar,
	  // position and total as the given inputs.
	  inline bool equals(const char *other_unichar,
	                     int other_pos, int other_total) const {
	    return (strcmp(this->unichar, other_unichar) == 0 &&
	            this->pos == other_pos && this->total == other_total);
	  }
	  // Same check against another fragment; other must not be NULL.
	  inline bool equals(const CHAR_FRAGMENT *other) const {
	    return this->equals(other->get_unichar(),
	                        other->get_pos(),
	                        other->get_total());
	  }

	  // Checks whether a given fragment is a continuation of this fragment.
	  // Assumes that the given fragment pointer is not NULL.
	  inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
	    return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
	            this->total == fragment->get_total() &&
	            this->pos == fragment->get_pos() + 1);
	  }

	  // Returns true if this fragment is a beginning fragment.
	  inline bool is_beginning() const { return this->pos == 0; }

	  // Returns true if this fragment is an ending fragment.
	  inline bool is_ending() const { return this->pos == this->total-1; }

	  // Parses the string to see whether it represents a character fragment
	  // (rather than a regular character). If so, allocates memory for a new
	  // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
	  // information. Fragments are of the form:
	  // |m|1|2, meaning chunk 1 of 2 of character m.
	  //
	  // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
	  // instance, otherwise (if the string does not represent a fragment or it
	  // looks like it does, but parsing it as a fragment fails) returns NULL.
	  //
	  // Note: The caller is responsible for deallocating memory
	  // associated with the returned pointer.
	  static CHAR_FRAGMENT *parse_from_string(const char *str);

	 private:
	  char unichar[UNICHAR_LEN + 1];
	  inT16 pos;    // fragment position in the character
	  inT16 total;  // total number of fragments in the character
	};

	// The UNICHARSET class is an utility class for Tesseract that holds the
	// set of characters that are used by the engine. Each character is identified
	// by a unique number, from 0 to (size - 1).
	class UNICHARSET {
	public:
	// Create an empty UNICHARSET
	UNICHARSET();

	~UNICHARSET();

	// Return the UNICHAR_ID of a given unichar representation within the
	// UNICHARSET.
	const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;

	// Return the UNICHAR_ID of a given unichar representation within the
	// UNICHARSET. Only the first length characters from unichar_repr are used.
	const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
	int length) const;

	// Return the minimum number of bytes that matches a legal UNICHAR_ID,
	// while leaving a legal UNICHAR_ID afterwards. In other words, if there
	// is both a short and a long match to the string, return the length that
	// ensures there is a legal match after it.
	int step(const char* str) const;

	// Return the unichar representation corresponding to the given UNICHAR_ID
	// within the UNICHARSET.
	const char* const id_to_unichar(UNICHAR_ID id) const;

	// Return a STRING that reformats the utf8 str into the str followed
	// by its hex unicodes.
	static STRING debug_utf8_str(const char* str);

	// Return a STRING containing debug information on the unichar, including
	// the id_to_unichar, its hex unicodes and the properties.
	STRING debug_str(UNICHAR_ID id) const;
	STRING debug_str(const char * unichar_repr) const {
	return debug_str(unichar_to_id(unichar_repr));
	}

	// Add a unichar representation to the set.
	void unichar_insert(const char* const unichar_repr);

	// Return true if the given unichar id exists within the set.
	// Relies on the fact that unichar ids are contiguous in the unicharset.
	bool contains_unichar_id(UNICHAR_ID unichar_id) const {
	return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used;
	}

	// Return true if the given unichar representation exists within the set.
	bool contains_unichar(const char* const unichar_repr) const;
	bool contains_unichar(const char* const unichar_repr, int length) const;

	// Return true if the given unichar representation corresponds to the given
	// UNICHAR_ID within the set.
	bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;

	// Delete CHAR_FRAGMENTs stored in properties of unichars array.
	void delete_pointers_in_unichars() {
	for (int i = 0; i < size_used; ++i) {
	if (unichars[i].properties.fragment != NULL) {
	delete unichars[i].properties.fragment;
	unichars[i].properties.fragment = NULL;
	}
	}
	}

	// Clear the UNICHARSET (all the previous data is lost).
	void clear() {
	if (size_reserved > 0) {
	for (int i = 0; i < script_table_size_used; ++i)
	delete[] script_table[i];
	delete[] script_table;
	script_table = 0;
	script_table_size_reserved = 0;
	script_table_size_used = 0;
	delete_pointers_in_unichars();
	delete[] unichars;
	unichars = 0;
	size_reserved = 0;
	size_used = 0;
	}
	ids.clear();
	}

	// Return the size of the set (the number of different UNICHAR it holds).
	int size() const {
	return size_used;
	}

	// Reserve enough memory space for the given number of UNICHARS
	void reserve(int unichars_number);

	// Opens the file indicated by filename and saves unicharset to that file.
	// Returns true if the operation is successful.
	bool save_to_file(const char * const filename) const {
	FILE* file = fopen(filename, "w+");
	if (file == NULL) return false;
	bool result = save_to_file(file);
	fclose(file);
	return result;
	}

	// Saves the content of the UNICHARSET to the given file.
	// Returns true if the operation is successful.
	bool save_to_file(FILE *file) const;

	// Opens the file indicated by filename and loads the UNICHARSET
	// from the given file. The previous data is lost.
	// Returns true if the operation is successful.
	bool load_from_file(const char* const filename) {
	FILE* file = fopen(filename, "r");
	if (file == NULL) return false;
	bool result = load_from_file(file);
	fclose(file);
	return result;
	}

	// Loads the UNICHARSET from the given file. The previous data is lost.
	// Returns true if the operation is successful.
	bool load_from_file(FILE *file);

	// Set a whitelist and/or blacklist of characters to recognize.
	// An empty or NULL whitelist enables everything (minus any blacklist).
	// An empty or NULL blacklist disables nothing.
	// The blacklist overrides the whitelist.
	// Each list is a string of utf8 character strings. Boundaries between
	// unicharset units are worked out automatically, and characters not in
	// the unicharset are silently ignored.
	void set_black_and_whitelist(const char* blacklist, const char* whitelist);

	// Set the isalpha property of the given unichar to the given value.
	void set_isalpha(UNICHAR_ID unichar_id, bool value) {
	unichars[unichar_id].properties.isalpha = value;
	}

	// Set the islower property of the given unichar to the given value.
	void set_islower(UNICHAR_ID unichar_id, bool value) {
	unichars[unichar_id].properties.islower = value;
	}

	// Set the isupper property of the given unichar to the given value.
	void set_isupper(UNICHAR_ID unichar_id, bool value) {
	unichars[unichar_id].properties.isupper = value;
	}

	// Set the isdigit property of the given unichar to the given value.
	void set_isdigit(UNICHAR_ID unichar_id, bool value) {
	unichars[unichar_id].properties.isdigit = value;
	}

	// Set the ispunctuation property of the given unichar to the given value.
	void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
	unichars[unichar_id].properties.ispunctuation = value;
	}

	// Set the isngram property of the given unichar to the given value.
	void set_isngram(UNICHAR_ID unichar_id, bool value) {
	unichars[unichar_id].properties.isngram = value;
	}

	// Set the script name of the given unichar to the given value.
	// Value is copied and thus can be a temporary;
	void set_script(UNICHAR_ID unichar_id, const char* value) {
	unichars[unichar_id].properties.script_id = add_script(value);
	}

	// Set other_case unichar id in the properties for the given unichar id.
	void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
	unichars[unichar_id].properties.other_case = other_case;
	}

	// Return the isalpha property of the given unichar.
	bool get_isalpha(UNICHAR_ID unichar_id) const {
	return unichars[unichar_id].properties.isalpha;
	}

	// Return the islower property of the given unichar.
	bool get_islower(UNICHAR_ID unichar_id) const {
	return unichars[unichar_id].properties.islower;
	}

	// Return the isupper property of the given unichar.
	bool get_isupper(UNICHAR_ID unichar_id) const {
	return unichars[unichar_id].properties.isupper;
	}

	// Return the isdigit property of the given unichar.
	bool get_isdigit(UNICHAR_ID unichar_id) const {
	return unichars[unichar_id].properties.isdigit;
	}

	// Return the ispunctuation property of the given unichar.
	bool get_ispunctuation(UNICHAR_ID unichar_id) const {
	return unichars[unichar_id].properties.ispunctuation;
	}

	// Return the isngram property of the given unichar.
	bool get_isngram(UNICHAR_ID unichar_id) const {
		// Direct slot lookup; unichar_id is not range-checked here.
		const bool is_ngram = unichars[unichar_id].properties.isngram;
		return is_ngram;
	}

	// Return the script id of the given unichar.
	// The id can be turned back into the script name with
	// get_script_from_script_id().
	int get_script(UNICHAR_ID unichar_id) const {
		// The value handed back is the stored integer script id, not a string.
		const int script_id = unichars[unichar_id].properties.script_id;
		return script_id;
	}

	// Get other_case unichar id in the properties for the given unichar id.
	UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
		// Returns whatever id was stored via set_other_case, unmodified.
		const UNICHAR_ID counterpart = unichars[unichar_id].properties.other_case;
		return counterpart;
	}

	// Returns UNICHAR_ID of the corresponding lower-case unichar.
	UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
		// A unichar flagged islower maps to itself; anything else yields its
		// stored other_case id, whatever that is.
		return unichars[unichar_id].properties.islower
				? unichar_id
				: unichars[unichar_id].properties.other_case;
	}

	// Returns UNICHAR_ID of the corresponding upper-case unichar.
	UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
		// A unichar flagged isupper maps to itself; anything else yields its
		// stored other_case id, whatever that is.
		return unichars[unichar_id].properties.isupper
				? unichar_id
				: unichars[unichar_id].properties.other_case;
	}

	// Return a pointer to the CHAR_FRAGMENT class if the given
	// unichar id represents a character fragment.
	const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
		// Hands back the stored fragment pointer as read-only; the caller
		// does not receive ownership through this accessor.
		const CHAR_FRAGMENT *frag = unichars[unichar_id].properties.fragment;
		return frag;
	}

	// Return the isalpha property of the given unichar representation.
	bool get_isalpha(const char* const unichar_repr) const {
	return get_isalpha(unichar_to_id(unichar_repr));
	}

	// Return the islower property of the given unichar representation.
	bool get_islower(const char* const unichar_repr) const {
	return get_islower(unichar_to_id(unichar_repr));
	}

	// Return the isupper property of the given unichar representation.
	bool get_isupper(const char* const unichar_repr) const {
	return get_isupper(unichar_to_id(unichar_repr));
	}

	// Return the isdigit property of the given unichar representation.
	bool get_isdigit(const char* const unichar_repr) const {
	return get_isdigit(unichar_to_id(unichar_repr));
	}

	// Return the ispunctuation property of the given unichar representation.
	bool get_ispunctuation(const char* const unichar_repr) const {
	return get_ispunctuation(unichar_to_id(unichar_repr));
	}

	// Return the script id of the given unichar representation.
	// The id can be turned back into the script name with
	// get_script_from_script_id().
	int get_script(const char* const unichar_repr) const {
	return get_script(unichar_to_id(unichar_repr));
	}

	// Return a pointer to the CHAR_FRAGMENT class if the given
	// unichar representation represents a character fragment.
	// Returns NULL if unichar_repr is NULL, empty, or not in the set.
	const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
		// A NULL, empty, or unknown representation cannot be resolved to an
		// id, so report "no fragment" instead of indexing with a bad id.
		if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
				!ids.contains(unichar_repr)) {
			return NULL;
		}
		// Known representation: defer to the id-based lookup.
		return get_fragment(unichar_to_id(unichar_repr));
	}

	// Return the isalpha property of the given unichar representation.
	// Only the first length characters from unichar_repr are used.
	bool get_isalpha(const char* const unichar_repr,
	int length) const {
	return get_isalpha(unichar_to_id(unichar_repr, length));
	}

	// Return the islower property of the given unichar representation.
	// Only the first length characters from unichar_repr are used.
	bool get_islower(const char* const unichar_repr,
	int length) const {
	return get_islower(unichar_to_id(unichar_repr, length));
	}

	// Return the isupper property of the given unichar representation.
	// Only the first length characters from unichar_repr are used.
	bool get_isupper(const char* const unichar_repr,
	int length) const {
	return get_isupper(unichar_to_id(unichar_repr, length));
	}

	// Return the isdigit property of the given unichar representation.
	// Only the first length characters from unichar_repr are used.
	bool get_isdigit(const char* const unichar_repr,
	int length) const {
	return get_isdigit(unichar_to_id(unichar_repr, length));
	}

	// Return the ispunctuation property of the given unichar representation.
	// Only the first length characters from unichar_repr are used.
	bool get_ispunctuation(const char* const unichar_repr,
	int length) const {
	return get_ispunctuation(unichar_to_id(unichar_repr, length));
	}

	// Return the script id of the given unichar representation.
	// Only the first length characters from unichar_repr are used.
	// The id can be turned back into the script name with
	// get_script_from_script_id().
	int get_script(const char* const unichar_repr,
	int length) const {
	return get_script(unichar_to_id(unichar_repr, length));
	}

	// Return the (current) number of scripts in the script table
	int get_script_table_size() const {
	return script_table_size_used;
	}

	// Return the script string from its id
	const char* get_script_from_script_id(int id) const {
		// Valid ids lie in [0, script_table_size_used); anything else is
		// answered with the null script rather than reading out of range.
		if (id < 0 || id >= script_table_size_used) {
			return null_script;
		}
		// The returned string is the table's own entry, not a copy.
		return script_table[id];
	}

	// Returns the id from the name of the script, or 0 if script is not found.
	// Note that this is an expensive operation since it involves iteratively
	// comparing strings in the script table. To avoid dependency on STL, we
	// won't use a hash. Instead, the calling function can use this to lookup
	// and save the ID for relevant scripts for fast comparisons later.
	int get_script_id_from_name(const char* script_name) const;

	// Return true if the given script is the null script
	bool is_null_script(const char* script) const {
		// Identity test on the pointer itself; string contents are not compared.
		const bool same_pointer = (script == null_script);
		return same_pointer;
	}

	// Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
	// then the returned script id will be the same.
	// The script parameter is copied and thus can be a temporary.
	int add_script(const char* script);

	// Return the enabled property of the given unichar.
	bool get_enabled(UNICHAR_ID unichar_id) const {
		// Direct slot lookup; unichar_id is not range-checked here.
		const bool is_enabled = unichars[unichar_id].properties.enabled;
		return is_enabled;
	}


	int null_sid() const { return null_sid_; }
	int common_sid() const { return common_sid_; }
	int latin_sid() const { return latin_sid_; }
	int cyrillic_sid() const { return cyrillic_sid_; }
	int greek_sid() const { return greek_sid_; }
	int han_sid() const { return han_sid_; }

	private:

	// Per-unichar attributes. One instance lives in every UNICHAR_SLOT and is
	// read and written through the set_*/get_* accessors of the enclosing class.
	struct UNICHAR_PROPERTIES {
	// Character-class flags, each with a matching set_is*/get_is* accessor.
	bool isalpha;
	bool islower;
	bool isupper;
	bool isdigit;
	bool ispunctuation;
	bool isngram;
	// Value reported by get_enabled().
	bool enabled;
	// Id returned by add_script() for this unichar's script (see set_script).
	int script_id;
	UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar

	// Contains meta information about the fragment if a unichar represents
	// a fragment of a character, otherwise should be set to NULL.
	// It is assumed that character fragments are added to the unicharset
	// after the corresponding 'base' characters.
	CHAR_FRAGMENT *fragment;
	};

	// One entry of the unichars array: a unichar's textual representation
	// together with its properties.
	struct UNICHAR_SLOT {
	// Fixed-size buffer for the representation; presumably UNICHAR_LEN bytes
	// plus a terminating NUL -- TODO confirm against the code that fills it.
	char representation[UNICHAR_LEN + 1];
	// Attributes accessed as unichars[id].properties by the accessors.
	UNICHAR_PROPERTIES properties;
	};

	// Array of slots, indexed directly by UNICHAR_ID in the accessors.
	UNICHAR_SLOT* unichars;
	// Lookup structure keyed by unichar representation; queried with
	// ids.contains(repr) before resolving a representation to an id.
	UNICHARMAP ids;
	// NOTE(review): presumably the number of slots in use / allocated in
	// unichars -- confirm against the code that grows the array.
	int size_used;
	int size_reserved;
	// Script strings indexed by script id; ids in
	// [0, script_table_size_used) are treated as valid.
	char** script_table;
	int script_table_size_used;
	// NOTE(review): presumably the allocated capacity of script_table --
	// confirm against add_script.
	int script_table_size_reserved;
	// Returned by get_script_from_script_id for out-of-range ids and compared
	// by pointer in is_null_script.
	const char* null_script;

	// A few convenient script name-to-id mappings without using hash.
	// These are initialized when unicharset file is loaded. Anything
	// missing from this list can be looked up using get_script_id_from_name.
	int null_sid_;
	int common_sid_;
	int latin_sid_;
	int cyrillic_sid_;
	int greek_sid_;
	int han_sid_;
	};

	#endif // TESSERACT_CCUTIL_UNICHARSET_H__