| /////////////////////////////////////////////////////////////////////// |
| // File: unicharset.h |
| // Description: Unicode character/ligature set class. |
| // Author: Thomas Kielbus |
| // Created: Wed Jun 28 17:05:01 PDT 2006 |
| // |
| // (C) Copyright 2006, Google Inc. |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| /////////////////////////////////////////////////////////////////////// |
| |
| #ifndef TESSERACT_CCUTIL_UNICHARSET_H__ |
| #define TESSERACT_CCUTIL_UNICHARSET_H__ |
| |
| #include "strngs.h" |
| #include "unichar.h" |
| #include "unicharmap.h" |
| |
| class CHAR_FRAGMENT { |
| public: |
| // Minimum number of characters used for fragment representation. |
| static const int kMinLen = 6; |
| // Maximum number of characters used for fragment representation. |
| static const int kMaxLen = 3 + UNICHAR_LEN + 2; |
| // Special character used in representing character fragments. |
| static const char kSeparator = '|'; |
| // Maximum number of fragments per character. |
| static const int kMaxChunks = 3; |
| |
| // Setters and Getters. |
| inline void set_all(const char *unichar, int pos, int total) { |
| this->set_unichar(unichar); |
| this->set_pos(pos); |
| this->set_total(total); |
| } |
| inline void set_unichar(const char *uch) { |
| strncpy(this->unichar, uch, UNICHAR_LEN); |
| this->unichar[UNICHAR_LEN] = '\0'; |
| } |
| inline void set_pos(int p) { this->pos = p; } |
| inline void set_total(int t) { this->total = t; } |
| inline const char* get_unichar() const { return this->unichar; } |
| inline int get_pos() const { return this->pos; } |
| inline int get_total() const { return this->total; } |
| |
| // Returns the string that represents a fragment |
| // with the given unichar, pos and total. |
| static STRING to_string(const char *unichar, int pos, int total) { |
| STRING result = ""; |
| result += kSeparator; |
| result += unichar; |
| char buffer[kMaxLen]; |
| snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total); |
| result += buffer; |
| return result; |
| } |
| // Returns the string that represents this fragment. |
| STRING to_string() const { |
| return to_string(this->unichar, this->pos, this->total); |
| } |
| |
| // Checks whether a fragment has the same unichar, |
| // position and total as the given inputs. |
| inline bool equals(const char *other_unichar, |
| int other_pos, int other_total) const { |
| return (strcmp(this->unichar, other_unichar) == 0 && |
| this->pos == other_pos && this->total == other_total); |
| } |
| inline bool equals(const CHAR_FRAGMENT *other) const { |
| return this->equals(other->get_unichar(), |
| other->get_pos(), |
| other->get_total()); |
| } |
| |
| // Checks whether a given fragment is a continuation of this fragment. |
| // Assumes that the given fragment pointer is not NULL. |
| inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { |
| return (strcmp(this->unichar, fragment->get_unichar()) == 0 && |
| this->total == fragment->get_total() && |
| this->pos == fragment->get_pos() + 1); |
| } |
| |
| // Returns true if this fragment is a beginning fragment. |
| inline bool is_beginning() const { return this->pos == 0; } |
| |
| // Returns true if this fragment is an ending fragment. |
| inline bool is_ending() const { return this->pos == this->total-1; } |
| |
| // Parses the string to see whether it represents a character fragment |
| // (rather than a regular character). If so, allocates memory for a new |
| // CHAR_FRAGMENT instance and fills it in with the corresponding fragment |
| // information. Fragments are of the form: |
| // |m|1|2, meaning chunk 1 of 2 of character m. |
| // |
| // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT |
| // instance, otherwise (if the string does not represent a fragment or it |
| // looks like it does, but parsing it as a fragment fails) returns NULL. |
| // |
| // Note: The caller is responsible for deallocating memory |
| // associated with the returned pointer. |
| static CHAR_FRAGMENT *parse_from_string(const char *str); |
| |
| private: |
| char unichar[UNICHAR_LEN + 1]; |
| inT16 pos; // fragment position in the character |
| inT16 total; // total number of fragments in the character |
| }; |
| |
// The UNICHARSET class is a utility class for Tesseract that holds the
// set of characters that are used by the engine. Each character is identified
// by a unique number, from 0 to (size - 1).
| class UNICHARSET { |
| public: |
| |
| // Create an empty UNICHARSET |
| UNICHARSET(); |
| |
| ~UNICHARSET(); |
| |
| // Return the UNICHAR_ID of a given unichar representation within the |
| // UNICHARSET. |
| const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const; |
| |
| // Return the UNICHAR_ID of a given unichar representation within the |
| // UNICHARSET. Only the first length characters from unichar_repr are used. |
| const UNICHAR_ID unichar_to_id(const char* const unichar_repr, |
| int length) const; |
| |
| // Return the minimum number of bytes that matches a legal UNICHAR_ID, |
| // while leaving a legal UNICHAR_ID afterwards. In other words, if there |
| // is both a short and a long match to the string, return the length that |
| // ensures there is a legal match after it. |
| int step(const char* str) const; |
| |
| // Return the unichar representation corresponding to the given UNICHAR_ID |
| // within the UNICHARSET. |
| const char* const id_to_unichar(UNICHAR_ID id) const; |
| |
| // Return a STRING that reformats the utf8 str into the str followed |
| // by its hex unicodes. |
| static STRING debug_utf8_str(const char* str); |
| |
| // Return a STRING containing debug information on the unichar, including |
| // the id_to_unichar, its hex unicodes and the properties. |
| STRING debug_str(UNICHAR_ID id) const; |
| STRING debug_str(const char * unichar_repr) const { |
| return debug_str(unichar_to_id(unichar_repr)); |
| } |
| |
| // Add a unichar representation to the set. |
| void unichar_insert(const char* const unichar_repr); |
| |
| // Return true if the given unichar id exists within the set. |
| // Relies on the fact that unichar ids are contiguous in the unicharset. |
| bool contains_unichar_id(UNICHAR_ID unichar_id) { |
| return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used; |
| } |
| |
| // Return true if the given unichar representation exists within the set. |
| bool contains_unichar(const char* const unichar_repr); |
| bool contains_unichar(const char* const unichar_repr, int length); |
| |
| // Return true if the given unichar representation corresponds to the given |
| // UNICHAR_ID within the set. |
| bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr); |
| |
| // Delete CHAR_FRAGMENTs stored in properties of unichars array. |
| void delete_fragments() { |
| for (int i = 0; i < size_used; ++i) { |
| if (unichars[i].properties.fragment != NULL) { |
| delete unichars[i].properties.fragment; |
| unichars[i].properties.fragment = NULL; |
| } |
| } |
| } |
| |
| // Clear the UNICHARSET (all the previous data is lost). |
| void clear() { |
| if (size_reserved > 0) { |
| for (int i = 0; i < script_table_size_used; ++i) |
| delete[] script_table[i]; |
| delete[] script_table; |
| script_table = 0; |
| script_table_size_reserved = 0; |
| script_table_size_used = 0; |
| delete_fragments(); |
| delete[] unichars; |
| unichars = 0; |
| size_reserved = 0; |
| size_used = 0; |
| } |
| ids.clear(); |
| } |
| |
| // Return the size of the set (the number of different UNICHAR it holds). |
| int size() const { |
| return size_used; |
| } |
| |
| // Reserve enough memory space for the given number of UNICHARS |
| void reserve(int unichars_number); |
| |
| // Save the content of the UNICHARSET to the given file. Return true if the |
| // operation is successful. |
| bool save_to_file(const char* const filename) const; |
| |
| // Load the UNICHARSET from the given file. The previous data is lost. Return |
| // true if the operation is successful. |
| bool load_from_file(const char* const filename); |
| |
| // Set a whitelist and/or blacklist of characters to recognize. |
| // An empty or NULL whitelist enables everything (minus any blacklist). |
| // An empty or NULL blacklist disables nothing. |
| // The blacklist overrides the whitelist. |
| // Each list is a string of utf8 character strings. Boundaries between |
| // unicharset units are worked out automatically, and characters not in |
| // the unicharset are silently ignored. |
| void set_black_and_whitelist(const char* blacklist, const char* whitelist); |
| |
| // Set the isalpha property of the given unichar to the given value. |
| void set_isalpha(UNICHAR_ID unichar_id, bool value) { |
| unichars[unichar_id].properties.isalpha = value; |
| } |
| |
| // Set the islower property of the given unichar to the given value. |
| void set_islower(UNICHAR_ID unichar_id, bool value) { |
| unichars[unichar_id].properties.islower = value; |
| } |
| |
| // Set the isupper property of the given unichar to the given value. |
| void set_isupper(UNICHAR_ID unichar_id, bool value) { |
| unichars[unichar_id].properties.isupper = value; |
| } |
| |
| // Set the isdigit property of the given unichar to the given value. |
| void set_isdigit(UNICHAR_ID unichar_id, bool value) { |
| unichars[unichar_id].properties.isdigit = value; |
| } |
| |
| // Set the script name of the given unichar to the given value. |
| // Value is copied and thus can be a temporary; |
| void set_script(UNICHAR_ID unichar_id, const char* value) { |
| unichars[unichar_id].properties.script_id = add_script(value); |
| } |
| |
| // Return the isalpha property of the given unichar. |
| bool get_isalpha(UNICHAR_ID unichar_id) const { |
| return unichars[unichar_id].properties.isalpha; |
| } |
| |
| // Return the islower property of the given unichar. |
| bool get_islower(UNICHAR_ID unichar_id) const { |
| return unichars[unichar_id].properties.islower; |
| } |
| |
| // Return the isupper property of the given unichar. |
| bool get_isupper(UNICHAR_ID unichar_id) const { |
| return unichars[unichar_id].properties.isupper; |
| } |
| |
| // Return the isdigit property of the given unichar. |
| bool get_isdigit(UNICHAR_ID unichar_id) const { |
| return unichars[unichar_id].properties.isdigit; |
| } |
| |
| // Return the script name of the given unichar. |
| // The returned pointer will always be the same for the same script, it's |
| // managed by unicharset and thus MUST NOT be deleted |
| int get_script(UNICHAR_ID unichar_id) const { |
| return unichars[unichar_id].properties.script_id; |
| } |
| |
| // Return a pointer to the CHAR_FRAGMENT class if the given |
| // unichar id represents a character fragment. |
| const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { |
| return unichars[unichar_id].properties.fragment; |
| } |
| |
| // Return the isalpha property of the given unichar representation. |
| bool get_isalpha(const char* const unichar_repr) const { |
| return get_isalpha(unichar_to_id(unichar_repr)); |
| } |
| |
| // Return the islower property of the given unichar representation. |
| bool get_islower(const char* const unichar_repr) const { |
| return get_islower(unichar_to_id(unichar_repr)); |
| } |
| |
| // Return the isupper property of the given unichar representation. |
| bool get_isupper(const char* const unichar_repr) const { |
| return get_isupper(unichar_to_id(unichar_repr)); |
| } |
| |
| // Return the isdigit property of the given unichar representation. |
| bool get_isdigit(const char* const unichar_repr) const { |
| return get_isdigit(unichar_to_id(unichar_repr)); |
| } |
| |
| // Return the script name of the given unichar representation. |
| // The returned pointer will always be the same for the same script, it's |
| // managed by unicharset and thus MUST NOT be deleted |
| int get_script(const char* const unichar_repr) const { |
| return get_script(unichar_to_id(unichar_repr)); |
| } |
| |
| // Return a pointer to the CHAR_FRAGMENT class struct if the given |
| // unichar representation represents a character fragment. |
| const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const { |
| if (unichar_repr == NULL || unichar_repr[0] == '\0' || |
| !ids.contains(unichar_repr)) { |
| return NULL; |
| } |
| return get_fragment(unichar_to_id(unichar_repr)); |
| } |
| |
| // Return the isalpha property of the given unichar representation. |
| // Only the first length characters from unichar_repr are used. |
| bool get_isalpha(const char* const unichar_repr, |
| int length) const { |
| return get_isalpha(unichar_to_id(unichar_repr, length)); |
| } |
| |
| // Return the islower property of the given unichar representation. |
| // Only the first length characters from unichar_repr are used. |
| bool get_islower(const char* const unichar_repr, |
| int length) const { |
| return get_islower(unichar_to_id(unichar_repr, length)); |
| } |
| |
| // Return the isupper property of the given unichar representation. |
| // Only the first length characters from unichar_repr are used. |
| bool get_isupper(const char* const unichar_repr, |
| int length) const { |
| return get_isupper(unichar_to_id(unichar_repr, length)); |
| } |
| |
| // Return the isdigit property of the given unichar representation. |
| // Only the first length characters from unichar_repr are used. |
| bool get_isdigit(const char* const unichar_repr, |
| int length) const { |
| return get_isdigit(unichar_to_id(unichar_repr, length)); |
| } |
| |
| // Return the script name of the given unichar representation. |
| // Only the first length characters from unichar_repr are used. |
| // The returned pointer will always be the same for the same script, it's |
| // managed by unicharset and thus MUST NOT be deleted |
| int get_script(const char* const unichar_repr, |
| int length) const { |
| return get_script(unichar_to_id(unichar_repr, length)); |
| } |
| |
| // Return the (current) number of scripts in the script table |
| int get_script_table_size() const { |
| return script_table_size_used; |
| } |
| |
| // Return the script string from its id |
| const char* get_script_from_script_id(int id) const { |
| if (id >= script_table_size_used || id < 0) |
| return null_script; |
| return script_table[id]; |
| } |
| |
| // Return true if the given script is the null script |
| bool is_null_script(const char* script) const { |
| return script == null_script; |
| } |
| |
| // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0, |
| // then the returned pointer will be the same. |
| // The script parameter is copied and thus can be a temporary. |
| int add_script(const char* script); |
| |
| // Return the enabled property of the given unichar. |
| bool get_enabled(UNICHAR_ID unichar_id) const { |
| return unichars[unichar_id].properties.enabled; |
| } |
| |
| private: |
| |
| struct UNICHAR_PROPERTIES { |
| bool isalpha; |
| bool islower; |
| bool isupper; |
| bool isdigit; |
| bool enabled; |
| int script_id; |
| |
| // Contains meta information about the fragment if a unichar represents |
| // a fragment of a character, otherwise should be set to NULL. |
| // It is assumed that character fragments are added to the unicharset |
| // after the corresponding 'base' characters. |
| CHAR_FRAGMENT *fragment; |
| }; |
| |
| struct UNICHAR_SLOT { |
| char representation[UNICHAR_LEN + 1]; |
| UNICHAR_PROPERTIES properties; |
| }; |
| |
| UNICHAR_SLOT* unichars; |
| UNICHARMAP ids; |
| int size_used; |
| int size_reserved; |
| char** script_table; |
| int script_table_size_used; |
| int script_table_size_reserved; |
| const char* null_script; |
| }; |
| |
| #endif // TESSERACT_CCUTIL_UNICHARSET_H__ |