// blob: 713ae03f404c994c319e56df53c44766f9e6d9bd [file] [log] [blame]
///////////////////////////////////////////////////////////////////////
// File: unicharset.h
// Description: Unicode character/ligature set class.
// Author: Thomas Kielbus
// Created: Wed Jun 28 17:05:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_UNICHARSET_H__
#define TESSERACT_CCUTIL_UNICHARSET_H__
#include "strngs.h"
#include "unichar.h"
#include "unicharmap.h"
class CHAR_FRAGMENT {
public:
// Minimum number of characters used for fragment representation.
static const int kMinLen = 6;
// Maximum number of characters used for fragment representation.
static const int kMaxLen = 3 + UNICHAR_LEN + 2;
// Special character used in representing character fragments.
static const char kSeparator = '|';
// Maximum number of fragments per character.
static const int kMaxChunks = 3;
// Setters and Getters.
inline void set_all(const char *unichar, int pos, int total) {
this->set_unichar(unichar);
this->set_pos(pos);
this->set_total(total);
}
inline void set_unichar(const char *uch) {
strncpy(this->unichar, uch, UNICHAR_LEN);
this->unichar[UNICHAR_LEN] = '\0';
}
inline void set_pos(int p) { this->pos = p; }
inline void set_total(int t) { this->total = t; }
inline const char* get_unichar() const { return this->unichar; }
inline int get_pos() const { return this->pos; }
inline int get_total() const { return this->total; }
// Returns the string that represents a fragment
// with the given unichar, pos and total.
static STRING to_string(const char *unichar, int pos, int total) {
STRING result = "";
result += kSeparator;
result += unichar;
char buffer[kMaxLen];
snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total);
result += buffer;
return result;
}
// Returns the string that represents this fragment.
STRING to_string() const {
return to_string(this->unichar, this->pos, this->total);
}
// Checks whether a fragment has the same unichar,
// position and total as the given inputs.
inline bool equals(const char *other_unichar,
int other_pos, int other_total) const {
return (strcmp(this->unichar, other_unichar) == 0 &&
this->pos == other_pos && this->total == other_total);
}
inline bool equals(const CHAR_FRAGMENT *other) const {
return this->equals(other->get_unichar(),
other->get_pos(),
other->get_total());
}
// Checks whether a given fragment is a continuation of this fragment.
// Assumes that the given fragment pointer is not NULL.
inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
this->total == fragment->get_total() &&
this->pos == fragment->get_pos() + 1);
}
// Returns true if this fragment is a beginning fragment.
inline bool is_beginning() const { return this->pos == 0; }
// Returns true if this fragment is an ending fragment.
inline bool is_ending() const { return this->pos == this->total-1; }
// Parses the string to see whether it represents a character fragment
// (rather than a regular character). If so, allocates memory for a new
// CHAR_FRAGMENT instance and fills it in with the corresponding fragment
// information. Fragments are of the form:
// |m|1|2, meaning chunk 1 of 2 of character m.
//
// If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
// instance, otherwise (if the string does not represent a fragment or it
// looks like it does, but parsing it as a fragment fails) returns NULL.
//
// Note: The caller is responsible for deallocating memory
// associated with the returned pointer.
static CHAR_FRAGMENT *parse_from_string(const char *str);
private:
char unichar[UNICHAR_LEN + 1];
inT16 pos; // fragment position in the character
inT16 total; // total number of fragments in the character
};
// The UNICHARSET class is a utility class for Tesseract that holds the
// set of characters that are used by the engine. Each character is identified
// by a unique number, from 0 to (size - 1).
class UNICHARSET {
  // Ownership note: the unichars array, the script table (and the strings
  // it holds) and the CHAR_FRAGMENTs stored in unichar properties are
  // released by clear() / delete_fragments().
  // NOTE(review): no copy constructor or assignment operator is declared
  // here, so copying an instance would presumably share those raw pointers.
  // Confirm that no caller copies a UNICHARSET.
 public:
  // Create an empty UNICHARSET
  UNICHARSET();
  ~UNICHARSET();

  // Return the UNICHAR_ID of a given unichar representation within the
  // UNICHARSET.
  const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;

  // Return the UNICHAR_ID of a given unichar representation within the
  // UNICHARSET. Only the first length characters from unichar_repr are used.
  const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
                                 int length) const;

  // Return the minimum number of bytes that matches a legal UNICHAR_ID,
  // while leaving a legal UNICHAR_ID afterwards. In other words, if there
  // is both a short and a long match to the string, return the length that
  // ensures there is a legal match after it.
  int step(const char* str) const;

  // Return the unichar representation corresponding to the given UNICHAR_ID
  // within the UNICHARSET.
  const char* const id_to_unichar(UNICHAR_ID id) const;

  // Return a STRING that reformats the utf8 str into the str followed
  // by its hex unicodes.
  static STRING debug_utf8_str(const char* str);

  // Return a STRING containing debug information on the unichar, including
  // the id_to_unichar, its hex unicodes and the properties.
  STRING debug_str(UNICHAR_ID id) const;
  // Same as above, after converting the representation with unichar_to_id().
  STRING debug_str(const char * unichar_repr) const {
    return debug_str(unichar_to_id(unichar_repr));
  }

  // Add a unichar representation to the set.
  void unichar_insert(const char* const unichar_repr);

  // Return true if the given unichar id exists within the set.
  // Relies on the fact that unichar ids are contiguous in the unicharset,
  // i.e. valid ids are exactly 0 .. size_used - 1. Negative ids are
  // rejected so that the result is safe to use as an index into the set.
  bool contains_unichar_id(UNICHAR_ID unichar_id) const {
    return unichar_id != INVALID_UNICHAR_ID &&
           unichar_id >= 0 && unichar_id < size_used;
  }

  // Return true if the given unichar representation exists within the set.
  bool contains_unichar(const char* const unichar_repr);
  bool contains_unichar(const char* const unichar_repr, int length);

  // Return true if the given unichar representation corresponds to the given
  // UNICHAR_ID within the set.
  bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr);

  // Delete CHAR_FRAGMENTs stored in properties of unichars array.
  // Each pointer is reset to NULL so a second call is harmless.
  void delete_fragments() {
    for (int i = 0; i < size_used; ++i) {
      // Deleting a NULL pointer is a no-op, so no guard is required.
      delete unichars[i].properties.fragment;
      unichars[i].properties.fragment = NULL;
    }
  }

  // Clear the UNICHARSET (all the previous data is lost).
  // The script table and the unichar storage are released independently:
  // a script table that was populated while no unichar storage was reserved
  // is still freed.
  void clear() {
    if (script_table_size_reserved > 0) {
      for (int i = 0; i < script_table_size_used; ++i)
        delete[] script_table[i];
      delete[] script_table;
      script_table = 0;
      script_table_size_reserved = 0;
      script_table_size_used = 0;
    }
    if (size_reserved > 0) {
      // Fragments must be deleted before the array that points to them.
      delete_fragments();
      delete[] unichars;
      unichars = 0;
      size_reserved = 0;
      size_used = 0;
    }
    ids.clear();
  }

  // Return the size of the set (the number of different UNICHAR it holds).
  int size() const {
    return size_used;
  }

  // Reserve enough memory space for the given number of UNICHARS
  void reserve(int unichars_number);

  // Save the content of the UNICHARSET to the given file. Return true if the
  // operation is successful.
  bool save_to_file(const char* const filename) const;

  // Load the UNICHARSET from the given file. The previous data is lost. Return
  // true if the operation is successful.
  bool load_from_file(const char* const filename);

  // Set a whitelist and/or blacklist of characters to recognize.
  // An empty or NULL whitelist enables everything (minus any blacklist).
  // An empty or NULL blacklist disables nothing.
  // The blacklist overrides the whitelist.
  // Each list is a string of utf8 character strings. Boundaries between
  // unicharset units are worked out automatically, and characters not in
  // the unicharset are silently ignored.
  void set_black_and_whitelist(const char* blacklist, const char* whitelist);

  // The property setters and getters below index the unichars array
  // directly with the given id and perform no range check; the caller must
  // pass an id that is valid for this set (see contains_unichar_id()).

  // Set the isalpha property of the given unichar to the given value.
  void set_isalpha(UNICHAR_ID unichar_id, bool value) {
    unichars[unichar_id].properties.isalpha = value;
  }

  // Set the islower property of the given unichar to the given value.
  void set_islower(UNICHAR_ID unichar_id, bool value) {
    unichars[unichar_id].properties.islower = value;
  }

  // Set the isupper property of the given unichar to the given value.
  void set_isupper(UNICHAR_ID unichar_id, bool value) {
    unichars[unichar_id].properties.isupper = value;
  }

  // Set the isdigit property of the given unichar to the given value.
  void set_isdigit(UNICHAR_ID unichar_id, bool value) {
    unichars[unichar_id].properties.isdigit = value;
  }

  // Set the script of the given unichar to the script named by value.
  // The name is passed to add_script() and the resulting script id is
  // stored. Value is copied and thus can be a temporary.
  void set_script(UNICHAR_ID unichar_id, const char* value) {
    unichars[unichar_id].properties.script_id = add_script(value);
  }

  // Return the isalpha property of the given unichar.
  bool get_isalpha(UNICHAR_ID unichar_id) const {
    return unichars[unichar_id].properties.isalpha;
  }

  // Return the islower property of the given unichar.
  bool get_islower(UNICHAR_ID unichar_id) const {
    return unichars[unichar_id].properties.islower;
  }

  // Return the isupper property of the given unichar.
  bool get_isupper(UNICHAR_ID unichar_id) const {
    return unichars[unichar_id].properties.isupper;
  }

  // Return the isdigit property of the given unichar.
  bool get_isdigit(UNICHAR_ID unichar_id) const {
    return unichars[unichar_id].properties.isdigit;
  }

  // Return the script id of the given unichar. The id can be turned into
  // the script name with get_script_from_script_id().
  int get_script(UNICHAR_ID unichar_id) const {
    return unichars[unichar_id].properties.script_id;
  }

  // Return a pointer to the CHAR_FRAGMENT class if the given
  // unichar id represents a character fragment.
  // The pointer is owned by the unicharset and MUST NOT be deleted.
  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
    return unichars[unichar_id].properties.fragment;
  }

  // The overloads below take a unichar representation, convert it with
  // unichar_to_id() and forward to the id-based getter.

  // Return the isalpha property of the given unichar representation.
  bool get_isalpha(const char* const unichar_repr) const {
    return get_isalpha(unichar_to_id(unichar_repr));
  }

  // Return the islower property of the given unichar representation.
  bool get_islower(const char* const unichar_repr) const {
    return get_islower(unichar_to_id(unichar_repr));
  }

  // Return the isupper property of the given unichar representation.
  bool get_isupper(const char* const unichar_repr) const {
    return get_isupper(unichar_to_id(unichar_repr));
  }

  // Return the isdigit property of the given unichar representation.
  bool get_isdigit(const char* const unichar_repr) const {
    return get_isdigit(unichar_to_id(unichar_repr));
  }

  // Return the script id of the given unichar representation.
  int get_script(const char* const unichar_repr) const {
    return get_script(unichar_to_id(unichar_repr));
  }

  // Return a pointer to the CHAR_FRAGMENT class struct if the given
  // unichar representation represents a character fragment.
  // Returns NULL when unichar_repr is NULL, empty, or not in the set.
  const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
    if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
        !ids.contains(unichar_repr)) {
      return NULL;
    }
    return get_fragment(unichar_to_id(unichar_repr));
  }

  // Return the isalpha property of the given unichar representation.
  // Only the first length characters from unichar_repr are used.
  bool get_isalpha(const char* const unichar_repr,
                   int length) const {
    return get_isalpha(unichar_to_id(unichar_repr, length));
  }

  // Return the islower property of the given unichar representation.
  // Only the first length characters from unichar_repr are used.
  bool get_islower(const char* const unichar_repr,
                   int length) const {
    return get_islower(unichar_to_id(unichar_repr, length));
  }

  // Return the isupper property of the given unichar representation.
  // Only the first length characters from unichar_repr are used.
  bool get_isupper(const char* const unichar_repr,
                   int length) const {
    return get_isupper(unichar_to_id(unichar_repr, length));
  }

  // Return the isdigit property of the given unichar representation.
  // Only the first length characters from unichar_repr are used.
  bool get_isdigit(const char* const unichar_repr,
                   int length) const {
    return get_isdigit(unichar_to_id(unichar_repr, length));
  }

  // Return the script id of the given unichar representation.
  // Only the first length characters from unichar_repr are used.
  int get_script(const char* const unichar_repr,
                 int length) const {
    return get_script(unichar_to_id(unichar_repr, length));
  }

  // Return the (current) number of scripts in the script table
  int get_script_table_size() const {
    return script_table_size_used;
  }

  // Return the script string from its id. Ids outside the range
  // 0 .. get_script_table_size() - 1 yield the null script.
  // The returned pointer is managed by the unicharset and thus
  // MUST NOT be deleted.
  const char* get_script_from_script_id(int id) const {
    if (id >= script_table_size_used || id < 0)
      return null_script;
    return script_table[id];
  }

  // Return true if the given script is the null script.
  // This compares pointers, not string contents, so it only recognizes
  // the null script pointer itself (as returned by
  // get_script_from_script_id() for an out-of-range id).
  bool is_null_script(const char* script) const {
    return script == null_script;
  }

  // Uniquify the given script and return its id. For two scripts a and b,
  // if strcmp(a, b) == 0, then the returned id will be the same.
  // The script parameter is copied and thus can be a temporary.
  int add_script(const char* script);

  // Return the enabled property of the given unichar.
  bool get_enabled(UNICHAR_ID unichar_id) const {
    return unichars[unichar_id].properties.enabled;
  }

 private:
  struct UNICHAR_PROPERTIES {
    bool isalpha;
    bool islower;
    bool isupper;
    bool isdigit;
    bool enabled;
    int script_id;  // id of the unichar's script, as returned by add_script()
    // Contains meta information about the fragment if a unichar represents
    // a fragment of a character, otherwise should be set to NULL.
    // It is assumed that character fragments are added to the unicharset
    // after the corresponding 'base' characters.
    CHAR_FRAGMENT *fragment;
  };

  struct UNICHAR_SLOT {
    char representation[UNICHAR_LEN + 1];
    UNICHAR_PROPERTIES properties;
  };

  UNICHAR_SLOT* unichars;          // array of slots, indexed by UNICHAR_ID
  UNICHARMAP ids;                  // lookup used to test/resolve representations
  int size_used;                   // number of slots in use (see size())
  int size_reserved;               // non-zero once unichars has been allocated
  char** script_table;             // script name strings, indexed by script id
  int script_table_size_used;      // number of scripts in script_table
  int script_table_size_reserved;  // non-zero once script_table is allocated
  const char* null_script;         // returned for out-of-range script ids
};
#endif // TESSERACT_CCUTIL_UNICHARSET_H__