blob: cf816eefa565a01163478e8131bf44ac7932737b [file] [log] [blame]
/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
#define LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

// Coarse, per-character script classification used by language identification.
// Only a small set of Unicode scripts is recognized individually: those that
// give a strong hint about the language of the text (e.g., Hiragana ->
// Japanese). Keeping the set small keeps the detection code compact and fast.
enum Script {
  // Reserved for internal errors in the script detection code. GetScript()
  // returns it when the byte count is not 1, 2, 3 or 4.
  kScriptError = 0,

  // Catch-all buckets for characters whose script is not detected
  // individually. There is one bucket per UTF-8 encoding length (1, 2, 3 and
  // 4 bytes), since that length is already known to the caller. As a rough
  // approximation, the one-byte bucket stands for ~Latin and the four-byte
  // bucket for ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  // Individually detected scripts.
  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts immediately above this comment.

  // Must stay last: its numeric value equals the number of enumerators that
  // precede it, which makes it easy to iterate over all scripts. Never add an
  // enumerator after it.
  kNumRelevantScripts,
};

// Returns true iff low <= value <= hi (both bounds inclusive).
template<typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  if (!(value >= low)) {
    return false;
  }
  return value <= hi;
}

// Classifies the UTF-8 encoded character that starts at address p.
//
// Precondition: p points to a valid UTF-8 character that is exactly num_bytes
// bytes long. The bytes are not validated here; only their payload bits are
// decoded.
//
// Returns one of the individually detected scripts when the decoded code
// point falls into its range, the kScriptOtherUtf8*Byte(s) bucket matching
// num_bytes otherwise, and kScriptError when num_bytes is not in 1..4.
inline Script GetScript(const unsigned char *p, int num_bytes) {
  switch (num_bytes) {
    case 1:
      return kScriptOtherUtf8OneByte;

    case 2: {
      // A 2-byte UTF-8 sequence carries 11 payload bits. unsigned int is
      // guaranteed to hold at least 16 bits, so it is wide enough, and it is
      // usually the fastest integer type on the target CPU.
      //
      // Greek [0x370, 0x3FF] and Cyrillic [0x400, 0x4FF] are adjacent, and so
      // are Hebrew [0x590, 0x5FF] and Arabic [0x600, 0x6FF]. The upper bounds
      // of Greek and Hebrew are therefore implied by the lower bounds of
      // their neighbours and are not declared (they would be unused).
      static const unsigned int kGreekFirst = 0x370;
      static const unsigned int kCyrillicFirst = 0x400;
      static const unsigned int kCyrillicLast = 0x4FF;
      static const unsigned int kHebrewFirst = 0x590;
      static const unsigned int kArabicFirst = 0x600;
      static const unsigned int kArabicLast = 0x6FF;

      // 110xxxxx 10yyyyyy -> xxxxxyyyyyy
      const unsigned int high_bits = p[0] & 0x1F;
      const unsigned int low_bits = p[1] & 0x3F;
      const unsigned int cp = (high_bits << 6) | low_bits;

      if (cp <= kCyrillicLast) {
        // Lower half of the decision tree: Cyrillic, Greek, or below both.
        if (cp >= kCyrillicFirst) {
          return kScriptCyrillic;
        }
        // Here cp < kCyrillicFirst, i.e., cp <= 0x3FF (the end of Greek).
        return (cp >= kGreekFirst) ? kScriptGreek : kScriptOtherUtf8TwoBytes;
      }

      if (cp < kArabicFirst) {
        // Here kCyrillicLast < cp <= 0x5FF (the end of Hebrew).
        return (cp >= kHebrewFirst) ? kScriptHebrew : kScriptOtherUtf8TwoBytes;
      }

      // Here cp >= kArabicFirst.
      return (cp <= kArabicLast) ? kScriptArabic : kScriptOtherUtf8TwoBytes;
    }

    case 3: {
      // A 3-byte UTF-8 sequence carries 16 payload bits, which still fits
      // into unsigned int (at least 16 bits wide).
      //
      // Katakana [0x30A0, 0x30FF] starts right after Hiragana
      // [0x3041, 0x309F], so its lower bound is implied and not declared.
      static const unsigned int kHangulJamoFirst = 0x1100;
      static const unsigned int kHangulJamoLast = 0x11FF;
      static const unsigned int kHiraganaFirst = 0x3041;
      static const unsigned int kHiraganaLast = 0x309F;
      static const unsigned int kKatakanaLast = 0x30FF;

      // 1110xxxx 10yyyyyy 10zzzzzz -> xxxxyyyyyyzzzzzz
      const unsigned int top_bits = p[0] & 0x0F;
      const unsigned int mid_bits = p[1] & 0x3F;
      const unsigned int bottom_bits = p[2] & 0x3F;
      const unsigned int cp = (top_bits << 12) | (mid_bits << 6) | bottom_bits;

      if (cp > kHiraganaLast) {
        // Here cp >= 0x30A0, the first Katakana code point.
        return (cp <= kKatakanaLast) ? kScriptKatakana
                                     : kScriptOtherUtf8ThreeBytes;
      }
      if (cp >= kHiraganaFirst) {
        return kScriptHiragana;
      }
      if (InRange(cp, kHangulJamoFirst, kHangulJamoLast)) {
        return kScriptHangulJamo;
      }
      return kScriptOtherUtf8ThreeBytes;
    }

    case 4:
      return kScriptOtherUtf8FourBytes;

    default:
      return kScriptError;
  }
}

// Convenience overload for callers that hold "char *" data. Whether plain
// char is signed or unsigned is implementation-defined, so the bytes are
// reinterpreted as unsigned char before decoding; the result is then the same
// regardless of the platform's choice. Same precondition and return value as
// the "unsigned char" overload.
inline Script GetScript(const char *p, int num_bytes) {
  const unsigned char *bytes = reinterpret_cast<const unsigned char *>(p);
  return GetScript(bytes, num_bytes);
}

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier
#endif // LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_