| /* |
| * Copyright (C) 2017 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_ |
| #define LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_ |
| |
| namespace libtextclassifier { |
| namespace nlp_core { |
| namespace lang_id { |
| |
// Coarse classification of a single UTF8 character.  Only a small set of
// Unicode scripts is recognized: the ones that are a strong hint about the
// language of the text (for instance, Hiragana suggests Japanese).  Keeping
// the set small keeps the detection code compact and fast.
enum Script {
  // Reported when the script detection code hits an internal error.
  kScriptError = 0,

  // Buckets for characters whose script is not detected individually.  There
  // is one bucket per UTF8 encoding length (1, 2, 3 or 4 bytes), because the
  // length is already known when a character is classified.  The one-byte
  // bucket is used as a proxy for ~Latin and the four-byte bucket as a proxy
  // for ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  // Scripts that are detected explicitly.
  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Mostly seen in Korean text.
  kScriptHiragana,    // Mostly seen in Japanese text.
  kScriptKatakana,    // Mostly seen in Japanese text.

  // New scripts go right above this line.

  // Must stay the last enumerator: its value equals the number of enumerators
  // that precede it, which makes it easy to iterate over all scripts.
  kNumRelevantScripts
};
| |
// Returns true if and only if value lies in the closed interval [low, hi].
// All three arguments must have the same type IntType.
template <typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  // Reject values below the lower bound first; what is left only needs to be
  // checked against the upper bound.
  if (!(value >= low)) {
    return false;
  }
  return value <= hi;
}
| |
| // Returns Script for the UTF8 character that starts at address p. |
| // Precondition: p points to a valid UTF8 character of num_bytes bytes. |
| inline Script GetScript(const unsigned char *p, int num_bytes) { |
| switch (num_bytes) { |
| case 1: |
| return kScriptOtherUtf8OneByte; |
| |
| case 2: { |
| // 2-byte UTF8 characters have 11 bits of information. unsigned int has |
| // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so |
| // it's enough. It's also usually the fastest int type on the current |
| // CPU, so it's better to use than int32. |
| static const unsigned int kGreekStart = 0x370; |
| |
| // Commented out (unsued in the code): kGreekEnd = 0x3FF; |
| static const unsigned int kCyrillicStart = 0x400; |
| static const unsigned int kCyrillicEnd = 0x4FF; |
| static const unsigned int kHebrewStart = 0x590; |
| |
| // Commented out (unsued in the code): kHebrewEnd = 0x5FF; |
| static const unsigned int kArabicStart = 0x600; |
| static const unsigned int kArabicEnd = 0x6FF; |
| const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F); |
| if (codepoint > kCyrillicEnd) { |
| if (codepoint >= kArabicStart) { |
| if (codepoint <= kArabicEnd) { |
| return kScriptArabic; |
| } |
| } else { |
| // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so |
| // codepoint <= kHebrewEnd. |
| if (codepoint >= kHebrewStart) { |
| return kScriptHebrew; |
| } |
| } |
| } else { |
| if (codepoint >= kCyrillicStart) { |
| return kScriptCyrillic; |
| } else { |
| // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so |
| // codepoint <= kGreekEnd. |
| if (codepoint >= kGreekStart) { |
| return kScriptGreek; |
| } |
| } |
| } |
| return kScriptOtherUtf8TwoBytes; |
| } |
| |
| case 3: { |
| // 3-byte UTF8 characters have 16 bits of information. unsigned int has |
| // at least 16 bits. |
| static const unsigned int kHangulJamoStart = 0x1100; |
| static const unsigned int kHangulJamoEnd = 0x11FF; |
| static const unsigned int kHiraganaStart = 0x3041; |
| static const unsigned int kHiraganaEnd = 0x309F; |
| |
| // Commented out (unsued in the code): kKatakanaStart = 0x30A0; |
| static const unsigned int kKatakanaEnd = 0x30FF; |
| const unsigned int codepoint = |
| ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F); |
| if (codepoint > kHiraganaEnd) { |
| // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so |
| // codepoint >= kKatakanaStart. |
| if (codepoint <= kKatakanaEnd) { |
| return kScriptKatakana; |
| } |
| } else { |
| if (codepoint >= kHiraganaStart) { |
| return kScriptHiragana; |
| } else { |
| if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) { |
| return kScriptHangulJamo; |
| } |
| } |
| } |
| return kScriptOtherUtf8ThreeBytes; |
| } |
| |
| case 4: |
| return kScriptOtherUtf8FourBytes; |
| |
| default: |
| return kScriptError; |
| } |
| } |
| |
| // Returns Script for the UTF8 character that starts at address p. Similar to |
| // the previous version of GetScript, except for "char" vs "unsigned char". |
| // Most code works with "char *" pointers, ignoring the fact that char is |
| // unsigned (by default) on most platforms, but signed on iOS. This code takes |
| // care of making sure we always treat chars as unsigned. |
| inline Script GetScript(const char *p, int num_bytes) { |
| return GetScript(reinterpret_cast<const unsigned char *>(p), |
| num_bytes); |
| } |
| |
| } // namespace lang_id |
| } // namespace nlp_core |
| } // namespace libtextclassifier |
| |
| #endif // LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_ |