| /* |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_APPROX_SCRIPT_H_ |
| #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_APPROX_SCRIPT_H_ |
| |
| #include "lang_id/common/utf8.h" |
| #include "lang_id/script/script-detector.h" |
| |
| namespace libtextclassifier3 { |
| namespace mobile { |
| |
// Computes the script of a single UTF-8 character.
//
// |s| is the address of the character's first byte and |num_bytes| is the
// number of bytes that character occupies.  The behavior is unspecified when
// the character at |s| actually has a different number of bytes.  Callers that
// do not know |num_bytes| should use GetApproxScript(const char *s) instead.
//
// The return type is int only to keep BUILD deps small; the value can be
// assumed to be an enum UScriptCode (see unicode/uscript.h).
//
// When the script cannot be determined, the result is kUnknownUscript, i.e.,
// the int value of USCRIPT_UNKNOWN from enum UScriptCode.
int GetApproxScript(const unsigned char *s, int num_bytes);
| |
// Value returned by the GetApproxScript() functions when they are unable to
// determine the script: the int value of USCRIPT_UNKNOWN from enum UScriptCode.
extern const int kUnknownUscript;
| |
| // Same as before, but s is a const char *pointer (no unsigned). Internally, we |
| // prefer "unsigned char" (the signed status of char is ambiguous), so we cast |
| // and call the previous version (with const unsigned char *). |
| inline int GetApproxScript(const char *s, int num_bytes) { |
| return GetApproxScript(reinterpret_cast<const unsigned char *>(s), num_bytes); |
| } |
| |
| // Returns script for the UTF-8 character that starts at address |s|. NOTE: |
| // UTF-8 is a var-length encoding, taking between 1 and 4 bytes per Unicode |
| // character. We infer the number of bytes based on s[0]. If that number is k, |
| // we expect to be able to read k bytes starting from address |s|. I.e., do not |
| // call this function on broken UTF-8. |
| inline int GetApproxScript(const char *s) { |
| return GetApproxScript(s, utils::OneCharLen(s)); |
| } |
| |
// Returns the largest value that the GetApproxScript() functions can return.
int GetMaxApproxScriptResult();
| |
| class ApproxScriptDetector : public ScriptDetector { |
| public: |
| ~ApproxScriptDetector() override = default; |
| |
| // Note: the int result of this method is actually a UScriptCode enum value. |
| // We return int to match the general case from the base class ScriptDetector |
| // (some script detectors do not use UScriptCode). |
| int GetScript(const char *s, int num_bytes) const override { |
| return GetApproxScript(s, num_bytes); |
| } |
| |
| int GetMaxScript() const override { |
| return GetMaxApproxScriptResult(); |
| } |
| |
| SAFTM_DEFINE_REGISTRATION_METHOD("approx-unicode-script-detector", |
| ApproxScriptDetector); |
| }; |
| |
| } // namespace mobile |
| } // namespace libtextclassifier3 |
| |
| #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_APPROX_SCRIPT_H_ |