| /* |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // An implementation of UniLib that uses Android Java interfaces via JNI. The |
| // performance-critical ops have been re-implemented in C++. |
| // Note: this class must be compatible with API level 14 (ICS). |
| |
| #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_ |
| #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_ |
| |
| #include <jni.h> |
| #include <memory> |
| #include <string> |
| |
| #include "utils/base/integral_types.h" |
| #include "utils/java/jni-cache.h" |
| #include "utils/java/scoped_global_ref.h" |
| #include "utils/java/scoped_local_ref.h" |
| #include "utils/utf8/unicodetext.h" |
| |
| namespace libtextclassifier3 { |
| |
| class UniLib { |
| public: |
| UniLib(); |
| explicit UniLib(const std::shared_ptr<JniCache>& jni_cache); |
| |
| bool ParseInt32(const UnicodeText& text, int* result) const; |
| bool IsOpeningBracket(char32 codepoint) const; |
| bool IsClosingBracket(char32 codepoint) const; |
| bool IsWhitespace(char32 codepoint) const; |
| bool IsDigit(char32 codepoint) const; |
| bool IsUpper(char32 codepoint) const; |
| |
| char32 ToLower(char32 codepoint) const; |
| char32 GetPairedBracket(char32 codepoint) const; |
| |
| // Forward declaration so that RegexMatcher can name RegexPattern as a friend. |
| class RegexPattern; |
| |
| class RegexMatcher { |
| public: |
| static constexpr int kError = -1; |
| static constexpr int kNoError = 0; |
| |
| // Checks whether the input text matches the pattern exactly. |
| bool Matches(int* status) const; |
| |
| // Approximation of Matches() built on top of Find(): it takes the first |
| // Find() result and then checks that it spans the whole input. |
| // NOTE: Unlike Matches(), this can produce false negatives. |
| // NOTE: Resets the matcher, so the current Find() state will be lost. |
| bool ApproximatelyMatches(int* status); |
| |
| // Finds occurrences of the pattern in the input text. |
| // Can be called repeatedly to find all occurrences. Each call updates the |
| // internal state, so that 'Start', 'End' and 'Group' can be called to get |
| // information about the match. |
| // NOTE: Any call to ApproximatelyMatches() in between Find() calls will |
| // modify the state. |
| bool Find(int* status); |
| |
| // Gets the start offset of the last match (from 'Find'). Use the overload |
| // taking 'group_idx' for the offset of an individual group. |
| // Sets status to 'kError' if 'Find' was not called previously. |
| int Start(int* status) const; |
| |
| // Gets the start offset of the specified group of the last match |
| // (from 'Find'). |
| // Sets status to 'kError' if an invalid group was specified or if 'Find' |
| // was not called previously. |
| int Start(int group_idx, int* status) const; |
| |
| // Gets the end offset of the last match (from 'Find'). Use the overload |
| // taking 'group_idx' for the offset of an individual group. |
| // Sets status to 'kError' if 'Find' was not called previously. |
| int End(int* status) const; |
| |
| // Gets the end offset of the specified group of the last match |
| // (from 'Find'). |
| // Sets status to 'kError' if an invalid group was specified or if 'Find' |
| // was not called previously. |
| int End(int group_idx, int* status) const; |
| |
| // Gets the text of the last match (from 'Find'). |
| // Sets status to 'kError' if 'Find' was not called previously. |
| UnicodeText Group(int* status) const; |
| |
| // Gets the text of the specified group of the last match (from 'Find'). |
| // Sets status to 'kError' if an invalid group was specified or if 'Find' |
| // was not called previously. |
| UnicodeText Group(int group_idx, int* status) const; |
| |
| protected: |
| friend class RegexPattern; |
| RegexMatcher(const JniCache* jni_cache, ScopedGlobalRef<jobject> matcher, |
| ScopedGlobalRef<jstring> text); |
| |
| private: |
| bool UpdateLastFindOffset() const; |
| |
| const JniCache* jni_cache_; |
| ScopedGlobalRef<jobject> matcher_; |
| ScopedGlobalRef<jstring> text_; |
| mutable int last_find_offset_ = 0; |
| mutable int last_find_offset_codepoints_ = 0; |
| mutable bool last_find_offset_dirty_ = true; |
| }; |
| |
| class RegexPattern { |
| public: |
| std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& context) const; |
| |
| protected: |
| friend class UniLib; |
| RegexPattern(const JniCache* jni_cache, const UnicodeText& regex); |
| |
| private: |
| const JniCache* jni_cache_; |
| ScopedGlobalRef<jobject> pattern_; |
| }; |
| |
| class BreakIterator { |
| public: |
| int Next(); |
| |
| static constexpr int kDone = -1; |
| |
| protected: |
| friend class UniLib; |
| BreakIterator(const JniCache* jni_cache, const UnicodeText& text); |
| |
| private: |
| const JniCache* jni_cache_; |
| ScopedGlobalRef<jstring> text_; |
| ScopedGlobalRef<jobject> iterator_; |
| int last_break_index_; |
| int last_unicode_index_; |
| }; |
| |
| std::unique_ptr<RegexPattern> CreateRegexPattern( |
| const UnicodeText& regex) const; |
| std::unique_ptr<BreakIterator> CreateBreakIterator( |
| const UnicodeText& text) const; |
| |
| private: |
| std::shared_ptr<JniCache> jni_cache_; |
| }; |
| |
| } // namespace libtextclassifier3 |
| |
| #endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_ |