// blob: a5ea54fe5d8d33c13a39188a3e4c409ba61ed934 [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// An implementation of UniLib that uses Android Java interfaces via JNI. The
// performance-critical ops have been re-implemented in C++.
// In particular, this class must be compatible with API level 14 (ICS).
#ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
#define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
#include <jni.h>
#include <memory>
#include <string>
#include "utils/base/integral_types.h"
#include "utils/java/jni-cache.h"
#include "utils/java/scoped_global_ref.h"
#include "utils/java/scoped_local_ref.h"
#include "utils/utf8/unicodetext.h"
namespace libtextclassifier3 {
// Unicode utility library that reaches Android's Java interfaces through a
// JniCache. It bundles codepoint helpers with nested wrappers for regular
// expressions (RegexPattern / RegexMatcher) and text segmentation
// (BreakIterator). Only declarations live in this header; all member
// functions are defined elsewhere.
class UniLib {
 public:
  // Default constructor. How the JNI cache is obtained in this case is decided
  // by the definition, which is not part of this header.
  UniLib();

  // Constructs an instance from an existing JNI cache.
  // NOTE(review): presumably retained in `jni_cache_` -- confirm in the
  // definition.
  explicit UniLib(const std::shared_ptr<JniCache>& jni_cache);

  // Parses `text` as a 32-bit integer into `*result`.
  // NOTE(review): the bool return presumably signals success -- confirm.
  bool ParseInt32(const UnicodeText& text, int* result) const;

  // Codepoint classification and mapping helpers. Their semantics follow their
  // names; the definitions are not visible from this header.
  bool IsOpeningBracket(char32 codepoint) const;
  bool IsClosingBracket(char32 codepoint) const;
  bool IsWhitespace(char32 codepoint) const;
  bool IsDigit(char32 codepoint) const;
  bool IsUpper(char32 codepoint) const;
  char32 ToLower(char32 codepoint) const;
  char32 GetPairedBracket(char32 codepoint) const;

  // Forward declaration for friend.
  class RegexPattern;

  // Stateful matcher bound to one pattern and one input text. Its constructor
  // is protected and RegexPattern is a friend, so instances are obtained from
  // RegexPattern::Matcher().
  class RegexMatcher {
   public:
    // Status codes written through the `status` out-parameters below.
    static constexpr int kError = -1;
    static constexpr int kNoError = 0;

    // Checks whether the input text matches the pattern exactly.
    bool Matches(int* status) const;

    // Approximate Matches() implementation implemented using Find(). It uses
    // the first Find() result and then checks that it spans the whole input.
    // NOTE: Unlike Matches() it can result in false negatives.
    // NOTE: Resets the matcher, so the current Find() state will be lost.
    bool ApproximatelyMatches(int* status);

    // Finds occurrences of the pattern in the input text.
    // Can be called repeatedly to find all occurrences. A call will update
    // internal state, so that 'Start', 'End' and 'Group' can be called to get
    // information about the match.
    // NOTE: Any call to ApproximatelyMatches() in between Find() calls will
    // modify the state.
    bool Find(int* status);

    // Gets the start offset of the last match (from 'Find').
    // Sets status to 'kError' if 'Find'
    // was not called previously.
    int Start(int* status) const;

    // Gets the start offset of the specified group of the last match.
    // (from 'Find').
    // Sets status to 'kError' if an invalid group was specified or if 'Find'
    // was not called previously.
    int Start(int group_idx, int* status) const;

    // Gets the end offset of the last match (from 'Find').
    // Sets status to 'kError' if 'Find'
    // was not called previously.
    int End(int* status) const;

    // Gets the end offset of the specified group of the last match.
    // (from 'Find').
    // Sets status to 'kError' if an invalid group was specified or if 'Find'
    // was not called previously.
    int End(int group_idx, int* status) const;

    // Gets the text of the last match (from 'Find').
    // Sets status to 'kError' if 'Find' was not called previously.
    UnicodeText Group(int* status) const;

    // Gets the text of the specified group of the last match (from 'Find').
    // Sets status to 'kError' if an invalid group was specified or if 'Find'
    // was not called previously.
    UnicodeText Group(int group_idx, int* status) const;

   protected:
    friend class RegexPattern;
    // Takes the Java matcher object and the Java string it operates on, both
    // passed as global references by value.
    RegexMatcher(const JniCache* jni_cache, ScopedGlobalRef<jobject> matcher,
                 ScopedGlobalRef<jstring> text);

   private:
    // Refreshes the cached offset fields below. Declared const, which is why
    // those fields are `mutable`.
    bool UpdateLastFindOffset() const;

    // Raw pointer to the JNI cache.
    // NOTE(review): presumably non-owning, with the cache kept alive by the
    // UniLib that created the pattern -- confirm the lifetime requirement.
    const JniCache* jni_cache_;
    ScopedGlobalRef<jobject> matcher_;
    ScopedGlobalRef<jstring> text_;

    // Offset bookkeeping for the last Find() result.
    // NOTE(review): presumably the offset in Java string units and its
    // codepoint equivalent, recomputed when the dirty flag is set -- confirm
    // in the implementation.
    mutable int last_find_offset_ = 0;
    mutable int last_find_offset_codepoints_ = 0;
    mutable bool last_find_offset_dirty_ = true;
  };

  // Compiled regular expression. Its constructor is protected and UniLib is a
  // friend, so instances are obtained from UniLib::CreateRegexPattern().
  class RegexPattern {
   public:
    // Creates a matcher that applies this pattern to `context`.
    std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& context) const;

   protected:
    friend class UniLib;
    RegexPattern(const JniCache* jni_cache, const UnicodeText& regex);

   private:
    // Raw pointer to the JNI cache.
    // NOTE(review): presumably non-owning -- confirm the lifetime requirement.
    const JniCache* jni_cache_;
    ScopedGlobalRef<jobject> pattern_;
  };

  // Iterator over break positions in a text. Its constructor is protected and
  // UniLib is a friend, so instances are obtained from
  // UniLib::CreateBreakIterator().
  class BreakIterator {
   public:
    // Advances the iterator and returns an int position.
    // NOTE(review): presumably returns kDone once no further break exists, and
    // the unit of the returned index should be confirmed in the definition.
    int Next();

    static constexpr int kDone = -1;

   protected:
    friend class UniLib;
    BreakIterator(const JniCache* jni_cache, const UnicodeText& text);

   private:
    // Raw pointer to the JNI cache.
    // NOTE(review): presumably non-owning -- confirm the lifetime requirement.
    const JniCache* jni_cache_;
    ScopedGlobalRef<jstring> text_;
    ScopedGlobalRef<jobject> iterator_;

    // Position bookkeeping between Next() calls. Initialized in-class, like
    // RegexMatcher's offset fields, so they never hold an indeterminate value
    // even if a constructor does not set them explicitly; a constructor
    // mem-initializer still takes precedence over these defaults.
    int last_break_index_ = 0;
    int last_unicode_index_ = 0;
  };

  // Factory for RegexPattern objects built from `regex`.
  std::unique_ptr<RegexPattern> CreateRegexPattern(
      const UnicodeText& regex) const;

  // Factory for BreakIterator objects over `text`.
  std::unique_ptr<BreakIterator> CreateBreakIterator(
      const UnicodeText& text) const;

 private:
  // Shared handle to the JNI cache; the nested helper classes receive a raw
  // `const JniCache*` in their constructors.
  std::shared_ptr<JniCache> jni_cache_;
};
} // namespace libtextclassifier3
#endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_