// blob: 442d06ec25b6c5c817219e9f4c09fd526a9c8c57 [file] [log] [blame]
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <jni.h>
#include <string>
#include "absl/memory/memory.h"
#include "tensorflow_lite_support/cc/text/tokenizers/bert_tokenizer.h"
#include "tensorflow_lite_support/cc/text/tokenizers/tokenizer_jni_lib.h"
#include "tensorflow_lite_support/cc/utils/jni_utils.h"
namespace tflite {
namespace support {
using ::tflite::support::text::tokenizer::BertTokenizer;
using ::tflite::support::text::tokenizer::BertTokenizerOptions;
using ::tflite::support::utils::StringListToVector;
extern "C" JNIEXPORT jlong JNICALL
Java_org_tensorflow_lite_support_text_tokenizers_BertTokenizer_nativeLoadResource( // NOLINT
JNIEnv* env, jobject thiz, jobject vocab_list, jint max_bytes_per_token,
jint max_chars_per_sub_token, jstring jsuffix_indicator,
jboolean use_unknown_token, jstring junknown_token,
jboolean split_unknown_chars) {
// Convert java.util.List<String> into std::vector<string>
std::vector<std::string> vocab = StringListToVector(env, vocab_list);
// Convert jstrings to std::string
const char* raw_suffix_indicator =
env->GetStringUTFChars(jsuffix_indicator, JNI_FALSE);
std::string suffix_indicator(raw_suffix_indicator);
const char* raw_unknown_token =
env->GetStringUTFChars(junknown_token, JNI_FALSE);
std::string unknown_token(raw_unknown_token);
auto handle = absl::make_unique<BertTokenizer>(
vocab, BertTokenizerOptions{
.max_bytes_per_token = max_bytes_per_token,
.max_chars_per_subtoken = max_chars_per_sub_token,
.suffix_indicator = suffix_indicator,
.use_unknown_token = static_cast<bool>(use_unknown_token),
.unknown_token = unknown_token,
.split_unknown_chars = static_cast<bool>(split_unknown_chars),
.delim_str = text::tokenizer::kDefaultDelimRe,
.include_delim_str = text::tokenizer::kDefaultIncludeDelimRe});
env->ReleaseStringUTFChars(jsuffix_indicator, raw_suffix_indicator);
env->ReleaseStringUTFChars(junknown_token, raw_unknown_token);
return reinterpret_cast<jlong>(handle.release());
}
// Destroys the BertTokenizer whose address is stored in `handle`.
//
// `handle` is reinterpreted as a BertTokenizer* and deleted. `env` and `thiz`
// are not used. Always returns 0.
extern "C" JNIEXPORT jlong JNICALL
Java_org_tensorflow_lite_support_text_tokenizers_BertTokenizer_nativeUnloadResource(  // NOLINT
    JNIEnv* env, jobject thiz, jlong handle) {
  BertTokenizer* const tokenizer = reinterpret_cast<BertTokenizer*>(handle);
  delete tokenizer;
  const jlong cleared_handle = 0;
  return cleared_handle;
}
// JNI entry point for BertTokenizer tokenization.
//
// Passes `env`, `handle` and `jtext` unchanged to the nativeTokenize helper
// (presumably declared in tokenizer_jni_lib.h; confirm) and returns its
// result. `thiz` is not used.
extern "C" JNIEXPORT jobjectArray JNICALL
Java_org_tensorflow_lite_support_text_tokenizers_BertTokenizer_nativeTokenize(
    JNIEnv* env, jobject thiz, jlong handle, jstring jtext) {
  jobjectArray tokens = nativeTokenize(env, handle, jtext);
  return tokens;
}
// JNI entry point for converting tokens to ids with a BertTokenizer.
//
// Passes `env`, `handle` and `jtokens` unchanged to the
// nativeConvertTokensToIds helper (presumably declared in
// tokenizer_jni_lib.h; confirm) and returns its result. `thiz` is not used.
extern "C" JNIEXPORT jintArray JNICALL
Java_org_tensorflow_lite_support_text_tokenizers_BertTokenizer_nativeConvertTokensToIds(  // NOLINT
    JNIEnv* env, jobject thiz, jlong handle, jobjectArray jtokens) {
  jintArray token_ids = nativeConvertTokensToIds(env, handle, jtokens);
  return token_ids;
}
} // namespace support
} // namespace tflite