Fixes mismatch between Java and C++ indices.
am: 47c9e5861f
Change-Id: I418e76ea0041df6a65c6e635737fdbb0743c0915
diff --git a/tests/textclassifier_jni_test.cc b/tests/textclassifier_jni_test.cc
new file mode 100644
index 0000000..c441cf5
--- /dev/null
+++ b/tests/textclassifier_jni_test.cc
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "textclassifier_jni.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier {
+namespace {
+
+TEST(TextClassifier, ConvertIndicesBMPUTF8) {
+  EXPECT_EQ(ConvertIndicesBMPToUTF8("hello", {0, 5}),
+            ConvertIndicesUTF8ToBMP("hello", {0, 5}));
+
+  // Simple example where the longer character is before the selection.
+  // Character 😁 is U+1F601: one codepoint, but two Java string indices.
+  EXPECT_EQ(ConvertIndicesBMPToUTF8("😁 Hello World.", {3, 8}),
+            std::make_pair(2, 7));
+
+  EXPECT_EQ(ConvertIndicesUTF8ToBMP("😁 Hello World.", {2, 7}),
+            std::make_pair(3, 8));
+
+  // Longer character is before and in selection.
+  EXPECT_EQ(ConvertIndicesBMPToUTF8("😁 Hell😁 World.", {3, 9}),
+            std::make_pair(2, 7));
+
+  EXPECT_EQ(ConvertIndicesUTF8ToBMP("😁 Hell😁 World.", {2, 7}),
+            std::make_pair(3, 9));
+
+  // Longer character is before and after selection.
+  EXPECT_EQ(ConvertIndicesBMPToUTF8("😁 Hello😁World.", {3, 8}),
+            std::make_pair(2, 7));
+
+  EXPECT_EQ(ConvertIndicesUTF8ToBMP("😁 Hello😁World.", {2, 7}),
+            std::make_pair(3, 8));
+
+  // Longer character is before, in, and after selection.
+  EXPECT_EQ(ConvertIndicesBMPToUTF8("😁 Hell😁😁World.", {3, 9}),
+            std::make_pair(2, 7));
+
+  EXPECT_EQ(ConvertIndicesUTF8ToBMP("😁 Hell😁😁World.", {2, 7}),
+            std::make_pair(3, 9));
+}
+
+}  // namespace
+}  // namespace libtextclassifier
diff --git a/textclassifier_jni.cc b/textclassifier_jni.cc
index 4cb4af5..84a1e32 100644
--- a/textclassifier_jni.cc
+++ b/textclassifier_jni.cc
@@ -16,67 +16,14 @@
// Simple JNI wrapper for the SmartSelection library.
+#include "textclassifier_jni.h"
+
#include <jni.h>
#include <vector>
#include "lang_id/lang-id.h"
#include "smartselect/text-classification-model.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// SmartSelection.
-JNIEXPORT jlong JNICALL
-Java_android_view_textclassifier_SmartSelection_nativeNew(JNIEnv* env,
- jobject thiz,
- jint fd);
-
-JNIEXPORT jintArray JNICALL
-Java_android_view_textclassifier_SmartSelection_nativeSuggest(
- JNIEnv* env, jobject thiz, jlong ptr, jstring context, jint selection_begin,
- jint selection_end);
-
-JNIEXPORT jobjectArray JNICALL
-Java_android_view_textclassifier_SmartSelection_nativeClassifyText(
- JNIEnv* env, jobject thiz, jlong ptr, jstring context, jint selection_begin,
- jint selection_end, jint input_flags);
-
-JNIEXPORT void JNICALL
-Java_android_view_textclassifier_SmartSelection_nativeClose(JNIEnv* env,
- jobject thiz,
- jlong ptr);
-
-JNIEXPORT jstring JNICALL
-Java_android_view_textclassifier_SmartSelection_nativeGetLanguage(JNIEnv* env,
- jobject clazz,
- jint fd);
-
-JNIEXPORT jint JNICALL
-Java_android_view_textclassifier_SmartSelection_nativeGetVersion(JNIEnv* env,
- jobject clazz,
- jint fd);
-
-// LangId.
-JNIEXPORT jlong JNICALL Java_android_view_textclassifier_LangId_nativeNew(
- JNIEnv* env, jobject thiz, jint fd);
-
-JNIEXPORT jobjectArray JNICALL
-Java_android_view_textclassifier_LangId_nativeFindLanguages(JNIEnv* env,
- jobject thiz,
- jlong ptr,
- jstring text);
-
-JNIEXPORT void JNICALL Java_android_view_textclassifier_LangId_nativeClose(
- JNIEnv* env, jobject thiz, jlong ptr);
-
-JNIEXPORT int JNICALL Java_android_view_textclassifier_LangId_nativeGetVersion(
- JNIEnv* env, jobject clazz, jint fd);
-
-#ifdef __cplusplus
-}
-#endif
-
using libtextclassifier::TextClassificationModel;
using libtextclassifier::ModelOptions;
using libtextclassifier::nlp_core::lang_id::LangId;
@@ -143,6 +90,68 @@
} // namespace
+namespace libtextclassifier {
+
+using libtextclassifier::CodepointSpan;
+
+namespace {
+
+// Converts orig_indices between Java string indices (where a codepoint above
+// 0xFFFF counts twice) and codepoint indices of utf8_str; from_utf8 selects the
+// direction. An index that corresponds to no position is returned as -1.
+CodepointSpan ConvertIndicesBMPUTF8(const std::string& utf8_str,
+                                    CodepointSpan orig_indices,
+                                    bool from_utf8) {
+  const libtextclassifier::UnicodeText unicode_str =
+      libtextclassifier::UTF8ToUnicodeText(utf8_str, /*do_copy=*/false);
+
+  int unicode_index = 0;
+  int bmp_index = 0;
+  const int* source_index = from_utf8 ? &unicode_index : &bmp_index;
+  const int* target_index = from_utf8 ? &bmp_index : &unicode_index;
+
+  CodepointSpan result{-1, -1};
+  const auto assign_indices = [&]() {
+    if (orig_indices.first == *source_index) {
+      result.first = *target_index;
+    }
+    if (orig_indices.second == *source_index) {
+      result.second = *target_index;
+    }
+  };
+
+  for (auto it = unicode_str.begin(); it != unicode_str.end();
+       ++it, ++unicode_index, ++bmp_index) {
+    assign_indices();
+    // Codepoints above 0xFFFF take two Java string indices (a surrogate pair).
+    if (*it > 0xFFFF) {
+      ++bmp_index;
+    }
+  }
+  // A span end is exclusive, so it may equal the text length; that
+  // one-past-the-end position is never visited by the loop above.
+  assign_indices();
+  return result;
+}
+
+}  // namespace
+
+CodepointSpan ConvertIndicesBMPToUTF8(const std::string& utf8_str,
+                                      CodepointSpan orig_indices) {
+  return ConvertIndicesBMPUTF8(utf8_str, orig_indices, /*from_utf8=*/false);
+}
+
+CodepointSpan ConvertIndicesUTF8ToBMP(const std::string& utf8_str,
+                                      CodepointSpan orig_indices) {
+  return ConvertIndicesBMPUTF8(utf8_str, orig_indices, /*from_utf8=*/true);
+}
+
+}  // namespace libtextclassifier
+
+using libtextclassifier::ConvertIndicesUTF8ToBMP;
+using libtextclassifier::ConvertIndicesBMPToUTF8;
+using libtextclassifier::CodepointSpan;
+
JNIEXPORT jlong JNICALL
Java_android_view_textclassifier_SmartSelection_nativeNew(JNIEnv* env,
jobject thiz,
@@ -158,8 +167,12 @@
TextClassificationModel* model =
reinterpret_cast<TextClassificationModel*>(ptr);
- const libtextclassifier::CodepointSpan selection = model->SuggestSelection(
- ToStlString(env, context), {selection_begin, selection_end});
+ const std::string context_utf8 = ToStlString(env, context);
+ CodepointSpan input_indices =
+ ConvertIndicesBMPToUTF8(context_utf8, {selection_begin, selection_end});
+ CodepointSpan selection =
+ model->SuggestSelection(context_utf8, input_indices);
+ selection = ConvertIndicesUTF8ToBMP(context_utf8, selection);
jintArray result = env->NewIntArray(2);
env->SetIntArrayRegion(result, 0, 1, &(std::get<0>(selection)));
diff --git a/textclassifier_jni.h b/textclassifier_jni.h
new file mode 100644
index 0000000..28bb444
--- /dev/null
+++ b/textclassifier_jni.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBTEXTCLASSIFIER_TEXTCLASSIFIER_JNI_H_
+#define LIBTEXTCLASSIFIER_TEXTCLASSIFIER_JNI_H_
+
+#include <jni.h>
+#include <string>
+
+#include "smartselect/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// JNI entry points for android.view.textclassifier.SmartSelection.
+JNIEXPORT jlong JNICALL
+Java_android_view_textclassifier_SmartSelection_nativeNew(JNIEnv* env,
+ jobject thiz,
+ jint fd);
+
+JNIEXPORT jintArray JNICALL
+Java_android_view_textclassifier_SmartSelection_nativeSuggest(
+ JNIEnv* env, jobject thiz, jlong ptr, jstring context, jint selection_begin,
+ jint selection_end);
+
+JNIEXPORT jobjectArray JNICALL
+Java_android_view_textclassifier_SmartSelection_nativeClassifyText(
+ JNIEnv* env, jobject thiz, jlong ptr, jstring context, jint selection_begin,
+ jint selection_end, jint input_flags);
+
+JNIEXPORT void JNICALL
+Java_android_view_textclassifier_SmartSelection_nativeClose(JNIEnv* env,
+ jobject thiz,
+ jlong ptr);
+
+JNIEXPORT jstring JNICALL
+Java_android_view_textclassifier_SmartSelection_nativeGetLanguage(JNIEnv* env,
+ jobject clazz,
+ jint fd);
+
+JNIEXPORT jint JNICALL
+Java_android_view_textclassifier_SmartSelection_nativeGetVersion(JNIEnv* env,
+ jobject clazz,
+ jint fd);
+
+// JNI entry points for android.view.textclassifier.LangId.
+JNIEXPORT jlong JNICALL Java_android_view_textclassifier_LangId_nativeNew(
+ JNIEnv* env, jobject thiz, jint fd);
+
+JNIEXPORT jobjectArray JNICALL
+Java_android_view_textclassifier_LangId_nativeFindLanguages(JNIEnv* env,
+ jobject thiz,
+ jlong ptr,
+ jstring text);
+
+JNIEXPORT void JNICALL Java_android_view_textclassifier_LangId_nativeClose(
+ JNIEnv* env, jobject thiz, jlong ptr);
+
+JNIEXPORT int JNICALL Java_android_view_textclassifier_LangId_nativeGetVersion(
+ JNIEnv* env, jobject clazz, jint fd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+namespace libtextclassifier {
+
+// Given a UTF-8 string and a span expressed in Java string indices (a codepoint
+// above 0xFFFF occupies two of them), converts it to a span of codepoints.
+libtextclassifier::CodepointSpan ConvertIndicesBMPToUTF8(
+ const std::string& utf8_str, libtextclassifier::CodepointSpan bmp_indices);
+
+// Given a UTF-8 string and a span of codepoints, converts it to a span of Java
+// string indices (a codepoint above 0xFFFF occupies two of them).
+libtextclassifier::CodepointSpan ConvertIndicesUTF8ToBMP(
+ const std::string& utf8_str, libtextclassifier::CodepointSpan utf8_indices);
+
+} // namespace libtextclassifier
+
+#endif // LIBTEXTCLASSIFIER_TEXTCLASSIFIER_JNI_H_