Rewrite UTF-8 encoding.
Create a templated implementation to serve all the
UTF-8-related encodings. Use it for `CharsetUtils.toUtf8Bytes()`
and the JNI `GetStringUTFLength()`, `GetStringUTFRegion()` and
`GetStringUTFChars()`. Do not use it yet for the internal
`ConvertUtf16ToModifiedUtf8()`, which shall be switched to
actual Modified UTF-8 in a separate change.
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: atest CtsLibcoreTestCases:libcore.java.lang.StringTest
Bug: 192935764
Change-Id: Id3ec3a0bc2638becdf4181d9c2b6794c91a75314
diff --git a/libdexfile/dex/utf-inl.h b/libdexfile/dex/utf-inl.h
index 5355766..d255610 100644
--- a/libdexfile/dex/utf-inl.h
+++ b/libdexfile/dex/utf-inl.h
@@ -94,6 +94,55 @@
return GetTrailingUtf16Char(c1) - GetTrailingUtf16Char(c2);
}
+template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates, typename Append>
+inline void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append) {
+ static_assert(kUse4ByteSequence || !kReplaceBadSurrogates);
+
+ // Use local helpers instead of macros from `libicu` to avoid the dependency on `libicu`.
+ auto is_lead = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xfc00u) == 0xd800u; };
+ auto is_trail = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xfc00u) == 0xdc00u; };
+ auto is_surrogate = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xf800u) == 0xd800u; };
+ auto is_surrogate_lead = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0x0400u) == 0u; };
+ auto get_supplementary = [](uint16_t lead, uint16_t trail) ALWAYS_INLINE {
+ constexpr uint32_t offset = (0xd800u << 10) + 0xdc00u - 0x10000u;
+ return (static_cast<uint32_t>(lead) << 10) + static_cast<uint32_t>(trail) - offset;
+ };
+
+ for (size_t i = 0u; i < char_count; ++i) {
+ auto has_trail = [&]() { return i + 1u != char_count && is_trail(utf16[i + 1u]); };
+
+ uint16_t ch = utf16[i];
+ if (ch < 0x80u && (kUseShortZero || ch != 0u)) {
+ // One byte.
+ append(ch);
+ } else if (ch < 0x800u) {
+ // Two bytes.
+ append((ch >> 6) | 0xc0);
+ append((ch & 0x3f) | 0x80);
+ } else if (kReplaceBadSurrogates
+ ? is_surrogate(ch)
+ : kUse4ByteSequence && is_lead(ch) && has_trail()) {
+ if (kReplaceBadSurrogates && (!is_surrogate_lead(ch) || !has_trail())) {
+ append('?');
+ } else {
+ // We have a *valid* surrogate pair.
+ uint32_t code_point = get_supplementary(ch, utf16[i + 1u]);
+ ++i; // Consume the leading surrogate.
+ // Four bytes.
+ append((code_point >> 18) | 0xf0);
+ append(((code_point >> 12) & 0x3f) | 0x80);
+ append(((code_point >> 6) & 0x3f) | 0x80);
+ append((code_point & 0x3f) | 0x80);
+ }
+ } else {
+ // Three bytes.
+ append((ch >> 12) | 0xe0);
+ append(((ch >> 6) & 0x3f) | 0x80);
+ append((ch & 0x3f) | 0x80);
+ }
+ }
+}
+
} // namespace art
#endif // ART_LIBDEXFILE_DEX_UTF_INL_H_
diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h
index e3dc7f9..6949319 100644
--- a/libdexfile/dex/utf.h
+++ b/libdexfile/dex/utf.h
@@ -66,6 +66,24 @@
size_t utf16_length);
/*
+ * Helper template for converting UTF-16 to UTF-8 and similar encodings.
+ *
+ * Template arguments:
+ * kUseShortZero: Encode U+0000 as a single byte with value 0 (otherwise emit 0xc0 0x80).
+ * kUse4ByteSequence: Encode valid surrogate pairs as a 4-byte sequence.
+ * kReplaceBadSurrogates: Replace unmatched surrogates with '?' (otherwise use 3-byte sequence).
+ * Must be false if kUse4ByteSequence is false.
+ * Append: The type of the `append` functor. Should be deduced automatically.
+ *
+ * Encoding kUseShortZero kUse4ByteSequence kReplaceBadSurrogates
+ * UTF-8 true true true
+ * Modified UTF-8 false false n/a
+ * JNI GetStringUTFChars false true false
+ */
+template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates, typename Append>
+void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append);
+
+/*
* Convert from UTF-16 to Modified UTF-8. Note that the output is _not_
* NUL-terminated. You probably need to call CountUtf8Bytes before calling
* this anyway, so if you want a NUL-terminated string, you know where to
diff --git a/runtime/Android.bp b/runtime/Android.bp
index a801340..edc6a64 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -374,7 +374,6 @@
],
shared_libs: [
"libdl_android",
- "libicu",
"libstatssocket",
"libz", // For adler32.
"heapprofd_client_api",
@@ -405,9 +404,6 @@
"runtime_linux.cc",
"thread_linux.cc",
],
- header_libs: [
- "libicuuc_headers",
- ],
shared_libs: [
"libz", // For adler32.
],
diff --git a/runtime/jni/jni_internal.cc b/runtime/jni/jni_internal.cc
index b417bec..57102e3 100644
--- a/runtime/jni/jni_internal.cc
+++ b/runtime/jni/jni_internal.cc
@@ -214,6 +214,27 @@
bool has_bad_char_;
};
+// The JNI specification says that `GetStringUTFLength()`, `GetStringUTFChars()`
+// and `GetStringUTFRegion()` should emit the Modified UTF-8 encoding.
+// However, we have been emitting 4-byte UTF-8 sequences for several years now
+// and changing that would risk breaking a lot of binary interfaces.
+constexpr bool kUtfUseShortZero = false;
+constexpr bool kUtfUse4ByteSequence = true; // This is against the JNI spec.
+constexpr bool kUtfReplaceBadSurrogates = false;
+
+jsize GetUncompressedStringUTFLength(const uint16_t* chars, size_t length) {
+ jsize byte_count = 0;
+ ConvertUtf16ToUtf8<kUtfUseShortZero, kUtfUse4ByteSequence, kUtfReplaceBadSurrogates>(
+ chars, length, [&](char c ATTRIBUTE_UNUSED) { ++byte_count; });
+ return byte_count;
+}
+
+char* GetUncompressedStringUTFChars(const uint16_t* chars, size_t length, char* dest) {
+ ConvertUtf16ToUtf8<kUtfUseShortZero, kUtfUse4ByteSequence, kUtfReplaceBadSurrogates>(
+ chars, length, [&](char c) { *dest++ = c; });
+ return dest;
+}
+
} // namespace
// Consider turning this on when there is errors which could be related to JNI array copies such as
@@ -2044,7 +2065,10 @@
static jsize GetStringUTFLength(JNIEnv* env, jstring java_string) {
CHECK_NON_NULL_ARGUMENT_RETURN_ZERO(java_string);
ScopedObjectAccess soa(env);
- return soa.Decode<mirror::String>(java_string)->GetUtfLength();
+ ObjPtr<mirror::String> str = soa.Decode<mirror::String>(java_string);
+ return str->IsCompressed()
+ ? str->GetLength()
+ : GetUncompressedStringUTFLength(str->GetValue(), str->GetLength());
}
static void GetStringRegion(JNIEnv* env, jstring java_string, jsize start, jsize length,
@@ -2088,10 +2112,8 @@
}
buf[length] = '\0';
} else {
- const jchar* chars = s->GetValue();
- size_t bytes = CountUtf8Bytes(chars + start, length);
- ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length);
- buf[bytes] = '\0';
+ char* end = GetUncompressedStringUTFChars(s->GetValue() + start, length, buf);
+ *end = '\0';
}
}
}
@@ -2195,9 +2217,12 @@
if (is_copy != nullptr) {
*is_copy = JNI_TRUE;
}
+
ScopedObjectAccess soa(env);
ObjPtr<mirror::String> s = soa.Decode<mirror::String>(java_string);
- size_t byte_count = s->GetUtfLength();
+ size_t length = s->GetLength();
+ size_t byte_count =
+ s->IsCompressed() ? length : GetUncompressedStringUTFLength(s->GetValue(), length);
char* bytes = new char[byte_count + 1];
CHECK(bytes != nullptr); // bionic aborts anyway.
if (s->IsCompressed()) {
@@ -2206,8 +2231,8 @@
bytes[i] = src[i];
}
} else {
- const uint16_t* chars = s->GetValue();
- ConvertUtf16ToModifiedUtf8(bytes, byte_count, chars, s->GetLength());
+ char* end = GetUncompressedStringUTFChars(s->GetValue(), length, bytes);
+ DCHECK_EQ(byte_count, static_cast<size_t>(end - bytes));
}
bytes[byte_count] = '\0';
return bytes;
diff --git a/runtime/native/libcore_util_CharsetUtils.cc b/runtime/native/libcore_util_CharsetUtils.cc
index 56dca72..92f355e 100644
--- a/runtime/native/libcore_util_CharsetUtils.cc
+++ b/runtime/native/libcore_util_CharsetUtils.cc
@@ -18,6 +18,7 @@
#include <string.h>
+#include "dex/utf-inl.h"
#include "handle_scope-inl.h"
#include "jni/jni_internal.h"
#include "mirror/string-inl.h"
@@ -26,7 +27,6 @@
#include "nativehelper/scoped_primitive_array.h"
#include "nativehelper/jni_macros.h"
#include "scoped_fast_native_object_access-inl.h"
-#include "unicode/utf16.h"
namespace art {
@@ -125,48 +125,16 @@
DCHECK_GE(length, 0);
DCHECK_LE(length, string->GetLength() - offset);
- auto visit_chars16 = [string, offset, length](auto append) REQUIRES_SHARED(Locks::mutator_lock_) {
- const uint16_t* chars16 = string->GetValue() + offset;
- for (int i = 0; i < length; ++i) {
- jint ch = chars16[i];
- if (ch < 0x80) {
- // One byte.
- append(ch);
- } else if (ch < 0x800) {
- // Two bytes.
- append((ch >> 6) | 0xc0);
- append((ch & 0x3f) | 0x80);
- } else if (U16_IS_SURROGATE(ch)) {
- // A supplementary character.
- jchar high = static_cast<jchar>(ch);
- jchar low = (i + 1 != length) ? chars16[i + 1] : 0;
- if (!U16_IS_SURROGATE_LEAD(high) || !U16_IS_TRAIL(low)) {
- append('?');
- continue;
- }
- // Now we know we have a *valid* surrogate pair, we can consume the low surrogate.
- ++i;
- ch = U16_GET_SUPPLEMENTARY(high, low);
- // Four bytes.
- append((ch >> 18) | 0xf0);
- append(((ch >> 12) & 0x3f) | 0x80);
- append(((ch >> 6) & 0x3f) | 0x80);
- append((ch & 0x3f) | 0x80);
- } else {
- // Three bytes.
- append((ch >> 12) | 0xe0);
- append(((ch >> 6) & 0x3f) | 0x80);
- append((ch & 0x3f) | 0x80);
- }
- }
- };
-
bool compressed = string->IsCompressed();
size_t utf8_length = 0;
if (compressed) {
utf8_length = length;
} else {
- visit_chars16([&utf8_length](jbyte c ATTRIBUTE_UNUSED) { ++utf8_length; });
+ const uint16_t* utf16 = string->GetValue() + offset;
+ auto count_length = [&utf8_length](jbyte c ATTRIBUTE_UNUSED) ALWAYS_INLINE { ++utf8_length; };
+ ConvertUtf16ToUtf8</*kUseShortZero=*/ true,
+ /*kUse4ByteSequence=*/ true,
+ /*kReplaceBadSurrogates=*/ true>(utf16, length, count_length);
}
ObjPtr<mirror::ByteArray> result =
mirror::ByteArray::Alloc(soa.Self(), dchecked_integral_cast<int32_t>(utf8_length));
@@ -177,8 +145,12 @@
if (compressed) {
memcpy(result->GetData(), string->GetValueCompressed() + offset, length);
} else {
+ const uint16_t* utf16 = string->GetValue() + offset;
int8_t* data = result->GetData();
- visit_chars16([&data](jbyte c) { *data++ = c; });
+ auto store_data = [&data](jbyte c) ALWAYS_INLINE { *data++ = c; };
+ ConvertUtf16ToUtf8</*kUseShortZero=*/ true,
+ /*kUse4ByteSequence=*/ true,
+ /*kReplaceBadSurrogates=*/ true>(utf16, length, store_data);
}
return soa.AddLocalReference<jbyteArray>(result);
}