Rewrite UTF-8 encoding.

Create a templated implementation to serve all the UTF-8
related encodings. Use it for `CharsetUtils.toUtf8Bytes()`
and the JNI `GetStringUTFChars()`. Do not use it yet for the
internal `ConvertUtf16ToModifiedUtf8()` which shall be
switched to actual Modified UTF-8 in a separate change.

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: atest CtsLibcoreTestCases:libcore.java.lang.StringTest
Bug: 192935764
Change-Id: Id3ec3a0bc2638becdf4181d9c2b6794c91a75314

commit: 09bfdf1700feeedf85c4a53502d3c14b4d3f41fd [log] [tgz]
author: Vladimir Marko <vmarko@google.com> Mon Jul 19 12:17:20 2021 +0100
committer: Vladimir Marko <vmarko@google.com> Thu Jul 22 14:09:53 2021 +0000
tree: bf6c232234d4c41697a78868e51b7dc07eabb011
parent: 1b621225a8869f930d530e4855157a444971aca1 [diff]
diff --git a/libdexfile/dex/utf-inl.h b/libdexfile/dex/utf-inl.h
index 5355766..d255610 100644
--- a/libdexfile/dex/utf-inl.h
+++ b/libdexfile/dex/utf-inl.h

@@ -94,6 +94,55 @@
   return GetTrailingUtf16Char(c1) - GetTrailingUtf16Char(c2);
 }
 
+template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates, typename Append>
+inline void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append) {
+  static_assert(kUse4ByteSequence || !kReplaceBadSurrogates);  // '?' replacement needs 4-byte mode.
+  // Encodes `char_count` UTF-16 units from `utf16`, calling `append` once per output byte.
+  // Use local helpers instead of macros from `libicu` to avoid the dependency on `libicu`.
+  auto is_lead = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xfc00u) == 0xd800u; };
+  auto is_trail = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xfc00u) == 0xdc00u; };
+  auto is_surrogate = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xf800u) == 0xd800u; };
+  auto is_surrogate_lead = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0x0400u) == 0u; };
+  auto get_supplementary = [](uint16_t lead, uint16_t trail) ALWAYS_INLINE {
+    constexpr uint32_t offset = (0xd800u << 10) + 0xdc00u - 0x10000u;
+    return (static_cast<uint32_t>(lead) << 10) + static_cast<uint32_t>(trail) - offset;
+  };
+
+  for (size_t i = 0u; i < char_count; ++i) {
+    auto has_trail = [&]() { return i + 1u != char_count && is_trail(utf16[i + 1u]); };
+
+    uint16_t ch = utf16[i];
+    if (ch < 0x80u && (kUseShortZero || ch != 0u)) {
+      // One byte: ASCII; U+0000 only if kUseShortZero, else it takes the two-byte path.
+      append(ch);
+    } else if (ch < 0x800u) {
+      // Two bytes: U+0080..U+07FF, plus U+0000 as 0xc0 0x80 when !kUseShortZero.
+      append((ch >> 6) | 0xc0);
+      append((ch & 0x3f) | 0x80);
+    } else if (kReplaceBadSurrogates
+                   ? is_surrogate(ch)
+                   : kUse4ByteSequence && is_lead(ch) && has_trail()) {
+      if (kReplaceBadSurrogates && (!is_surrogate_lead(ch) || !has_trail())) {
+        append('?');  // Lone trail, or lead without a following trail.
+      } else {
+        // We have a *valid* surrogate pair: lead at `i`, trail at `i + 1`.
+        uint32_t code_point = get_supplementary(ch, utf16[i + 1u]);
+        ++i;  // Step over the lead; the loop increment then steps over the trail.
+        // Four bytes: the supplementary code point combined from the pair.
+        append((code_point >> 18) | 0xf0);
+        append(((code_point >> 12) & 0x3f) | 0x80);
+        append(((code_point >> 6) & 0x3f) | 0x80);
+        append((code_point & 0x3f) | 0x80);
+      }
+    } else {
+      // Three bytes: U+0800..U+FFFF, including any surrogate not handled above.
+      append((ch >> 12) | 0xe0);
+      append(((ch >> 6) & 0x3f) | 0x80);
+      append((ch & 0x3f) | 0x80);
+    }
+  }
+}
+
 }  // namespace art
 
 #endif  // ART_LIBDEXFILE_DEX_UTF_INL_H_

diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h
index e3dc7f9..6949319 100644
--- a/libdexfile/dex/utf.h
+++ b/libdexfile/dex/utf.h

@@ -66,6 +66,24 @@
                                                 size_t utf16_length);
 
 /*
+ * Helper template for converting UTF-16 to UTF-8 and similar encodings.
+ *
+ * Template arguments:
+ *    kUseShortZero: Encode U+0000 as a single byte with value 0 (otherwise emit 0xc0 0x80).
+ *    kUse4ByteSequence: Encode valid surrogate pairs as a 4-byte sequence (else 3 bytes each).
+ *    kReplaceBadSurrogates: Replace unmatched surrogates with '?' (otherwise use 3-byte sequence).
+ *                           Must be false if kUse4ByteSequence is false.
+ *    Append: The type of the `append` functor, called once per output byte. Should be deduced.
+ *
+ * Encoding               kUseShortZero kUse4ByteSequence kReplaceBadSurrogates
+ * UTF-8                  true          true              true
+ * Modified UTF-8         false         false             false (n/a)
+ * JNI GetStringUTFChars  false         true              false
+ */
+template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates, typename Append>
+void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append);
+
+/*
  * Convert from UTF-16 to Modified UTF-8. Note that the output is _not_
  * NUL-terminated. You probably need to call CountUtf8Bytes before calling
  * this anyway, so if you want a NUL-terminated string, you know where to

diff --git a/runtime/Android.bp b/runtime/Android.bp
index a801340..edc6a64 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp

@@ -374,7 +374,6 @@
             ],
             shared_libs: [
                 "libdl_android",
-                "libicu",
                 "libstatssocket",
                 "libz", // For adler32.
                 "heapprofd_client_api",
@@ -405,9 +404,6 @@
                 "runtime_linux.cc",
                 "thread_linux.cc",
             ],
-            header_libs: [
-                "libicuuc_headers",
-            ],
             shared_libs: [
                 "libz", // For adler32.
             ],

diff --git a/runtime/jni/jni_internal.cc b/runtime/jni/jni_internal.cc
index b417bec..57102e3 100644
--- a/runtime/jni/jni_internal.cc
+++ b/runtime/jni/jni_internal.cc

@@ -214,6 +214,27 @@
   bool has_bad_char_;
 };
 
+// The JNI specification says that `GetStringUTFLength()`, `GetStringUTFChars()`
+// and `GetStringUTFRegion()` should emit the Modified UTF-8 encoding.
+// However, we have been emitting 4-byte UTF-8 sequences for several years now
+// and changing that would risk breaking a lot of binary interfaces.
+constexpr bool kUtfUseShortZero = false;  // U+0000 is emitted as 0xc0 0x80.
+constexpr bool kUtfUse4ByteSequence = true;  // This is against the JNI spec.
+constexpr bool kUtfReplaceBadSurrogates = false;  // Unmatched surrogates use 3-byte sequences.
+
+jsize GetUncompressedStringUTFLength(const uint16_t* chars, size_t length) {
+  jsize byte_count = 0;  // Incremented once per byte the conversion emits.
+  ConvertUtf16ToUtf8<kUtfUseShortZero, kUtfUse4ByteSequence, kUtfReplaceBadSurrogates>(
+      chars, length, [&](char c ATTRIBUTE_UNUSED) { ++byte_count; });
+  return byte_count;  // Does not include a terminating NUL.
+}
+
+char* GetUncompressedStringUTFChars(const uint16_t* chars, size_t length, char* dest) {
+  ConvertUtf16ToUtf8<kUtfUseShortZero, kUtfUse4ByteSequence, kUtfReplaceBadSurrogates>(
+      chars, length, [&](char c) { *dest++ = c; });  // No bounds check on `dest`.
+  return dest;  // One past the last byte written; no NUL terminator is stored here.
+}
+
 }  // namespace
 
 // Consider turning this on when there is errors which could be related to JNI array copies such as
@@ -2044,7 +2065,10 @@
   static jsize GetStringUTFLength(JNIEnv* env, jstring java_string) {
     CHECK_NON_NULL_ARGUMENT_RETURN_ZERO(java_string);
     ScopedObjectAccess soa(env);
-    return soa.Decode<mirror::String>(java_string)->GetUtfLength();
+    ObjPtr<mirror::String> str = soa.Decode<mirror::String>(java_string);
+    return str->IsCompressed()
+        ? str->GetLength()
+        : GetUncompressedStringUTFLength(str->GetValue(), str->GetLength());
   }
 
   static void GetStringRegion(JNIEnv* env, jstring java_string, jsize start, jsize length,
@@ -2088,10 +2112,8 @@
         }
         buf[length] = '\0';
       } else {
-        const jchar* chars = s->GetValue();
-        size_t bytes = CountUtf8Bytes(chars + start, length);
-        ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length);
-        buf[bytes] = '\0';
+        char* end = GetUncompressedStringUTFChars(s->GetValue() + start, length, buf);
+        *end = '\0';
       }
     }
   }
@@ -2195,9 +2217,12 @@
     if (is_copy != nullptr) {
       *is_copy = JNI_TRUE;
     }
+
     ScopedObjectAccess soa(env);
     ObjPtr<mirror::String> s = soa.Decode<mirror::String>(java_string);
-    size_t byte_count = s->GetUtfLength();
+    size_t length = s->GetLength();
+    size_t byte_count =
+        s->IsCompressed() ? length : GetUncompressedStringUTFLength(s->GetValue(), length);
     char* bytes = new char[byte_count + 1];
     CHECK(bytes != nullptr);  // bionic aborts anyway.
     if (s->IsCompressed()) {
@@ -2206,8 +2231,8 @@
         bytes[i] = src[i];
       }
     } else {
-      const uint16_t* chars = s->GetValue();
-      ConvertUtf16ToModifiedUtf8(bytes, byte_count, chars, s->GetLength());
+      char* end = GetUncompressedStringUTFChars(s->GetValue(), length, bytes);
+      DCHECK_EQ(byte_count, static_cast<size_t>(end - bytes));
     }
     bytes[byte_count] = '\0';
     return bytes;

diff --git a/runtime/native/libcore_util_CharsetUtils.cc b/runtime/native/libcore_util_CharsetUtils.cc
index 56dca72..92f355e 100644
--- a/runtime/native/libcore_util_CharsetUtils.cc
+++ b/runtime/native/libcore_util_CharsetUtils.cc

@@ -18,6 +18,7 @@
 
 #include <string.h>
 
+#include "dex/utf-inl.h"
 #include "handle_scope-inl.h"
 #include "jni/jni_internal.h"
 #include "mirror/string-inl.h"
@@ -26,7 +27,6 @@
 #include "nativehelper/scoped_primitive_array.h"
 #include "nativehelper/jni_macros.h"
 #include "scoped_fast_native_object_access-inl.h"
-#include "unicode/utf16.h"
 
 namespace art {
 
@@ -125,48 +125,16 @@
   DCHECK_GE(length, 0);
   DCHECK_LE(length, string->GetLength() - offset);
 
-  auto visit_chars16 = [string, offset, length](auto append) REQUIRES_SHARED(Locks::mutator_lock_) {
-    const uint16_t* chars16 = string->GetValue() + offset;
-    for (int i = 0; i < length; ++i) {
-      jint ch = chars16[i];
-      if (ch < 0x80) {
-        // One byte.
-        append(ch);
-      } else if (ch < 0x800) {
-        // Two bytes.
-        append((ch >> 6) | 0xc0);
-        append((ch & 0x3f) | 0x80);
-      } else if (U16_IS_SURROGATE(ch)) {
-        // A supplementary character.
-        jchar high = static_cast<jchar>(ch);
-        jchar low = (i + 1 != length) ? chars16[i + 1] : 0;
-        if (!U16_IS_SURROGATE_LEAD(high) || !U16_IS_TRAIL(low)) {
-          append('?');
-          continue;
-        }
-        // Now we know we have a *valid* surrogate pair, we can consume the low surrogate.
-        ++i;
-        ch = U16_GET_SUPPLEMENTARY(high, low);
-        // Four bytes.
-        append((ch >> 18) | 0xf0);
-        append(((ch >> 12) & 0x3f) | 0x80);
-        append(((ch >> 6) & 0x3f) | 0x80);
-        append((ch & 0x3f) | 0x80);
-      } else {
-        // Three bytes.
-        append((ch >> 12) | 0xe0);
-        append(((ch >> 6) & 0x3f) | 0x80);
-        append((ch & 0x3f) | 0x80);
-      }
-    }
-  };
-
   bool compressed = string->IsCompressed();
   size_t utf8_length = 0;
   if (compressed) {
     utf8_length = length;
   } else {
-    visit_chars16([&utf8_length](jbyte c ATTRIBUTE_UNUSED) { ++utf8_length; });
+    const uint16_t* utf16 = string->GetValue() + offset;
+    auto count_length = [&utf8_length](jbyte c ATTRIBUTE_UNUSED) ALWAYS_INLINE { ++utf8_length; };
+    ConvertUtf16ToUtf8</*kUseShortZero=*/ true,
+                       /*kUse4ByteSequence=*/ true,
+                       /*kReplaceBadSurrogates=*/ true>(utf16, length, count_length);
   }
   ObjPtr<mirror::ByteArray> result =
       mirror::ByteArray::Alloc(soa.Self(), dchecked_integral_cast<int32_t>(utf8_length));
@@ -177,8 +145,12 @@
   if (compressed) {
     memcpy(result->GetData(), string->GetValueCompressed() + offset, length);
   } else {
+    const uint16_t* utf16 = string->GetValue() + offset;
     int8_t* data = result->GetData();
-    visit_chars16([&data](jbyte c) { *data++ = c; });
+    auto store_data = [&data](jbyte c) ALWAYS_INLINE { *data++ = c; };
+    ConvertUtf16ToUtf8</*kUseShortZero=*/ true,
+                       /*kUse4ByteSequence=*/ true,
+                       /*kReplaceBadSurrogates=*/ true>(utf16, length, store_data);
   }
   return soa.AddLocalReference<jbyteArray>(result);
 }
commit	09bfdf1700feeedf85c4a53502d3c14b4d3f41fd	[log] [tgz]
author	Vladimir Marko <vmarko@google.com>	Mon Jul 19 12:17:20 2021 +0100
committer	Vladimir Marko <vmarko@google.com>	Thu Jul 22 14:09:53 2021 +0000
tree	bf6c232234d4c41697a78868e51b7dc07eabb011
parent	1b621225a8869f930d530e4855157a444971aca1 [diff]