Merge "Support hints for register pairs."

commit: 8240a8af33aedea9a4fe5c3b394d7c025ad081fb [log] [tgz]
author: Nicolas Geoffray <ngeoffray@google.com> Fri Feb 13 05:44:19 2015 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> Fri Feb 13 05:44:20 2015 +0000
tree: 3683dbd32af6254f18187caebe79b9e24f031f78
parent: 5409701651407747e172d753f3fddeb6eb423927 [diff]
parent: da02afe615191a19eae9a039786c4c4fc20dbfff [diff]
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 06d258d..24d96ba 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk

@@ -144,6 +144,7 @@
   runtime/reference_table_test.cc \
   runtime/thread_pool_test.cc \
   runtime/transaction_test.cc \
+  runtime/utf_test.cc \
   runtime/utils_test.cc \
   runtime/verifier/method_verifier_test.cc \
   runtime/verifier/reg_type_test.cc \

diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 670b76c..c588e1a 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc

@@ -563,6 +563,7 @@
   }
   mirror::String* string = obj->AsString();
   const uint16_t* utf16_string = string->GetCharArray()->GetData() + string->GetOffset();
+  size_t utf16_length = static_cast<size_t>(string->GetLength());
   ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
   ReaderMutexLock mu(Thread::Current(), *class_linker->DexLock());
   size_t dex_cache_count = class_linker->GetDexCacheCount();
@@ -570,10 +571,10 @@
     DexCache* dex_cache = class_linker->GetDexCache(i);
     const DexFile& dex_file = *dex_cache->GetDexFile();
     const DexFile::StringId* string_id;
-    if (UNLIKELY(string->GetLength() == 0)) {
+    if (UNLIKELY(utf16_length == 0)) {
       string_id = dex_file.FindStringId("");
     } else {
-      string_id = dex_file.FindStringId(utf16_string);
+      string_id = dex_file.FindStringId(utf16_string, utf16_length);
     }
     if (string_id != nullptr) {
       // This string occurs in this dex file, assign the dex cache entry.

diff --git a/runtime/check_jni.cc b/runtime/check_jni.cc
index e45d3a3..6ec0949 100644
--- a/runtime/check_jni.cc
+++ b/runtime/check_jni.cc

@@ -1095,6 +1095,8 @@
     return true;
   }
 
+  // Checks whether |bytes| is valid modified UTF-8. We also accept 4 byte UTF
+  // sequences in place of encoded surrogate pairs.
   static uint8_t CheckUtfBytes(const char* bytes, const char** errorKind) {
     while (*bytes != '\0') {
       uint8_t utf8 = *(bytes++);
@@ -1114,14 +1116,26 @@
       case 0x09:
       case 0x0a:
       case 0x0b:
-      case 0x0f:
-        /*
-         * Bit pattern 10xx or 1111, which are illegal start bytes.
-         * Note: 1111 is valid for normal UTF-8, but not the
-         * Modified UTF-8 used here.
-         */
+        // Bit patterns 10xx, which are illegal start bytes.
         *errorKind = "start";
         return utf8;
+      case 0x0f:
+        // Bit pattern 1111, which might be the start of a 4 byte sequence.
+        if ((utf8 & 0x08) == 0) {
+          // Bit pattern 1111 0xxx, which is the start of a 4 byte sequence.
+          // We consume one continuation byte here, and fall through to consume two more.
+          utf8 = *(bytes++);
+          if ((utf8 & 0xc0) != 0x80) {
+            *errorKind = "continuation";
+            return utf8;
+          }
+        } else {
+          *errorKind = "start";
+          return utf8;
+        }
+
+        // Fall through to the cases below to consume two more continuation bytes.
+        FALLTHROUGH_INTENDED;
       case 0x0e:
         // Bit pattern 1110, so there are two additional bytes.
         utf8 = *(bytes++);
@@ -1129,7 +1143,9 @@
           *errorKind = "continuation";
           return utf8;
         }
-        FALLTHROUGH_INTENDED;  // Fall-through to take care of the final byte.
+
+        // Fall through to consume one more continuation byte.
+        FALLTHROUGH_INTENDED;
       case 0x0c:
       case 0x0d:
         // Bit pattern 110x, so there is one additional byte.

diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc
index dc85f6c..94d62db 100644
--- a/runtime/dex_file.cc
+++ b/runtime/dex_file.cc

@@ -578,14 +578,14 @@
   return NULL;
 }
 
-const DexFile::StringId* DexFile::FindStringId(const uint16_t* string) const {
+const DexFile::StringId* DexFile::FindStringId(const uint16_t* string, size_t length) const {
   int32_t lo = 0;
   int32_t hi = NumStringIds() - 1;
   while (hi >= lo) {
     int32_t mid = (hi + lo) / 2;
     const DexFile::StringId& str_id = GetStringId(mid);
     const char* str = GetStringData(str_id);
-    int compare = CompareModifiedUtf8ToUtf16AsCodePointValues(str, string);
+    int compare = CompareModifiedUtf8ToUtf16AsCodePointValues(str, string, length);
     if (compare > 0) {
       lo = mid + 1;
     } else if (compare < 0) {

diff --git a/runtime/dex_file.h b/runtime/dex_file.h
index 9b8f254..e121a08 100644
--- a/runtime/dex_file.h
+++ b/runtime/dex_file.h

@@ -499,7 +499,7 @@
   const StringId* FindStringId(const char* string) const;
 
   // Looks up a string id for a given utf16 string.
-  const StringId* FindStringId(const uint16_t* string) const;
+  const StringId* FindStringId(const uint16_t* string, size_t length) const;
 
   // Returns the number of type identifiers in the .dex file.
   uint32_t NumTypeIds() const {

diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc
index 906aa4c..1048214 100644
--- a/runtime/jni_internal_test.cc
+++ b/runtime/jni_internal_test.cc

@@ -1351,7 +1351,36 @@
   EXPECT_EQ(5, env_->GetStringLength(s));
   EXPECT_EQ(5, env_->GetStringUTFLength(s));
 
-  // TODO: check some non-ASCII strings.
+  // Encoded surrogate pair.
+  s = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80");
+  EXPECT_NE(s, nullptr);
+  EXPECT_EQ(2, env_->GetStringLength(s));
+  // Note that this uses 2 x 3 byte UTF sequences, one
+  // for each half of the surrogate pair.
+  EXPECT_EQ(6, env_->GetStringUTFLength(s));
+  const char* chars = env_->GetStringUTFChars(s, nullptr);
+  EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80", chars);
+  env_->ReleaseStringUTFChars(s, chars);
+
+  // 4 byte UTF sequence appended to an encoded surrogate pair.
+  s = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80 \xf0\x9f\x8f\xa0");
+  EXPECT_NE(s, nullptr);
+  EXPECT_EQ(5, env_->GetStringLength(s));
+  EXPECT_EQ(13, env_->GetStringUTFLength(s));
+  chars = env_->GetStringUTFChars(s, nullptr);
+  // The 4 byte sequence {0xf0, 0x9f, 0x8f, 0xa0} is converted into a surrogate
+  // pair {0xd83c, 0xdfe0} which is then converted into two three byte
+  // sequences {0xed, 0xa0, 0xbc} and {0xed, 0xbf, 0xa0}, one for each half of
+  // the surrogate pair.
+  EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80 \xed\xa0\xbc\xed\xbf\xa0", chars);
+  env_->ReleaseStringUTFChars(s, chars);
+
+  // A string with 1, 2, 3 and 4 byte UTF sequences with spaces
+  // between them
+  s = env_->NewStringUTF("\x24 \xc2\xa2 \xe2\x82\xac \xf0\x9f\x8f\xa0");
+  EXPECT_NE(s, nullptr);
+  EXPECT_EQ(8, env_->GetStringLength(s));
+  EXPECT_EQ(15, env_->GetStringUTFLength(s));
 }
 
 TEST_F(JniInternalTest, NewString) {

diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index fb42d28..9b345a6 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc

@@ -67,7 +67,7 @@
     ASSERT_TRUE(string->Equals(utf8_in) || (expected_utf16_length == 1 && strlen(utf8_in) == 0));
     ASSERT_TRUE(string->Equals(StringPiece(utf8_in)) || (expected_utf16_length == 1 && strlen(utf8_in) == 0));
     for (int32_t i = 0; i < expected_utf16_length; i++) {
-      EXPECT_EQ(utf16_expected[i], string->CharAt(i));
+      EXPECT_EQ(utf16_expected[i], string->UncheckedCharAt(i));
     }
     EXPECT_EQ(expected_hash, string->GetHashCode());
   }
@@ -424,6 +424,12 @@
   AssertString(1, "\xe1\x88\xb4",   "\x12\x34",                 0x1234);
   AssertString(1, "\xef\xbf\xbf",   "\xff\xff",                 0xffff);
   AssertString(3, "h\xe1\x88\xb4i", "\x00\x68\x12\x34\x00\x69", (31 * ((31 * 0x68) + 0x1234)) + 0x69);
+
+  // Test four-byte characters.
+  AssertString(2, "\xf0\x9f\x8f\xa0",  "\xd8\x3c\xdf\xe0", (31 * 0xd83c) + 0xdfe0);
+  AssertString(2, "\xf0\x9f\x9a\x80",  "\xd8\x3d\xde\x80", (31 * 0xd83d) + 0xde80);
+  AssertString(4, "h\xf0\x9f\x9a\x80i", "\x00\x68\xd8\x3d\xde\x80\x00\x69",
+               (31 * (31 * (31 * 0x68 +  0xd83d) + 0xde80) + 0x69));
 }
 
 TEST_F(ObjectTest, StringEqualsUtf8) {

diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index 14d7de2..4a95519 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h

@@ -33,6 +33,10 @@
   return Class::ComputeClassSize(true, vtable_entries, 0, 1, 0, 1, 2);
 }
 
+inline uint16_t String::UncheckedCharAt(int32_t index) {
+  return GetCharArray()->Get(index + GetOffset());
+}
+
 inline CharArray* String::GetCharArray() {
   return GetFieldObject<CharArray>(ValueOffset());
 }
@@ -54,20 +58,6 @@
   return Runtime::Current()->GetInternTable()->InternWeak(this);
 }
 
-inline uint16_t String::CharAt(int32_t index) {
-  // TODO: do we need this? Equals is the only caller, and could
-  // bounds check itself.
-  DCHECK_GE(count_, 0);  // ensures the unsigned comparison is safe.
-  if (UNLIKELY(static_cast<uint32_t>(index) >= static_cast<uint32_t>(count_))) {
-    Thread* self = Thread::Current();
-    ThrowLocation throw_location = self->GetCurrentLocationForThrow();
-    self->ThrowNewExceptionF(throw_location, "Ljava/lang/StringIndexOutOfBoundsException;",
-                             "length=%i; index=%i", count_, index);
-    return 0;
-  }
-  return GetCharArray()->Get(index + GetOffset());
-}
-
 inline int32_t String::GetHashCode() {
   int32_t result = GetField32(OFFSET_OF_OBJECT_MEMBER(String, hash_code_));
   if (UNLIKELY(result == 0)) {

diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index e199d0e..e7c88c5 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc

@@ -147,7 +147,7 @@
     // Note: don't short circuit on hash code as we're presumably here as the
     // hash code was already equal
     for (int32_t i = 0; i < that->GetLength(); ++i) {
-      if (this->CharAt(i) != that->CharAt(i)) {
+      if (this->UncheckedCharAt(i) != that->UncheckedCharAt(i)) {
         return false;
       }
     }
@@ -160,7 +160,7 @@
     return false;
   } else {
     for (int32_t i = 0; i < that_length; ++i) {
-      if (this->CharAt(i) != that_chars[that_offset + i]) {
+      if (this->UncheckedCharAt(i) != that_chars[that_offset + i]) {
         return false;
       }
     }
@@ -169,22 +169,52 @@
 }
 
 bool String::Equals(const char* modified_utf8) {
-  for (int32_t i = 0; i < GetLength(); ++i) {
-    uint16_t ch = GetUtf16FromUtf8(&modified_utf8);
-    if (ch == '\0' || ch != CharAt(i)) {
+  const int32_t length = GetLength();
+  int32_t i = 0;
+  while (i < length) {
+    const uint32_t ch = GetUtf16FromUtf8(&modified_utf8);
+    if (ch == '\0') {
       return false;
     }
+
+    if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i++)) {
+      return false;
+    }
+
+    const uint16_t trailing = GetTrailingUtf16Char(ch);
+    if (trailing != 0) {
+      if (i == length) {
+        return false;
+      }
+
+      if (UncheckedCharAt(i++) != trailing) {
+        return false;
+      }
+    }
   }
   return *modified_utf8 == '\0';
 }
 
 bool String::Equals(const StringPiece& modified_utf8) {
+  const int32_t length = GetLength();
   const char* p = modified_utf8.data();
-  for (int32_t i = 0; i < GetLength(); ++i) {
-    uint16_t ch = GetUtf16FromUtf8(&p);
-    if (ch != CharAt(i)) {
+  for (int32_t i = 0; i < length; ++i) {
+    uint32_t ch = GetUtf16FromUtf8(&p);
+
+    if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i)) {
       return false;
     }
+
+    const uint16_t trailing = GetTrailingUtf16Char(ch);
+    if (trailing != 0) {
+      if (i == (length - 1)) {
+        return false;
+      }
+
+      if (UncheckedCharAt(++i) != trailing) {
+        return false;
+      }
+    }
   }
   return true;
 }

diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 30b8aa3..6c22b9b 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h

@@ -69,8 +69,6 @@
 
   int32_t GetUtfLength() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  uint16_t CharAt(int32_t index) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   String* Intern() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   static String* AllocFromUtf16(Thread* self,
@@ -86,9 +84,14 @@
                                        const char* utf8_data_in)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // TODO: This is only used in the interpreter to compare against
+  // entries from a dex file's constant pool (ArtField names). Should
+  // we unify this with Equals(const StringPiece&)?
   bool Equals(const char* modified_utf8) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  // TODO: do we need this overload? give it a more intention-revealing name.
+  // TODO: This is only used to compare DexCache.location with
+  // a dex_file's location (which is a std::string). Do we really
+  // need this in mirror::String just for that one usage?
   bool Equals(const StringPiece& modified_utf8)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -127,6 +130,9 @@
   static void VisitRoots(RootCallback* callback, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // TODO: Make this private. Outside of String it's only used by ObjectTest at the moment.
+  uint16_t UncheckedCharAt(int32_t index) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
  private:
   void SetHashCode(int32_t new_hash_code) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Hash code is invariant so use non-transactional mode. Also disable check as we may run inside

diff --git a/runtime/utf-inl.h b/runtime/utf-inl.h
index 1373d17..b2d6765 100644
--- a/runtime/utf-inl.h
+++ b/runtime/utf-inl.h

@@ -21,26 +21,57 @@
 
 namespace art {
 
-inline uint16_t GetUtf16FromUtf8(const char** utf8_data_in) {
-  uint8_t one = *(*utf8_data_in)++;
+inline uint16_t GetTrailingUtf16Char(uint32_t maybe_pair) {
+  return static_cast<uint16_t>(maybe_pair >> 16);
+}
+
+inline uint16_t GetLeadingUtf16Char(uint32_t maybe_pair) {
+  return static_cast<uint16_t>(maybe_pair & 0x0000FFFF);
+}
+
+inline uint32_t GetUtf16FromUtf8(const char** utf8_data_in) {
+  const uint8_t one = *(*utf8_data_in)++;
   if ((one & 0x80) == 0) {
     // one-byte encoding
     return one;
   }
-  // two- or three-byte encoding
-  uint8_t two = *(*utf8_data_in)++;
+
+  const uint8_t two = *(*utf8_data_in)++;
   if ((one & 0x20) == 0) {
     // two-byte encoding
     return ((one & 0x1f) << 6) | (two & 0x3f);
   }
-  // three-byte encoding
-  uint8_t three = *(*utf8_data_in)++;
-  return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
+
+  const uint8_t three = *(*utf8_data_in)++;
+  if ((one & 0x10) == 0) {
+    return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
+  }
+
+  // Four byte encodings need special handling. We'll have
+  // to convert them into a surrogate pair.
+  const uint8_t four = *(*utf8_data_in)++;
+
+  // Since this is a 4 byte UTF-8 sequence, it will lie between
+  // U+10000 and U+1FFFFF.
+  //
+  // TODO: What do we do about values in (U+10FFFF, U+1FFFFF) ? The
+  // spec says they're invalid but nobody appears to check for them.
+  const uint32_t code_point = ((one & 0x0f) << 18) | ((two & 0x3f) << 12)
+      | ((three & 0x3f) << 6) | (four & 0x3f);
+
+  uint32_t surrogate_pair = 0;
+  // Step two: Write out the high (leading) surrogate to the bottom 16 bits
+  // of the 32 bit type.
+  surrogate_pair |= ((code_point >> 10) + 0xd7c0) & 0xffff;
+  // Step three: Write out the low (trailing) surrogate to the top 16 bits.
+  surrogate_pair |= ((code_point & 0x03ff) + 0xdc00) << 16;
+
+  return surrogate_pair;
 }
 
 inline int CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(const char* utf8_1,
                                                                    const char* utf8_2) {
-  uint16_t c1, c2;
+  uint32_t c1, c2;
   do {
     c1 = *utf8_1;
     c2 = *utf8_2;
@@ -50,50 +81,17 @@
     } else if (c2 == 0) {
       return 1;
     }
-    // Assume 1-byte value and handle all cases first.
-    utf8_1++;
-    utf8_2++;
-    if ((c1 & 0x80) == 0) {
-      if (c1 == c2) {
-        // Matching 1-byte values.
-        continue;
-      } else {
-        // Non-matching values.
-        if ((c2 & 0x80) == 0) {
-          // 1-byte value, do nothing.
-        } else if ((c2 & 0x20) == 0) {
-          // 2-byte value.
-          c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f);
-        } else {
-          // 3-byte value.
-          c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f);
-        }
-        return static_cast<int>(c1) - static_cast<int>(c2);
-      }
-    }
-    // Non-matching or multi-byte values.
-    if ((c1 & 0x20) == 0) {
-      // 2-byte value.
-      c1 = ((c1 & 0x1f) << 6) | (*utf8_1 & 0x3f);
-      utf8_1++;
-    } else {
-      // 3-byte value.
-      c1 = ((c1 & 0x0f) << 12) | ((utf8_1[0] & 0x3f) << 6) | (utf8_1[1] & 0x3f);
-      utf8_1 += 2;
-    }
-    if ((c2 & 0x80) == 0) {
-      // 1-byte value, do nothing.
-    } else if ((c2 & 0x20) == 0) {
-      // 2-byte value.
-      c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f);
-      utf8_2++;
-    } else {
-      // 3-byte value.
-      c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f);
-      utf8_2 += 2;
-    }
+
+    c1 = GetUtf16FromUtf8(&utf8_1);
+    c2 = GetUtf16FromUtf8(&utf8_2);
   } while (c1 == c2);
-  return static_cast<int>(c1) - static_cast<int>(c2);
+
+  const uint32_t leading_surrogate_diff = GetLeadingUtf16Char(c1) - GetLeadingUtf16Char(c2);
+  if (leading_surrogate_diff != 0) {
+      return static_cast<int>(leading_surrogate_diff);
+  }
+
+  return GetTrailingUtf16Char(c1) - GetTrailingUtf16Char(c2);
 }
 
 }  // namespace art

diff --git a/runtime/utf.cc b/runtime/utf.cc
index 05b847b..39c8d15 100644
--- a/runtime/utf.cc
+++ b/runtime/utf.cc

@@ -38,15 +38,30 @@
       // two-byte encoding
       continue;
     }
-    // three-byte encoding
     utf8++;
+    if ((ic & 0x10) == 0) {
+      // three-byte encoding
+      continue;
+    }
+
+    // four-byte encoding: needs to be converted into a surrogate
+    // pair.
+    utf8++;
+    len++;
   }
   return len;
 }
 
 void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
   while (*utf8_data_in != '\0') {
-    *utf16_data_out++ = GetUtf16FromUtf8(&utf8_data_in);
+    const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
+    const uint16_t leading = GetLeadingUtf16Char(ch);
+    const uint16_t trailing = GetTrailingUtf16Char(ch);
+
+    *utf16_data_out++ = leading;
+    if (trailing != 0) {
+      *utf16_data_out++ = trailing;
+    }
   }
 }
 
@@ -93,19 +108,38 @@
   return static_cast<int32_t>(hash);
 }
 
-int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8_1, const uint16_t* utf8_2) {
+int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
+                                                size_t utf16_length) {
   for (;;) {
-    if (*utf8_1 == '\0') {
-      return (*utf8_2 == '\0') ? 0 : -1;
-    } else if (*utf8_2 == '\0') {
+    if (*utf8 == '\0') {
+      return (utf16_length == 0) ? 0 : -1;
+    } else if (utf16_length == 0) {
       return 1;
     }
 
-    int c1 = GetUtf16FromUtf8(&utf8_1);
-    int c2 = *utf8_2;
+    const uint32_t pair = GetUtf16FromUtf8(&utf8);
 
-    if (c1 != c2) {
-      return c1 > c2 ? 1 : -1;
+    // First compare the leading utf16 char.
+    const uint16_t lhs = GetLeadingUtf16Char(pair);
+    const uint16_t rhs = *utf16++;
+    --utf16_length;
+    if (lhs != rhs) {
+      return lhs > rhs ? 1 : -1;
+    }
+
+    // Then compare the trailing utf16 char. First check if there
+    // are any characters left to consume.
+    const uint16_t lhs2 = GetTrailingUtf16Char(pair);
+    if (lhs2 != 0) {
+      if (utf16_length == 0) {
+        return 1;
+      }
+
+      const uint16_t rhs2 = *utf16++;
+      --utf16_length;
+      if (lhs2 != rhs2) {
+        return lhs2 > rhs2 ? 1 : -1;
+      }
     }
   }
 }

diff --git a/runtime/utf.h b/runtime/utf.h
index b55227b..dd38afa 100644
--- a/runtime/utf.h
+++ b/runtime/utf.h

@@ -59,10 +59,11 @@
                                                                           const char* utf8_2);
 
 /*
- * Compare a modified UTF-8 string with a UTF-16 string as code point values in a non-locale
- * sensitive manner.
+ * Compare a null-terminated modified UTF-8 string with a UTF-16 string (not null-terminated)
+ * as code point values in a non-locale sensitive manner.
  */
-int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8_1, const uint16_t* utf8_2);
+int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
+                                                size_t utf16_length);
 
 /*
  * Convert from UTF-16 to Modified UTF-8. Note that the output is _not_
@@ -84,12 +85,16 @@
 size_t ComputeModifiedUtf8Hash(const char* chars);
 
 /*
- * Retrieve the next UTF-16 character from a UTF-8 string.
+ * Retrieve the next UTF-16 character or surrogate pair from a UTF-8 string.
+ * Single byte, 2-byte and 3-byte UTF-8 sequences result in a single UTF-16
+ * character whereas 4-byte UTF-8 sequences result in a surrogate pair. Use
+ * GetLeadingUtf16Char and GetTrailingUtf16Char to process the return value
+ * of this function.
  *
  * Advances "*utf8_data_in" to the start of the next character.
  *
  * WARNING: If a string is corrupted by dropping a '\0' in the middle
- * of a 3-byte sequence, you can end up overrunning the buffer with
+ * of a multi byte sequence, you can end up overrunning the buffer with
  * reads (and possibly with the writes if the length was computed and
  * cached before the damage). For performance reasons, this function
  * assumes that the string being parsed is known to be valid (e.g., by
@@ -97,7 +102,19 @@
  * out of dex files or other internal translations, so the only real
  * risk comes from the JNI NewStringUTF call.
  */
-uint16_t GetUtf16FromUtf8(const char** utf8_data_in);
+uint32_t GetUtf16FromUtf8(const char** utf8_data_in);
+
+/**
+ * Gets the leading UTF-16 character from a surrogate pair, or the sole
+ * UTF-16 character from the return value of GetUtf16FromUtf8.
+ */
+ALWAYS_INLINE uint16_t GetLeadingUtf16Char(uint32_t maybe_pair);
+
+/**
+ * Gets the trailing UTF-16 character of a surrogate pair from the return value
+ * of GetUtf16FromUtf8, or 0 if that value is not a surrogate pair.
+ */
+ALWAYS_INLINE uint16_t GetTrailingUtf16Char(uint32_t maybe_pair);
 
 }  // namespace art
 

diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc
new file mode 100644
index 0000000..8048bbd
--- /dev/null
+++ b/runtime/utf_test.cc

@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utf.h"
+
+#include "common_runtime_test.h"
+#include "utf-inl.h"
+
+namespace art {
+
+class UtfTest : public CommonRuntimeTest {};
+
+TEST_F(UtfTest, GetLeadingUtf16Char) {
+  EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff));
+}
+
+TEST_F(UtfTest, GetTrailingUtf16Char) {
+  EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee));
+  EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa));
+}
+
+#define EXPECT_ARRAY_POSITION(expected, end, start) \
+  EXPECT_EQ(static_cast<uintptr_t>(expected), \
+            reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start));
+
+// A test string containing one, two, three and four byte UTF-8 sequences.
+static const uint8_t kAllSequences[] = {
+    0x24,
+    0xc2, 0xa2,
+    0xe2, 0x82, 0xac,
+    0xf0, 0x9f, 0x8f, 0xa0,
+    0x00
+};
+
+// A test string that contains a UTF-8 encoding of a surrogate pair
+// (code point = U+10400)
+static const uint8_t kSurrogateEncoding[] = {
+    0xed, 0xa0, 0x81,
+    0xed, 0xb0, 0x80,
+    0x00
+};
+
+TEST_F(UtfTest, GetUtf16FromUtf8) {
+  const char* const start = reinterpret_cast<const char*>(kAllSequences);
+  const char* ptr = start;
+  uint32_t pair = 0;
+
+  // Single byte sequence.
+  pair = GetUtf16FromUtf8(&ptr);
+  EXPECT_EQ(0x24, GetLeadingUtf16Char(pair));
+  EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+  EXPECT_ARRAY_POSITION(1, ptr, start);
+
+  // Two byte sequence
+  pair = GetUtf16FromUtf8(&ptr);
+  EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair));
+  EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+  EXPECT_ARRAY_POSITION(3, ptr, start);
+
+  // Three byte sequence
+  pair = GetUtf16FromUtf8(&ptr);
+  EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair));
+  EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+  EXPECT_ARRAY_POSITION(6, ptr, start);
+
+  // Four byte sequence
+  pair = GetUtf16FromUtf8(&ptr);
+  EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair));
+  EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair));
+  EXPECT_ARRAY_POSITION(10, ptr, start);
+
+  // Null terminator
+  pair = GetUtf16FromUtf8(&ptr);
+  EXPECT_EQ(0, GetLeadingUtf16Char(pair));
+  EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+  EXPECT_ARRAY_POSITION(11, ptr, start);
+}
+
+TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
+  const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding);
+  const char* ptr = start;
+  uint32_t pair = 0;
+
+  pair = GetUtf16FromUtf8(&ptr);
+  EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair));
+  EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+  EXPECT_ARRAY_POSITION(3, ptr, start);
+
+  pair = GetUtf16FromUtf8(&ptr);
+  EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair));
+  EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+  EXPECT_ARRAY_POSITION(6, ptr, start);
+}
+
+TEST_F(UtfTest, CountModifiedUtf8Chars) {
+  EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences)));
+  EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));
+}
+
+}  // namespace art

diff --git a/runtime/utils.cc b/runtime/utils.cc
index af16d7e..3ec9561 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc

@@ -625,7 +625,7 @@
   const char* p = utf;
   size_t char_count = CountModifiedUtf8Chars(p);
   for (size_t i = 0; i < char_count; ++i) {
-    uint16_t ch = GetUtf16FromUtf8(&p);
+    uint32_t ch = GetUtf16FromUtf8(&p);
     if (ch == '\\') {
       result += "\\\\";
     } else if (ch == '\n') {
@@ -634,10 +634,20 @@
       result += "\\r";
     } else if (ch == '\t') {
       result += "\\t";
-    } else if (NeedsEscaping(ch)) {
-      StringAppendF(&result, "\\u%04x", ch);
     } else {
-      result += ch;
+      const uint16_t leading = GetLeadingUtf16Char(ch);
+
+      if (NeedsEscaping(leading)) {
+        StringAppendF(&result, "\\u%04x", leading);
+      } else {
+        result += leading;
+      }
+
+      const uint32_t trailing = GetTrailingUtf16Char(ch);
+      if (trailing != 0) {
+        // All low (trailing) surrogates will need escaping.
+        StringAppendF(&result, "\\u%04x", trailing);
+      }
     }
   }
   result += '"';
@@ -650,7 +660,7 @@
   size_t char_count = CountModifiedUtf8Chars(s.c_str());
   const char* cp = &s[0];
   for (size_t i = 0; i < char_count; ++i) {
-    uint16_t ch = GetUtf16FromUtf8(&cp);
+    uint32_t ch = GetUtf16FromUtf8(&cp);
     if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')) {
       result.push_back(ch);
     } else if (ch == '.' || ch == '/') {
@@ -662,7 +672,13 @@
     } else if (ch == '[') {
       result += "_3";
     } else {
-      StringAppendF(&result, "_0%04x", ch);
+      const uint16_t leading = GetLeadingUtf16Char(ch);
+      const uint32_t trailing = GetTrailingUtf16Char(ch);
+
+      StringAppendF(&result, "_0%04x", leading);
+      if (trailing != 0) {
+        StringAppendF(&result, "_0%04x", trailing);
+      }
     }
   }
   return result;
@@ -757,41 +773,50 @@
    * document.
    */
 
-  uint16_t utf16 = GetUtf16FromUtf8(pUtf8Ptr);
+  const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr);
 
-  // Perform follow-up tests based on the high 8 bits.
-  switch (utf16 >> 8) {
-  case 0x00:
-    // It's only valid if it's above the ISO-8859-1 high space (0xa0).
-    return (utf16 > 0x00a0);
-  case 0xd8:
-  case 0xd9:
-  case 0xda:
-  case 0xdb:
-    // It's a leading surrogate. Check to see that a trailing
-    // surrogate follows.
-    utf16 = GetUtf16FromUtf8(pUtf8Ptr);
-    return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
-  case 0xdc:
-  case 0xdd:
-  case 0xde:
-  case 0xdf:
-    // It's a trailing surrogate, which is not valid at this point.
-    return false;
-  case 0x20:
-  case 0xff:
-    // It's in the range that has spaces, controls, and specials.
-    switch (utf16 & 0xfff8) {
-    case 0x2000:
-    case 0x2008:
-    case 0x2028:
-    case 0xfff0:
-    case 0xfff8:
+  const uint16_t leading = GetLeadingUtf16Char(pair);
+  const uint32_t trailing = GetTrailingUtf16Char(pair);
+
+  if (trailing == 0) {
+    // Perform follow-up tests based on the high 8 bits of the
+    // single UTF-16 character (there is no trailing surrogate).
+    switch (leading >> 8) {
+    case 0x00:
+      // It's only valid if it's above the ISO-8859-1 high space (0xa0).
+      return (leading > 0x00a0);
+    case 0xd8:
+    case 0xd9:
+    case 0xda:
+    case 0xdb:
+      // It looks like a leading surrogate but we didn't find a trailing
+      // surrogate if we're here.
       return false;
+    case 0xdc:
+    case 0xdd:
+    case 0xde:
+    case 0xdf:
+      // It's a trailing surrogate, which is not valid at this point.
+      return false;
+    case 0x20:
+    case 0xff:
+      // It's in the range that has spaces, controls, and specials.
+      switch (leading & 0xfff8) {
+      case 0x2000:
+      case 0x2008:
+      case 0x2028:
+      case 0xfff0:
+      case 0xfff8:
+        return false;
+      }
+      break;
     }
-    break;
+
+    return true;
   }
-  return true;
+
+  // We have a surrogate pair. Check that trailing surrogate is well formed.
+  return (trailing >= 0xdc00 && trailing <= 0xdfff);
 }
 
 /* Return whether the pointed-at modified-UTF-8 encoded character is
commit	8240a8af33aedea9a4fe5c3b394d7c025ad081fb	[log] [tgz]
author	Nicolas Geoffray <ngeoffray@google.com>	Fri Feb 13 05:44:19 2015 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	Fri Feb 13 05:44:20 2015 +0000
tree	3683dbd32af6254f18187caebe79b9e24f031f78
parent	5409701651407747e172d753f3fddeb6eb423927 [diff]
parent	da02afe615191a19eae9a039786c4c4fc20dbfff [diff]