Flesh out AllocString

Change-Id: Ie8c1170e71374942eafdcb40775ca2df3cf7bbc7
diff --git a/src/class_linker.cc b/src/class_linker.cc
index 92bb6d4..0a0bb04 100644
--- a/src/class_linker.cc
+++ b/src/class_linker.cc
@@ -137,6 +137,14 @@
                                               sizeof(Method)));
 }
 
+// Allocates a String from Modified UTF-8 using java_lang_String_ and char_array_class_.
+String* ClassLinker::AllocStringFromModifiedUtf8(int32_t utf16_length,
+                                                 const char* utf8_data_in) {
+  String* result = String::AllocFromModifiedUtf8(
+      java_lang_String_, char_array_class_, utf16_length, utf8_data_in);
+  return result;
+}
+
 Class* ClassLinker::FindClass(const StringPiece& descriptor,
                               Object* class_loader,
                               const DexFile* dex_file) {
@@ -1497,10 +1505,9 @@
                                    uint32_t string_idx) {
   const DexFile* dex_file = FindDexFile(referring->GetDexCache());
   const DexFile::StringId& string_id = dex_file->GetStringId(string_idx);
-  const char* string_data = dex_file->GetStringData(string_id);
-  String* new_string = String::AllocFromModifiedUtf8(java_lang_String_,
-                                                     char_array_class_,
-                                                     string_data);
+  int32_t utf16_length = dex_file->GetStringLength(string_id);
+  const char* utf8_data = dex_file->GetStringData(string_id);
+  String* new_string = AllocStringFromModifiedUtf8(utf16_length, utf8_data);
   // TODO: intern the new string
   referring->GetDexCache()->SetResolvedString(string_idx, new_string);
   return new_string;
diff --git a/src/class_linker.h b/src/class_linker.h
index 738979b..46d4b1f 100644
--- a/src/class_linker.h
+++ b/src/class_linker.h
@@ -23,12 +23,15 @@
 
   ~ClassLinker() {}
 
+  // Alloc* convenience functions to avoid needing to pass in Class*
+  // values that are known to the ClassLinker such as
+  // object_array_class_ and java_lang_String_ etc.
   DexCache* AllocDexCache();
   Class* AllocClass(DexCache* dex_cache);
   StaticField* AllocStaticField();
   InstanceField* AllocInstanceField();
   Method* AllocMethod();
-
+  String* AllocStringFromModifiedUtf8(int32_t utf16_length, const char* utf8_data_in);
   template <class T>
   ObjectArray<T>* AllocObjectArray(size_t length) {
     return ObjectArray<T>::Alloc(object_array_class_, length);
diff --git a/src/dex_file.h b/src/dex_file.h
index 63fefae..c8f8afa 100644
--- a/src/dex_file.h
+++ b/src/dex_file.h
@@ -377,7 +377,7 @@
     const byte* ptr = base_ + string_id.string_data_off_;
     // Skip the uleb128 length.
     while (*(ptr++) > 0x7f) /* empty */ ;
-    return (const char*) ptr;
+    return reinterpret_cast<const char*>(ptr);
   }
 
   // return the UTF-8 encoded string with the specified string_id index
diff --git a/src/object.h b/src/object.h
index 77eb164..0abf444 100644
--- a/src/object.h
+++ b/src/object.h
@@ -607,12 +607,12 @@
   }
 
   T* Get(uint32_t i) const {
-    DCHECK_LT(i, GetLength());
+    CHECK_LT(i, GetLength());  // i must be below the length; presumably enforced in all builds now (was DCHECK) - confirm
     Object* const * data = reinterpret_cast<Object* const *>(GetData());
     return down_cast<T*>(data[i]);
   }
   void Set(uint32_t i, T* object) {
-    DCHECK_LT(i, GetLength());
+    CHECK_LT(i, GetLength());  // i must be below the length; presumably enforced in all builds now (was DCHECK) - confirm
     T** data = reinterpret_cast<T**>(GetData());
     data[i] = object;
   }
@@ -992,24 +992,53 @@
                                               length,
                                               sizeof(uint16_t)));
   }
+
+  uint16_t* GetChars() {  // mutable view of the array data as 16-bit chars
+    return reinterpret_cast<uint16_t*>(GetData());
+  }
+
+  const uint16_t* GetChars() const {  // read-only view of the same data
+    return reinterpret_cast<const uint16_t*>(GetData());
+  }
+
+  uint16_t GetChar(uint32_t i) const {  // returns the 16-bit char at index i
+    CHECK_LT(i, GetLength());  // i must be below the array length
+    return GetChars()[i];
+  }
+
+  void SetChar(uint32_t i, uint16_t ch) {  // stores ch at index i
+    CHECK_LT(i, GetLength());  // i must be below the array length
+    GetChars()[i] = ch;
+  }
+
  private:
   CharArray();
 };
 
 class String : public Object {
  public:
-  static String* Alloc(Class* java_lang_String) {
-    return down_cast<String*>(Object::Alloc(java_lang_String));
+  // Allocates a String holding a copy of the utf16_length code units at
+  // utf16_data_in (read only, never modified) and stores their hash in hash_code_.
+  static String* AllocFromUtf16(Class* java_lang_String, Class* char_array,
+                                int32_t utf16_length, const uint16_t* utf16_data_in) {
+    String* string = Alloc(java_lang_String, char_array, utf16_length);
+    uint16_t* utf16_data_out = string->array_->GetChars();
+    // TODO: replace this element-wise loop with a bulk 16-bit copy (memcpy-style).
+    for (int32_t i = 0; i < utf16_length; i++) {
+      utf16_data_out[i] = utf16_data_in[i];
+    }
+    string->hash_code_ = ComputeUtf16Hash(utf16_data_out, utf16_length);
+    return string;
   }
 
   static String* AllocFromModifiedUtf8(Class* java_lang_String,
                                        Class* char_array,
-                                       const char* data) {
-    String* string = Alloc(java_lang_String);
-    uint32_t count = strlen(data);  // TODO
-    CharArray* array = CharArray::Alloc(char_array, count);
-    string->array_ = array;
-    string->count_ = count;
+                                       int32_t utf16_length,
+                                       const char* utf8_data_in) {
+    String* string = Alloc(java_lang_String, char_array, utf16_length);  // char array sized by utf16_length
+    uint16_t* utf16_data_out = string->array_->GetChars();
+    ConvertModifiedUtf8ToUtf16(utf16_data_out, utf8_data_in);  // decodes up to the '\0'; utf16_length is not re-checked
+    string->hash_code_ = ComputeUtf16Hash(utf16_data_out, utf16_length);  // hash covers utf16_length chars
     return string;
   }
 
@@ -1022,6 +1051,65 @@
 
   uint32_t count_;
 
+  // Allocates a String object plus a CharArray of utf16_length chars and links
+  // them through array_ and count_; hash_code_ is left for the caller to set.
+  static String* Alloc(Class* java_lang_String, Class* char_array,
+                       int32_t utf16_length) {
+    String* result = down_cast<String*>(Object::Alloc(java_lang_String));
+    result->array_ = CharArray::Alloc(char_array, utf16_length);
+    result->count_ = utf16_length;
+    return result;
+  }
+
+  // Decodes NUL-terminated Modified UTF-8 into utf16_data_out, which must be
+  // large enough. See http://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
+  static void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
+    for (const char* cursor = utf8_data_in; *cursor != '\0'; ) {
+      *utf16_data_out++ = GetUtf16FromUtf8(&cursor);
+    }
+  }
+
+  // Decode the next UTF-16 code unit from a Modified UTF-8 string.
+  //
+  // Advances "*utf8_data_in" past the one, two or three bytes consumed.
+  //
+  // WARNING: If a string is corrupted by dropping a '\0' in the middle
+  // of a 3-byte sequence, you can end up overrunning the buffer with
+  // reads (and possibly with the writes if the length was computed and
+  // cached before the damage). For performance reasons, this function
+  // assumes that the string being parsed is known to be valid (e.g., by
+  // already being verified). Most strings we process here are coming
+  // out of dex files or other internal translations, so the only real
+  // risk comes from the JNI NewStringUTF call.
+  static uint16_t GetUtf16FromUtf8(const char** utf8_data_in) {
+    uint8_t one = *(*utf8_data_in)++;
+    if ((one & 0x80) == 0) {
+      /* one-byte encoding: high bit clear, the byte itself is the result */
+      return one;
+    }
+    /* two- or three-byte encoding: a second byte is always consumed */
+    uint8_t two = *(*utf8_data_in)++;
+    if ((one & 0x20) == 0) {
+      /* two-byte encoding: 5 payload bits from "one", 6 from "two" */
+      return ((one & 0x1f) << 6) |
+              (two & 0x3f);
+    }
+    /* three-byte encoding (any lead byte with bit 0x20 set): 4 + 6 + 6 payload bits */
+    uint8_t three = *(*utf8_data_in)++;
+    return ((one & 0x0f) << 12) |
+            ((two & 0x3f) << 6) |
+            (three & 0x3f);
+  }
+
+  // The java/lang/String.computeHashCode() algorithm: hash = hash * 31 + ch per char.
+  static uint32_t ComputeUtf16Hash(const uint16_t* string_data, size_t string_length) {
+    uint32_t hash = 0;
+    for (size_t i = 0; i < string_length; ++i) {
+      hash = hash * 31 + string_data[i];
+    }
+    return hash;
+  }
+
  private:
   String();
 };
diff --git a/src/object_test.cc b/src/object_test.cc
index dc74b8b..969378c 100644
--- a/src/object_test.cc
+++ b/src/object_test.cc
@@ -8,12 +8,35 @@
 #include "object.h"
 #include "scoped_ptr.h"
 
+#include <stdint.h>
 #include <stdio.h>
 #include "gtest/gtest.h"
 
 namespace art {
 
-class ObjectTest : public RuntimeTest {};
+class ObjectTest : public RuntimeTest {
+ protected:
+  // Allocates a String of "length" chars from utf8_in via class_linker_ and checks
+  // its count, chars and hash. utf16_expected_le holds two bytes per expected
+  // char, most significant byte first (despite the _le suffix).
+  void AssertString(size_t length,
+                    const char* utf8_in,
+                    const char* utf16_expected_le,
+                    uint32_t hash_expected) {
+    String* string = class_linker_->AllocStringFromModifiedUtf8(length, utf8_in);
+    ASSERT_EQ(length, string->count_);
+    ASSERT_TRUE(string->array_ != NULL);
+    ASSERT_TRUE(string->array_->GetChars() != NULL);
+    for (size_t i = 0; i < length; i++) {
+      // Decode each expected char here instead of into a stack array sized by
+      // "length": variable-length arrays are not standard C++ and length may be 0.
+      uint16_t expected = (((utf16_expected_le[i*2 + 0] & 0xff) << 8) |
+                           ((utf16_expected_le[i*2 + 1] & 0xff) << 0));
+      EXPECT_EQ(expected, string->array_->GetChar(i));
+    }
+    EXPECT_EQ(hash_expected, string->hash_code_);
+  }
+};
 
 TEST_F(ObjectTest, IsInSamePackage) {
   // Matches
@@ -42,4 +65,26 @@
     EXPECT_TRUE(oa->Get(1) == oa);
 }
 
+TEST_F(ObjectTest, String) {
+  // Test the empty string.
+  AssertString(0, "",     "", 0);
+
+  // Test one-byte characters.
+  AssertString(1, " ",    "\x00\x20",         0x20);
+  AssertString(1, "\x7f", "\x00\x7f",         0x7f);
+  AssertString(2, "hi",   "\x00\x68\x00\x69", (31 * 0x68) + 0x69);
+
+  // Test two-byte characters; Modified UTF-8 encodes U+0000 as 0xc0 0x80.
+  AssertString(1, "\xc0\x80",   "\x00\x00",                 0);
+  AssertString(1, "\xc2\x80",   "\x00\x80",                 0x80);
+  AssertString(1, "\xd9\xa6",   "\x06\x66",                 0x0666);
+  AssertString(1, "\xdf\xbf",   "\x07\xff",                 0x07ff);
+  AssertString(3, "h\xd9\xa6i", "\x00\x68\x06\x66\x00\x69", (31 * ((31 * 0x68) + 0x0666)) + 0x69);
+
+  // Test three-byte characters.
+  AssertString(1, "\xe0\xa0\x80",   "\x08\x00",                 0x0800);
+  AssertString(1, "\xe1\x88\xb4",   "\x12\x34",                 0x1234);
+  AssertString(1, "\xef\xbf\xbf",   "\xff\xff",                 0xffff);
+  AssertString(3, "h\xe1\x88\xb4i", "\x00\x68\x12\x34\x00\x69", (31 * ((31 * 0x68) + 0x1234)) + 0x69);
+}
 }  // namespace art