AArch64: Add memcmp16() for Arm64; ensure xSELF not clobbered

This patch replaces the Arm64 memcmp() assembly with a memcmp16()
implementation. Note that this implementation of memcmp16() is based
on bionic's memcmp().

However, to reflect a recent specification change, the implementation
has been modified to respect the new String.compareTo() behavior.
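
For reference, the semantics __memcmp16 must now provide match the
generic comparison used by the new test. A minimal sketch (the function
name is illustrative; see memcmp16_compare() in memcmp16_test.cc below):

    #include <cstddef>   // size_t
    #include <cstdint>   // uint16_t, int32_t

    // Compare 'count' 16-bit units; return the difference between the
    // first mismatching pair, or 0 if all 'count' units are equal.
    int32_t MemCmp16Sketch(const uint16_t* s0, const uint16_t* s1, size_t count) {
      for (size_t i = 0; i < count; i++) {
        if (s0[i] != s1[i]) {
          return static_cast<int32_t>(s0[i]) - static_cast<int32_t>(s1[i]);
        }
      }
      return 0;
    }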

A test for memcmp16() has been added. The string_compareto test in
stub_test has been changed to invoke __memcmp16 via the assembly stubs.
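
The heart of the new test's check, in outline (names as in
memcmp16_test.cc in the diff below):

    int32_t expected = memcmp16_compare(s1, s2, count);  // Reference implementation.
    int32_t computed = MemCmp16(s1, s2, count);          // Entrypoint under test.
    ASSERT_EQ(expected, computed);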

Add artIsAssignableFromCode to the list of native downcalls so that
x18 (xSELF) is stored before the call into C code and reloaded
afterwards. Remove CheckSuspendFromCode, as it is unused.
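
For context, the Arm64 entrypoint table now points at the new stub
rather than at the C function directly (from entrypoints_init_arm64.cc
in the diff below):

    // The stub saves and reloads xSELF (x18) around the call into C code,
    // which is free to clobber the AAPCS64 platform register.
    qpoints->pInstanceofNonTrivial = art_quick_assignable_from_code;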

Signed-off-by: Serban Constantinescu <serban.constantinescu@arm.com>
Change-Id: Ie0b5425ecfb62906d29a5d02e84c7e07ffb34a11
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 45bdbcb..28a91c1 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -66,6 +66,7 @@
 
 RUNTIME_GTEST_COMMON_SRC_FILES := \
   runtime/arch/arch_test.cc \
+  runtime/arch/memcmp16_test.cc \
   runtime/arch/stub_test.cc \
   runtime/barrier_test.cc \
   runtime/base/bit_field_test.cc \
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index 0b7272c..254faac 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -180,7 +180,7 @@
   EXPECT_EQ(80U, sizeof(OatHeader));
   EXPECT_EQ(8U, sizeof(OatMethodOffsets));
   EXPECT_EQ(24U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(78 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
+  EXPECT_EQ(77 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
 }
 
 TEST_F(OatTest, OatHeaderIsValid) {
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 3ce053c..7f5cf0c 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -223,6 +223,7 @@
 	arch/arm64/context_arm64.cc \
 	arch/arm64/entrypoints_init_arm64.cc \
 	arch/arm64/jni_entrypoints_arm64.S \
+	arch/arm64/memcmp16_arm64.S \
 	arch/arm64/portable_entrypoints_arm64.S \
 	arch/arm64/quick_entrypoints_arm64.S \
 	arch/arm64/thread_arm64.cc \
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index ebceb63..3fa09cb 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -46,9 +46,6 @@
 extern "C" void* art_quick_initialize_type_and_verify_access(uint32_t, void*);
 extern "C" void* art_quick_resolve_string(void*, uint32_t);
 
-// Exception entrypoints.
-extern "C" void* GetAndClearException(Thread*);
-
 // Field entrypoints.
 extern "C" int art_quick_set32_instance(uint32_t, void*, int32_t);
 extern "C" int art_quick_set32_static(uint32_t, int32_t);
@@ -116,7 +113,6 @@
 extern "C" void art_quick_invoke_virtual_trampoline_with_access_check(uint32_t, void*);
 
 // Thread entrypoints.
-extern void CheckSuspendFromCode(Thread* thread);
 extern "C" void art_quick_test_suspend();
 
 // Throw entrypoints.
@@ -226,7 +222,6 @@
   qpoints->pInvokeVirtualTrampolineWithAccessCheck = art_quick_invoke_virtual_trampoline_with_access_check;
 
   // Thread
-  qpoints->pCheckSuspend = CheckSuspendFromCode;
   qpoints->pTestSuspend = art_quick_test_suspend;
 
   // Throws
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index cbb2c27..c19b79e 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -35,7 +35,7 @@
 extern "C" void art_portable_to_interpreter_bridge(mirror::ArtMethod*);
 
 // Cast entrypoints.
-extern "C" uint32_t artIsAssignableFromCode(const mirror::Class* klass,
+extern "C" uint32_t art_quick_assignable_from_code(const mirror::Class* klass,
                                             const mirror::Class* ref_class);
 extern "C" void art_quick_check_cast(void*, void*);
 
@@ -45,9 +45,6 @@
 extern "C" void* art_quick_initialize_type_and_verify_access(uint32_t, void*);
 extern "C" void* art_quick_resolve_string(void*, uint32_t);
 
-// Exception entrypoints.
-extern "C" void* GetAndClearException(Thread*);
-
 // Field entrypoints.
 extern "C" int art_quick_set32_instance(uint32_t, void*, int32_t);
 extern "C" int art_quick_set32_static(uint32_t, int32_t);
@@ -96,7 +93,6 @@
 extern "C" void art_quick_invoke_virtual_trampoline_with_access_check(uint32_t, void*);
 
 // Thread entrypoints.
-extern void CheckSuspendFromCode(Thread* thread);
 extern "C" void art_quick_test_suspend();
 
 // Throw entrypoints.
@@ -129,7 +125,7 @@
   ResetQuickAllocEntryPoints(qpoints);
 
   // Cast
-  qpoints->pInstanceofNonTrivial = artIsAssignableFromCode;
+  qpoints->pInstanceofNonTrivial = art_quick_assignable_from_code;
   qpoints->pCheckCast = art_quick_check_cast;
 
   // DexCache
@@ -209,7 +205,6 @@
   qpoints->pInvokeVirtualTrampolineWithAccessCheck = art_quick_invoke_virtual_trampoline_with_access_check;
 
   // Thread
-  qpoints->pCheckSuspend = CheckSuspendFromCode;
   qpoints->pTestSuspend = art_quick_test_suspend;
 
   // Throws
diff --git a/runtime/arch/arm64/memcmp.S b/runtime/arch/arm64/memcmp.S
deleted file mode 100644
index d73fb67..0000000
--- a/runtime/arch/arm64/memcmp.S
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- */
-
-#include <private/bionic_asm.h>
-
-/* Parameters and result.  */
-#define src1		x0
-#define src2		x1
-#define limit		x2
-#define result		x0
-
-/* Internal variables.  */
-#define data1		x3
-#define data1w		w3
-#define data2		x4
-#define data2w		w4
-#define has_nul		x5
-#define diff		x6
-#define endloop		x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define pos		x11
-#define limit_wd	x12
-#define mask		x13
-
-ENTRY(memcmp)
-	cbz	limit, .Lret0
-	eor	tmp1, src1, src2
-	tst	tmp1, #7
-	b.ne	.Lmisaligned8
-	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
-	add	limit_wd, limit, #7
-	lsr	limit_wd, limit_wd, #3
-	/* Start of performance-critical section  -- one 64B cache line.  */
-.Lloop_aligned:
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-.Lstart_realigned:
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
-	cbz	endloop, .Lloop_aligned
-	/* End of performance-critical section  -- one 64B cache line.  */
-
-	/* Not reached the limit, must have found a diff.  */
-	cbnz	limit_wd, .Lnot_limit
-
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	.Lnot_limit
-
-	lsl	limit, limit, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-#ifdef __AARCH64EB__
-	lsr	mask, mask, limit
-#else
-	lsl	mask, mask, limit
-#endif
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	orr	diff, diff, mask
-.Lnot_limit:
-
-#ifndef	__AARCH64EB__
-	rev	diff, diff
-	rev	data1, data1
-	rev	data2, data2
-#endif
-	/* The MS-non-zero bit of DIFF marks either the first bit
-	   that is different, or the end of the significant data.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	clz	pos, diff
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	ret
-
-.Lmutual_align:
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	ldr	data1, [src1], #8
-	neg	tmp1, tmp1		/* Bits to alignment -64.  */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	add	limit_wd, limit, #7
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	lsr	limit_wd, limit_wd, #3
-	b	.Lstart_realigned
-
-.Lret0:
-	mov	result, #0
-	ret
-
-	.p2align 6
-.Lmisaligned8:
-	sub	limit, limit, #1
-1:
-	/* Perhaps we can do better than this.  */
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
-	sub	result, data1, data2
-	ret
-END(memcmp)
diff --git a/runtime/arch/arm64/memcmp16_arm64.S b/runtime/arch/arm64/memcmp16_arm64.S
new file mode 100644
index 0000000..582940a
--- /dev/null
+++ b/runtime/arch/arm64/memcmp16_arm64.S
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#ifndef ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_
+#define ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_
+
+#include "asm_support_arm64.S"
+
+/* Parameters and result.  */
+#define src1        x0
+#define src2        x1
+#define limit       x2
+#define result      x0
+
+/* Internal variables.  */
+#define data1       x3
+#define data1w      w3
+#define data2       x4
+#define data2w      w4
+#define has_nul     x5
+#define diff        x6
+#define endloop     x7
+#define tmp1        x8
+#define tmp2        x9
+#define tmp3        x10
+#define limit_wd    x12
+#define mask        x13
+
+// WARNING: If you change this code to use x14 and x15, you must also change
+//          art_quick_string_compareto, which relies on these temps being unused.
+
+ENTRY __memcmp16
+  cbz     limit, .Lret0
+  lsl     limit, limit, #1  /* Half-words to bytes.  */
+  eor     tmp1, src1, src2
+  tst     tmp1, #7
+  b.ne    .Lmisaligned8
+  ands    tmp1, src1, #7
+  b.ne    .Lmutual_align
+  add     limit_wd, limit, #7
+  lsr     limit_wd, limit_wd, #3
+  /* Start of performance-critical section  -- one 64B cache line.  */
+.Lloop_aligned:
+  ldr     data1, [src1], #8
+  ldr     data2, [src2], #8
+.Lstart_realigned:
+  subs    limit_wd, limit_wd, #1
+  eor     diff, data1, data2  /* Non-zero if differences found.  */
+  csinv   endloop, diff, xzr, ne  /* Last Dword or differences.  */
+  cbz     endloop, .Lloop_aligned
+  /* End of performance-critical section  -- one 64B cache line.  */
+
+  /* Not reached the limit, must have found a diff.  */
+  cbnz    limit_wd, .Lnot_limit
+
+  /* Limit % 8 == 0 => all bytes significant.  */
+  ands    limit, limit, #7
+  b.eq    .Lnot_limit
+
+  lsl     limit, limit, #3  /* Bytes -> bits.  */
+  mov     mask, #~0
+  lsl     mask, mask, limit
+  bic     data1, data1, mask
+  bic     data2, data2, mask
+
+.Lnot_limit:
+
+  // Byte-swap diff. An exact reverse is not needed; we only have to locate the differing
+  // half-word.
+  rev     diff, diff
+  // The most significant set bit of DIFF marks the least significant differing bit of DATA1/2.
+  clz     diff, diff
+  // Round the bit index down to a half-word boundary by clearing the low four bits.
+  bfi     diff, xzr, #0, #4
+  // Create a 16b mask
+  mov     mask, #0xFFFF
+  // Shift to the right half-word.
+  lsr     data1, data1, diff
+  lsr     data2, data2, diff
+  // Mask the lowest half-word.
+  and     data1, data1, mask
+  and     data2, data2, mask
+  // Compute difference.
+  sub     result, data1, data2
+  ret
+
+.Lmutual_align:
+  /* Sources are mutually aligned, but are not currently at an
+     alignment boundary.  Round down the addresses and then mask off
+     the bytes that precede the start point.  */
+  bic     src1, src1, #7
+  bic     src2, src2, #7
+  add     limit, limit, tmp1  /* Adjust the limit for the extra.  */
+  lsl     tmp1, tmp1, #3    /* Bytes beyond alignment -> bits.  */
+  ldr     data1, [src1], #8
+  neg     tmp1, tmp1    /* Bits to alignment -64.  */
+  ldr     data2, [src2], #8
+  mov     tmp2, #~0
+  /* Little-endian.  Early bytes are at LSB.  */
+  lsr     tmp2, tmp2, tmp1  /* Shift (tmp1 & 63).  */
+  add     limit_wd, limit, #7
+  orr     data1, data1, tmp2
+  orr     data2, data2, tmp2
+  lsr     limit_wd, limit_wd, #3
+  b       .Lstart_realigned
+
+.Lret0:
+  mov     result, #0
+  ret
+
+  .p2align 6
+.Lmisaligned8:
+  sub     limit, limit, #1
+1:
+  /* Perhaps we can do better than this.  */
+  ldrh    data1w, [src1], #2
+  ldrh    data2w, [src2], #2
+  subs    limit, limit, #2
+  ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
+  b.eq    1b
+  sub     result, data1, data2
+  ret
+END __memcmp16
+
+#endif  // ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 2e60b93..e088751 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1632,6 +1632,8 @@
     ldr   x0, [sp], 16        // Restore integer result, and drop stack area.
     .cfi_adjust_cfa_offset 16
 
+    // Restore xSELF (x18) before popping the frame, as the callee may have clobbered it.
+    ldr   xSELF, [sp, #72]
     POP_REF_ONLY_CALLEE_SAVE_FRAME
 
     br    x9                  // Tail-call out.
@@ -1647,6 +1649,7 @@
     mov    x0, xSELF          // Pass thread.
     mov    x1, sp             // Pass SP.
     bl     artDeoptimize      // artDeoptimize(Thread*, SP)
+    brk 0                    // Unreachable: artDeoptimize does not return.
 END art_quick_deoptimize
 
 
@@ -1757,7 +1760,7 @@
      *    x1:   comp object pointer
      *
      */
-    .extern memcmp16_generic_static
+    .extern __memcmp16
 ENTRY art_quick_string_compareto
     mov    x2, x0         // x0 is return, use x2 for first input.
     sub    x0, x2, x1     // Same string object?
@@ -1850,16 +1853,17 @@
     ret
 
 .Ldo_memcmp16:
-    str x0, [sp,#-16]!           // Save x0
+    mov x14, x0                  // Save x0 and LR. __memcmp16 does not use these temps.
+    mov x15, xLR                 //                 TODO: Codify and check that?
 
     mov x0, x2
     uxtw x2, w3
-    bl memcmp16_generic_static
+    bl __memcmp16
 
-    ldr x1, [sp], #16            // Restore old x0 = length diff
+    mov xLR, x15                 // Restore LR.
 
-    cmp x0, #0                   // Check the memcmp difference
-    csel x0, x0, x1, ne          // x0 := x0 != 0 ? x0 : x1
+    cmp x0, #0                   // Check the memcmp difference.
+    csel x0, x0, x14, ne         // x0 := (x0 != 0) ? x0 : x14 (the saved length diff).
     ret
 END art_quick_string_compareto
 
@@ -1869,11 +1873,9 @@
 .macro NATIVE_DOWNCALL name, entrypoint
     .extern \entrypoint
 ENTRY \name
-    sub    sp, sp, #16
-    stp    xSELF, xLR, [sp]
+    stp    xSELF, xLR, [sp, #-16]!  // Save xSELF (x18) and LR; C code may clobber x18.
     bl     \entrypoint
-    ldp    xSELF, xLR, [sp]
-    add    sp, sp, #16
+    ldp    xSELF, xLR, [sp], #16    // Restore xSELF and LR.
     ret
 END \name
 .endm
@@ -1881,3 +1883,4 @@
 NATIVE_DOWNCALL art_quick_fmod fmod
 NATIVE_DOWNCALL art_quick_fmodf fmodf
 NATIVE_DOWNCALL art_quick_memcpy memcpy
+NATIVE_DOWNCALL art_quick_assignable_from_code artIsAssignableFromCode
diff --git a/runtime/arch/memcmp16.h b/runtime/arch/memcmp16.h
index ad58588..1144c8c 100644
--- a/runtime/arch/memcmp16.h
+++ b/runtime/arch/memcmp16.h
@@ -30,7 +30,7 @@
 //
 // In both cases, MemCmp16 is declared.
 
-#if defined(__arm__) || defined(__mips)
+#if defined(__aarch64__) || defined(__arm__) || defined(__mips)
 
 extern "C" uint32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count);
 #define MemCmp16 __memcmp16
diff --git a/runtime/arch/memcmp16_test.cc b/runtime/arch/memcmp16_test.cc
new file mode 100644
index 0000000..5747c67e
--- /dev/null
+++ b/runtime/arch/memcmp16_test.cc
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gtest/gtest.h"
+#include "memcmp16.h"
+
+class RandGen {
+ public:
+  explicit RandGen(uint32_t seed) : val_(seed) {}
+
+  uint32_t next() {
+    val_ = val_ * 48271 % 2147483647 + 13;
+    return val_;
+  }
+
+  uint32_t val_;
+};
+
+class MemCmp16Test : public testing::Test {
+};
+
+// A simple implementation to compare against.
+// Note: this version is equivalent to the generic one used when no optimized version is available.
+int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count) {
+  for (size_t i = 0; i < count; i++) {
+    if (s0[i] != s1[i]) {
+      return static_cast<int32_t>(s0[i]) - static_cast<int32_t>(s1[i]);
+    }
+  }
+  return 0;
+}
+
+static constexpr size_t kMemCmp16Rounds = 100000;
+
+static void CheckSeparate(size_t max_length, size_t min_length) {
+  RandGen r(0x1234);
+  size_t range_of_tests = 7;  // All seven test types (0-3 weighted) active in the beginning.
+
+  for (size_t round = 0; round < kMemCmp16Rounds; ++round) {
+    size_t type = r.next() % range_of_tests;
+    size_t count1, count2;
+    uint16_t *s1, *s2;  // Raw pointers, so the zero-length cases can carry poison addresses.
+
+    switch (type) {
+      case 0:  // random, non-zero lengths of both strings
+      case 1:
+      case 2:
+      case 3:
+        count1 = (r.next() % max_length) + min_length;
+        count2 = (r.next() % max_length) + min_length;
+        break;
+
+      case 4:  // random non-zero length of first, second is zero
+        count1 = (r.next() % max_length) + min_length;
+        count2 = 0U;
+        break;
+
+      case 5:  // random non-zero length of second, first is zero
+        count1 = 0U;
+        count2 = (r.next() % max_length) + min_length;
+        break;
+
+      case 6:  // both zero-length
+        count1 = 0U;
+        count2 = 0U;
+        range_of_tests = 6;  // Don't do zero-zero again.
+        break;
+
+      default:
+        ASSERT_TRUE(false) << "Should not get here.";
+        continue;
+    }
+
+    if (count1 > 0U) {
+      s1 = new uint16_t[count1];
+    } else {
+      // Leave a poison pointer; a zero-length compare must not touch it.
+      s1 = reinterpret_cast<uint16_t*>(0xebad1001);
+    }
+
+    if (count2 > 0U) {
+      s2 = new uint16_t[count2];
+    } else {
+      // Leave a poison pointer; a zero-length compare must not touch it.
+      s2 = reinterpret_cast<uint16_t*>(0xebad2002);
+    }
+
+    size_t min = count1 < count2 ? count1 : count2;
+    bool fill_same = r.next() % 2 == 1;  // Half the time, give both buffers an equal prefix.
+
+    if (fill_same) {
+      for (size_t i = 0; i < min; ++i) {
+        s1[i] = static_cast<uint16_t>(r.next() & 0xFFFF);
+        s2[i] = s1[i];
+      }
+      for (size_t i = min; i < count1; ++i) {
+        s1[i] = static_cast<uint16_t>(r.next() & 0xFFFF);
+      }
+      for (size_t i = min; i < count2; ++i) {
+        s2[i] = static_cast<uint16_t>(r.next() & 0xFFFF);
+      }
+    } else {
+      for (size_t i = 0; i < count1; ++i) {
+        s1[i] = static_cast<uint16_t>(r.next() & 0xFFFF);
+      }
+      for (size_t i = 0; i < count2; ++i) {
+        s2[i] = static_cast<uint16_t>(r.next() & 0xFFFF);
+      }
+    }
+
+    uint16_t* s1_pot_unaligned = s1;
+    uint16_t* s2_pot_unaligned = s2;
+    size_t c1_mod = count1;
+    size_t c2_mod = count2;
+
+    if (!fill_same) {  // Don't waste a good "long" test.
+      if (count1 > 1 && r.next() % 10 == 0) {
+        c1_mod--;
+        s1_pot_unaligned++;
+      }
+      if (count2 > 1 && r.next() % 10 == 0) {
+        c2_mod--;
+        s2_pot_unaligned++;
+      }
+    }
+    size_t mod_min = c1_mod < c2_mod ? c1_mod : c2_mod;
+
+    int32_t expected = memcmp16_compare(s1_pot_unaligned, s2_pot_unaligned, mod_min);
+    int32_t computed = MemCmp16(s1_pot_unaligned, s2_pot_unaligned, mod_min);
+
+    ASSERT_EQ(expected, computed) << "Run " << round << ", c1=" << count1 << " c2=" << count2;
+
+    if (count1 > 0U) {
+      delete[] s1;
+    }
+    if (count2 > 0U) {
+      delete[] s2;
+    }
+  }
+}
+
+TEST_F(MemCmp16Test, RandomSeparateShort) {
+  CheckSeparate(5U, 1U);
+}
+
+TEST_F(MemCmp16Test, RandomSeparateLong) {
+  CheckSeparate(64U, 32U);
+}
+
+// TODO: What's a good test for overlapping memory? Is it important?
+// TEST_F(MemCmp16Test, RandomOverlay) {
+//
+// }
diff --git a/runtime/arch/mips/entrypoints_init_mips.cc b/runtime/arch/mips/entrypoints_init_mips.cc
index 08caa80..70a9619 100644
--- a/runtime/arch/mips/entrypoints_init_mips.cc
+++ b/runtime/arch/mips/entrypoints_init_mips.cc
@@ -45,9 +45,6 @@
 extern "C" void* art_quick_initialize_type_and_verify_access(uint32_t, void*);
 extern "C" void* art_quick_resolve_string(void*, uint32_t);
 
-// Exception entrypoints.
-extern "C" void* GetAndClearException(Thread*);
-
 // Field entrypoints.
 extern "C" int art_quick_set32_instance(uint32_t, void*, int32_t);
 extern "C" int art_quick_set32_static(uint32_t, int32_t);
@@ -117,7 +114,6 @@
 extern "C" void art_quick_invoke_virtual_trampoline_with_access_check(uint32_t, void*);
 
 // Thread entrypoints.
-extern void CheckSuspendFromCode(Thread* thread);
 extern "C" void art_quick_test_suspend();
 
 // Throw entrypoints.
@@ -229,7 +225,6 @@
   qpoints->pInvokeVirtualTrampolineWithAccessCheck = art_quick_invoke_virtual_trampoline_with_access_check;
 
   // Thread
-  qpoints->pCheckSuspend = CheckSuspendFromCode;
   qpoints->pTestSuspend = art_quick_test_suspend;
 
   // Throws
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index a31c08b..eb490eb 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -1222,8 +1222,12 @@
   // Use array so we can index into it and use a matrix for expected results
   // Setup: The first half is standard. The second half uses a non-zero offset.
   // TODO: Shared backing arrays.
-  static constexpr size_t kBaseStringCount  = 7;
-  const char* c[kBaseStringCount] = { "", "", "a", "aa", "ab", "aac", "aac" , };
+  static constexpr size_t kBaseStringCount  = 8;
+  const char* c[kBaseStringCount] = { "", "", "a", "aa", "ab",
+      "aacaacaacaacaacaac",  // This one's under the default limit to go to __memcmp16.
+      "aacaacaacaacaacaacaacaacaacaacaacaac",     // This one's over.
+      "aacaacaacaacaacaacaacaacaacaacaacaaca" };  // As is this one. We need a separate one to
+                                                  // defeat object-equal optimizations.
 
   static constexpr size_t kStringCount = 2 * kBaseStringCount;
 
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index a85e250..b217cd6 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -93,7 +93,6 @@
 extern "C" void art_quick_invoke_virtual_trampoline_with_access_check(uint32_t, void*);
 
 // Thread entrypoints.
-extern void CheckSuspendFromCode(Thread* thread);
 extern "C" void art_quick_test_suspend();
 
 // Throw entrypoints.
@@ -205,7 +204,6 @@
   qpoints->pInvokeVirtualTrampolineWithAccessCheck = art_quick_invoke_virtual_trampoline_with_access_check;
 
   // Thread
-  qpoints->pCheckSuspend = CheckSuspendFromCode;
   qpoints->pTestSuspend = art_quick_test_suspend;
 
   // Throws
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index b6f51f7..609d1c6 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -94,7 +94,6 @@
 extern "C" void art_quick_invoke_virtual_trampoline_with_access_check(uint32_t, void*);
 
 // Thread entrypoints.
-extern void CheckSuspendFromCode(Thread* thread);
 extern "C" void art_quick_test_suspend();
 
 // Throw entrypoints.
@@ -209,7 +208,6 @@
   qpoints->pInvokeVirtualTrampolineWithAccessCheck = art_quick_invoke_virtual_trampoline_with_access_check;
 
   // Thread
-  qpoints->pCheckSuspend = CheckSuspendFromCode;
   qpoints->pTestSuspend = art_quick_test_suspend;
 
   // Throws
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index 469d373..032f6be 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -129,7 +129,6 @@
   void (*pInvokeVirtualTrampolineWithAccessCheck)(uint32_t, void*);
 
   // Thread
-  void (*pCheckSuspend)(Thread*);  // Stub that is called when the suspend count is non-zero
   void (*pTestSuspend)();  // Stub that is periodically called to test the suspend count
 
   // Throws
diff --git a/runtime/entrypoints/quick/quick_thread_entrypoints.cc b/runtime/entrypoints/quick/quick_thread_entrypoints.cc
index f61c754..5c48fc7 100644
--- a/runtime/entrypoints/quick/quick_thread_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_thread_entrypoints.cc
@@ -21,13 +21,6 @@
 
 namespace art {
 
-void CheckSuspendFromCode(Thread* thread)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  // Called when thread->suspend_count_ != 0 on JNI return. JNI method acts as callee-save frame.
-  thread->VerifyStack();
-  CheckSuspend(thread);
-}
-
 extern "C" void artTestSuspendFromCode(Thread* thread, StackReference<mirror::ArtMethod>* sp)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   // Called when suspend count check value is 0 and thread->suspend_count_ != 0
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index 0dd33cf..c572baf 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -251,8 +251,7 @@
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pInvokeSuperTrampolineWithAccessCheck,
                          pInvokeVirtualTrampolineWithAccessCheck, kPointerSize);
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pInvokeVirtualTrampolineWithAccessCheck,
-                         pCheckSuspend, kPointerSize);
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pCheckSuspend, pTestSuspend, kPointerSize);
+                         pTestSuspend, kPointerSize);
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pTestSuspend, pDeliverException, kPointerSize);
 
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pDeliverException, pThrowArrayBounds, kPointerSize);
diff --git a/runtime/oat.cc b/runtime/oat.cc
index f4721f2..857c0a2 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -22,7 +22,7 @@
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '3', '5', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '3', '6', '\0' };
 
 OatHeader::OatHeader() {
   memset(this, 0, sizeof(*this));
diff --git a/runtime/thread.cc b/runtime/thread.cc
index ca8c2d7..d3487d0 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1881,7 +1881,6 @@
   QUICK_ENTRY_POINT_INFO(pInvokeStaticTrampolineWithAccessCheck)
   QUICK_ENTRY_POINT_INFO(pInvokeSuperTrampolineWithAccessCheck)
   QUICK_ENTRY_POINT_INFO(pInvokeVirtualTrampolineWithAccessCheck)
-  QUICK_ENTRY_POINT_INFO(pCheckSuspend)
   QUICK_ENTRY_POINT_INFO(pTestSuspend)
   QUICK_ENTRY_POINT_INFO(pDeliverException)
   QUICK_ENTRY_POINT_INFO(pThrowArrayBounds)