AArch64: JNI compiler fixes

This patch fixes several issues in the ARM64 assembler and JNI
compiler.

The JNI compiler is not yet enabled by default. To enable it, edit line
1884 of compiler/driver/compiler_driver.cc and remove kArm64 from the
GenericJNI list.
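
For reference, the enabling change amounts to something like the
following sketch (variable name illustrative; the exact condition at
that line may differ):

  // Before: kArm64 falls back to the Generic JNI trampoline.
  bool use_generic_jni = (instruction_set == kX86_64 || instruction_set == kArm64);
  // After: drop kArm64 so the ARM64 JNI compiler is used instead.
  bool use_generic_jni = (instruction_set == kX86_64);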

The compiler passes all tests in jni_compiler_test.

Also change the common_compiler_test instruction-set-features logic:
tests are now allowed when the build-time features are a subset of the
runtime features.
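
With features represented as a bitmask, "subset" means every build-time
feature bit is also set at runtime, which is what the new
InstructionSetFeatures::operator<= (added at the end of this patch)
checks:

  // a <= b  iff  every feature bit set in a is also set in b.
  bool operator<=(const InstructionSetFeatures &peer) const {
    return (mask_ & peer.mask_) == mask_;
  }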

Dex2oat cross-compilation now works: a 32-bit build of dex2oat should
be able to compile correctly.
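
For example, a 32-bit host build of dex2oat should now be able to
target ARM64 with an invocation along these lines (illustrative only;
other required flags omitted):

  dex2oat --instruction-set=arm64 --dex-file=<input.dex> --oat-file=<output.oat> ...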

Change-Id: I51d1c24f2c75d4397a11c54724a8b277ff3b3df8
Signed-off-by: Serban Constantinescu <serban.constantinescu@arm.com>
diff --git a/compiler/common_compiler_test.h b/compiler/common_compiler_test.h
index 9a21da0..fdf09a5 100644
--- a/compiler/common_compiler_test.h
+++ b/compiler/common_compiler_test.h
@@ -299,7 +299,7 @@
 
       // for ARM, do a runtime check to make sure that the features we are passed from
       // the build match the features we actually determine at runtime.
-      ASSERT_EQ(instruction_set_features, runtime_features);
+      ASSERT_LE(instruction_set_features, runtime_features);
 #elif defined(__aarch64__)
       instruction_set = kArm64;
       // TODO: arm64 compilation support.
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index 2d1be9d..6212a23 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -21,14 +21,29 @@
 namespace art {
 namespace arm64 {
 
-// Calling convention
+static const Register kCoreArgumentRegisters[] = {
+  X0, X1, X2, X3, X4, X5, X6, X7
+};
 
+static const WRegister kWArgumentRegisters[] = {
+  W0, W1, W2, W3, W4, W5, W6, W7
+};
+
+static const DRegister kDArgumentRegisters[] = {
+  D0, D1, D2, D3, D4, D5, D6, D7
+};
+
+static const SRegister kSArgumentRegisters[] = {
+  S0, S1, S2, S3, S4, S5, S6, S7
+};
+
+// Calling convention
 ManagedRegister Arm64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
-  return Arm64ManagedRegister::FromCoreRegister(IP0);  // X16
+  return Arm64ManagedRegister::FromCoreRegister(X20);  // saved on entry, restored on exit
 }
 
 ManagedRegister Arm64JniCallingConvention::InterproceduralScratchRegister() {
-  return Arm64ManagedRegister::FromCoreRegister(IP0);  // X16
+  return Arm64ManagedRegister::FromCoreRegister(X20);  // saved on entry, restored on exit
 }
 
 static ManagedRegister ReturnRegisterForShorty(const char* shorty) {
@@ -79,64 +94,64 @@
 FrameOffset Arm64ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
   CHECK(IsCurrentParamOnStack());
   FrameOffset result =
-      FrameOffset(displacement_.Int32Value() +         // displacement
+      FrameOffset(displacement_.Int32Value() +   // displacement
                   kFramePointerSize +                 // Method*
-                  (itr_slots_ * kFramePointerSize));  // offset into in args
+                  (itr_slots_ * sizeof(uint32_t)));  // offset into in args
   return result;
 }
 
 const ManagedRegisterEntrySpills& Arm64ManagedRuntimeCallingConvention::EntrySpills() {
   // We spill the argument registers on ARM64 to free them up for scratch use, we then assume
   // all arguments are on the stack.
-  if (entry_spills_.size() == 0) {
-    // TODO Need fp regs spilled too.
-    //
-    size_t num_spills = NumArgs();
+  if ((entry_spills_.size() == 0) && (NumArgs() > 0)) {
+    int gp_reg_index = 1;   // we start from X1/W1, X0 holds ArtMethod*.
+    int fp_reg_index = 0;   // D0/S0.
 
-    // TODO Floating point need spilling too.
-    if (num_spills > 0) {
-      entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X1));
-      if (num_spills > 1) {
-        entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X2));
-        if (num_spills > 2) {
-          entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X3));
-          if (num_spills > 3) {
-            entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X5));
-            if (num_spills > 4) {
-              entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X6));
-              if (num_spills > 5) {
-                entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X7));
-              }
+    // We need to choose the correct register (D/S or X/W) since the managed
+    // stack uses 32bit stack slots.
+    ResetIterator(FrameOffset(0));
+    while (HasNext()) {
+      if (IsCurrentParamAFloatOrDouble()) {  // FP regs.
+          if (fp_reg_index < 8) {
+            if (!IsCurrentParamADouble()) {
+              entry_spills_.push_back(Arm64ManagedRegister::FromSRegister(kSArgumentRegisters[fp_reg_index]));
+            } else {
+              entry_spills_.push_back(Arm64ManagedRegister::FromDRegister(kDArgumentRegisters[fp_reg_index]));
             }
+            fp_reg_index++;
+          } else {  // just increase the stack offset.
+            if (!IsCurrentParamADouble()) {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
+            } else {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 8);
+            }
+          }
+      } else {  // GP regs.
+        if (gp_reg_index < 8) {
+          if (IsCurrentParamALong() && (!IsCurrentParamAReference())) {
+            entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(kCoreArgumentRegisters[gp_reg_index]));
+          } else {
+            entry_spills_.push_back(Arm64ManagedRegister::FromWRegister(kWArgumentRegisters[gp_reg_index]));
+          }
+          gp_reg_index++;
+        } else {  // just increase the stack offset.
+          if (IsCurrentParamALong() && (!IsCurrentParamAReference())) {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 8);
+          } else {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
           }
         }
       }
+      Next();
     }
   }
-
   return entry_spills_;
 }
-// JNI calling convention
 
+// JNI calling convention
 Arm64JniCallingConvention::Arm64JniCallingConvention(bool is_static, bool is_synchronized,
                                                      const char* shorty)
     : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
-  // TODO This needs to be converted to 64bit.
-  // Compute padding to ensure longs and doubles are not split in AAPCS. Ignore the 'this' jobject
-  // or jclass for static methods and the JNIEnv. We start at the aligned register r2.
-//  size_t padding = 0;
-//  for (size_t cur_arg = IsStatic() ? 0 : 1, cur_reg = 2; cur_arg < NumArgs(); cur_arg++) {
-//    if (IsParamALongOrDouble(cur_arg)) {
-//      if ((cur_reg & 1) != 0) {
-//        padding += 4;
-//        cur_reg++;  // additional bump to ensure alignment
-//      }
-//      cur_reg++;  // additional bump to skip extra long word
-//    }
-//    cur_reg++;  // bump the iterator for every argument
-//  }
-  padding_ =0;
-
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X19));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X20));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X21));
@@ -162,18 +177,26 @@
 uint32_t Arm64JniCallingConvention::CoreSpillMask() const {
   // Compute spill mask to agree with callee saves initialized in the constructor
   uint32_t result = 0;
-  result =  1 << X19 | 1 << X20 | 1 << X21 | 1 << X22 | 1 << X23 | 1 << X24 | 1 << X25
-      | 1 << X26 | 1 << X27 | 1 << X28 | 1<< X29 | 1 << LR;
+  result =  1 << X19 | 1 << X20 | 1 << X21 | 1 << X22 | 1 << X23 | 1 << X24 |
+            1 << X25 | 1 << X26 | 1 << X27 | 1 << X28 | 1 << X29 | 1 << LR;
+  return result;
+}
+
+uint32_t Arm64JniCallingConvention::FpSpillMask() const {
+  // Compute spill mask to agree with callee saves initialized in the constructor
+  uint32_t result = 0;
+  result = 1 << D8 | 1 << D9 | 1 << D10 | 1 << D11 | 1 << D12 | 1 << D13 |
+           1 << D14 | 1 << D15;
   return result;
 }
 
 ManagedRegister Arm64JniCallingConvention::ReturnScratchRegister() const {
-  return Arm64ManagedRegister::FromCoreRegister(X9);
+  return ManagedRegister::NoRegister();
 }
 
 size_t Arm64JniCallingConvention::FrameSize() {
-  // Method*, LR and callee save area size, local reference segment state
-  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
+  // Method*, callee save area size, local reference segment state
+  size_t frame_data_size = ((1 + CalleeSaveRegisters().size()) * kFramePointerSize) + sizeof(uint32_t);
   // References plus 2 words for SIRT header
   size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
@@ -181,64 +204,60 @@
 }
 
 size_t Arm64JniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize + padding_,
-                 kStackAlignment);
-}
-
-// JniCallingConvention ABI follows AAPCS where longs and doubles must occur
-// in even register numbers and stack slots
-void Arm64JniCallingConvention::Next() {
-  JniCallingConvention::Next();
-  size_t arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-  if ((itr_args_ >= 2) &&
-      (arg_pos < NumArgs()) &&
-      IsParamALongOrDouble(arg_pos)) {
-    // itr_slots_ needs to be an even number, according to AAPCS.
-    if ((itr_slots_ & 0x1u) != 0) {
-      itr_slots_++;
-    }
-  }
+  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
 }
 
 bool Arm64JniCallingConvention::IsCurrentParamInRegister() {
-  return itr_slots_ < 4;
+  if (IsCurrentParamAFloatOrDouble()) {
+    return (itr_float_and_doubles_ < 8);
+  } else {
+    return ((itr_args_ - itr_float_and_doubles_) < 8);
+  }
 }
 
 bool Arm64JniCallingConvention::IsCurrentParamOnStack() {
   return !IsCurrentParamInRegister();
 }
 
-// TODO and floating point?
-
-static const Register kJniArgumentRegisters[] = {
-  X0, X1, X2, X3, X4, X5, X6, X7
-};
 ManagedRegister Arm64JniCallingConvention::CurrentParamRegister() {
-  CHECK_LT(itr_slots_, 4u);
-  int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-  // TODO Floating point & 64bit registers.
-  if ((itr_args_ >= 2) && IsParamALongOrDouble(arg_pos)) {
-    CHECK_EQ(itr_slots_, 2u);
-    return Arm64ManagedRegister::FromCoreRegister(X1);
+  CHECK(IsCurrentParamInRegister());
+  if (IsCurrentParamAFloatOrDouble()) {
+    CHECK_LT(itr_float_and_doubles_, 8u);
+    if (IsCurrentParamADouble()) {
+      return Arm64ManagedRegister::FromDRegister(kDArgumentRegisters[itr_float_and_doubles_]);
+    } else {
+      return Arm64ManagedRegister::FromSRegister(kSArgumentRegisters[itr_float_and_doubles_]);
+    }
   } else {
-    return
-      Arm64ManagedRegister::FromCoreRegister(kJniArgumentRegisters[itr_slots_]);
+    int gp_reg = itr_args_ - itr_float_and_doubles_;
+    CHECK_LT(static_cast<unsigned int>(gp_reg), 8u);
+    if (IsCurrentParamALong() || IsCurrentParamAReference() || IsCurrentParamJniEnv())  {
+      return Arm64ManagedRegister::FromCoreRegister(kCoreArgumentRegisters[gp_reg]);
+    } else {
+      return Arm64ManagedRegister::FromWRegister(kWArgumentRegisters[gp_reg]);
+    }
   }
 }
 
 FrameOffset Arm64JniCallingConvention::CurrentParamStackOffset() {
-  CHECK_GE(itr_slots_, 4u);
-  size_t offset = displacement_.Int32Value() - OutArgSize() + ((itr_slots_ - 4) * kFramePointerSize);
+  CHECK(IsCurrentParamOnStack());
+  size_t args_on_stack = itr_args_
+                  - std::min(8u, itr_float_and_doubles_)
+                  - std::min(8u, (itr_args_ - itr_float_and_doubles_));
+  size_t offset = displacement_.Int32Value() - OutArgSize() + (args_on_stack * kFramePointerSize);
   CHECK_LT(offset, OutArgSize());
   return FrameOffset(offset);
 }
 
 size_t Arm64JniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = IsStatic() ? 1 : 0;  // count jclass
-  // regular argument parameters and this
-  size_t param_args = NumArgs() + NumLongOrDoubleArgs();
-  // count JNIEnv* less arguments in registers
-  return static_args + param_args + 1 - 4;
+  // all arguments including JNI args
+  size_t all_args = NumArgs() + NumberOfExtraArgumentsForJni();
+
+  size_t all_stack_args = all_args -
+            std::min(8u, static_cast<unsigned int>(NumFloatOrDoubleArgs())) -
+            std::min(8u, static_cast<unsigned int>((all_args - NumFloatOrDoubleArgs())));
+
+  return all_stack_args;
 }
 
 }  // namespace arm64
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index c18cd2b..92f547c 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -55,7 +55,6 @@
   ManagedRegister IntReturnRegister() OVERRIDE;
   ManagedRegister InterproceduralScratchRegister() OVERRIDE;
   // JNI calling convention
-  void Next() OVERRIDE;  // Override default behavior for AAPCS
   size_t FrameSize() OVERRIDE;
   size_t OutArgSize() OVERRIDE;
   const std::vector<ManagedRegister>& CalleeSaveRegisters() const OVERRIDE {
@@ -63,9 +62,7 @@
   }
   ManagedRegister ReturnScratchRegister() const OVERRIDE;
   uint32_t CoreSpillMask() const OVERRIDE;
-  uint32_t FpSpillMask() const OVERRIDE {
-    return 0;  // Floats aren't spilled in JNI down call
-  }
+  uint32_t FpSpillMask() const OVERRIDE;
   bool IsCurrentParamInRegister() OVERRIDE;
   bool IsCurrentParamOnStack() OVERRIDE;
   ManagedRegister CurrentParamRegister() OVERRIDE;
@@ -78,9 +75,6 @@
   // TODO: these values aren't unique and can be shared amongst instances
   std::vector<ManagedRegister> callee_save_regs_;
 
-  // Padding to ensure longs and doubles are not split in AAPCS
-  size_t padding_;
-
   DISALLOW_COPY_AND_ASSIGN(Arm64JniCallingConvention);
 };
 
diff --git a/compiler/jni/quick/calling_convention.cc b/compiler/jni/quick/calling_convention.cc
index 8efdcda..a99a4c2 100644
--- a/compiler/jni/quick/calling_convention.cc
+++ b/compiler/jni/quick/calling_convention.cc
@@ -90,6 +90,14 @@
   return IsParamAFloatOrDouble(itr_args_);
 }
 
+bool ManagedRuntimeCallingConvention::IsCurrentParamADouble() {
+  return IsParamADouble(itr_args_);
+}
+
+bool ManagedRuntimeCallingConvention::IsCurrentParamALong() {
+  return IsParamALong(itr_args_);
+}
+
 // JNI calling convention
 
 JniCallingConvention* JniCallingConvention::Create(bool is_static, bool is_synchronized,
@@ -168,6 +176,10 @@
   }
 }
 
+bool JniCallingConvention::IsCurrentParamJniEnv() {
+  return (itr_args_ == kJniEnv);
+}
+
 bool JniCallingConvention::IsCurrentParamAFloatOrDouble() {
   switch (itr_args_) {
     case kJniEnv:
@@ -181,6 +193,32 @@
   }
 }
 
+bool JniCallingConvention::IsCurrentParamADouble() {
+  switch (itr_args_) {
+    case kJniEnv:
+      return false;  // JNIEnv*
+    case kObjectOrClass:
+      return false;   // jobject or jclass
+    default: {
+      int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
+      return IsParamADouble(arg_pos);
+    }
+  }
+}
+
+bool JniCallingConvention::IsCurrentParamALong() {
+  switch (itr_args_) {
+    case kJniEnv:
+      return false;  // JNIEnv*
+    case kObjectOrClass:
+      return false;   // jobject or jclass
+    default: {
+      int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
+      return IsParamALong(arg_pos);
+    }
+  }
+}
+
 // Return position of SIRT entry holding reference at the current iterator
 // position
 FrameOffset JniCallingConvention::CurrentParamSirtEntryOffset() {
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 76d237e..4d25d1c 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -126,6 +126,24 @@
     char ch = shorty_[param];
     return (ch == 'F' || ch == 'D');
   }
+  bool IsParamADouble(unsigned int param) const {
+    DCHECK_LT(param, NumArgs());
+    if (IsStatic()) {
+      param++;  // 0th argument must skip return value at start of the shorty
+    } else if (param == 0) {
+      return false;  // this argument
+    }
+    return shorty_[param] == 'D';
+  }
+  bool IsParamALong(unsigned int param) const {
+    DCHECK_LT(param, NumArgs());
+    if (IsStatic()) {
+      param++;  // 0th argument must skip return value at start of the shorty
+    } else if (param == 0) {
+      return true;  // this argument
+    }
+    return shorty_[param] == 'J';
+  }
   bool IsParamAReference(unsigned int param) const {
     DCHECK_LT(param, NumArgs());
     if (IsStatic()) {
@@ -214,6 +232,8 @@
   void Next();
   bool IsCurrentParamAReference();
   bool IsCurrentParamAFloatOrDouble();
+  bool IsCurrentParamADouble();
+  bool IsCurrentParamALong();
   bool IsCurrentArgExplicit();  // ie a non-implict argument such as this
   bool IsCurrentArgPossiblyNull();
   size_t CurrentParamSize();
@@ -283,6 +303,9 @@
   virtual void Next();
   bool IsCurrentParamAReference();
   bool IsCurrentParamAFloatOrDouble();
+  bool IsCurrentParamADouble();
+  bool IsCurrentParamALong();
+  bool IsCurrentParamJniEnv();
   size_t CurrentParamSize();
   virtual bool IsCurrentParamInRegister() = 0;
   virtual bool IsCurrentParamOnStack() = 0;
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index dcdcdd1..64508d1 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -29,6 +29,7 @@
 #include "utils/assembler.h"
 #include "utils/managed_register.h"
 #include "utils/arm/managed_register_arm.h"
+#include "utils/arm64/managed_register_arm64.h"
 #include "utils/mips/managed_register_mips.h"
 #include "utils/x86/managed_register_x86.h"
 #include "thread.h"
@@ -73,11 +74,17 @@
 
   // Calling conventions to call into JNI method "end" possibly passing a returned reference, the
   //     method and the current thread.
-  size_t jni_end_arg_count = 0;
-  if (reference_return) { jni_end_arg_count++; }
-  if (is_synchronized) { jni_end_arg_count++; }
-  const char* jni_end_shorty = jni_end_arg_count == 0 ? "I"
-                                                        : (jni_end_arg_count == 1 ? "II" : "III");
+  const char* jni_end_shorty;
+  if (reference_return && is_synchronized) {
+    jni_end_shorty = "ILL";
+  } else if (reference_return) {
+    jni_end_shorty = "IL";
+  } else if (is_synchronized) {
+    jni_end_shorty = "VL";
+  } else {
+    jni_end_shorty = "V";
+  }
+
   UniquePtr<JniCallingConvention> end_jni_conv(
       JniCallingConvention::Create(is_static, is_synchronized, jni_end_shorty, instruction_set));
 
@@ -101,12 +108,22 @@
   __ StoreImmediateToFrame(main_jni_conv->SirtNumRefsOffset(),
                            main_jni_conv->ReferenceCount(),
                            mr_conv->InterproceduralScratchRegister());
-  __ CopyRawPtrFromThread32(main_jni_conv->SirtLinkOffset(),
-                          Thread::TopSirtOffset<4>(),
-                          mr_conv->InterproceduralScratchRegister());
-  __ StoreStackOffsetToThread32(Thread::TopSirtOffset<4>(),
-                              main_jni_conv->SirtOffset(),
-                              mr_conv->InterproceduralScratchRegister());
+
+  if (instruction_set == kArm64 || instruction_set == kX86_64) {
+    __ CopyRawPtrFromThread64(main_jni_conv->SirtLinkOffset(),
+                            Thread::TopSirtOffset<8>(),
+                            mr_conv->InterproceduralScratchRegister());
+    __ StoreStackOffsetToThread64(Thread::TopSirtOffset<8>(),
+                                main_jni_conv->SirtOffset(),
+                                mr_conv->InterproceduralScratchRegister());
+  } else {
+    __ CopyRawPtrFromThread32(main_jni_conv->SirtLinkOffset(),
+                            Thread::TopSirtOffset<4>(),
+                            mr_conv->InterproceduralScratchRegister());
+    __ StoreStackOffsetToThread32(Thread::TopSirtOffset<4>(),
+                                main_jni_conv->SirtOffset(),
+                                mr_conv->InterproceduralScratchRegister());
+  }
 
   // 3. Place incoming reference arguments into SIRT
   main_jni_conv->Next();  // Skip JNIEnv*
@@ -154,9 +171,15 @@
   }
 
   // 4. Write out the end of the quick frames.
-  __ StoreStackPointerToThread32(Thread::TopOfManagedStackOffset<4>());
-  __ StoreImmediateToThread32(Thread::TopOfManagedStackPcOffset<4>(), 0,
-                            mr_conv->InterproceduralScratchRegister());
+  if (instruction_set == kArm64 || instruction_set == kX86_64) {
+    __ StoreStackPointerToThread64(Thread::TopOfManagedStackOffset<8>());
+    __ StoreImmediateToThread64(Thread::TopOfManagedStackPcOffset<8>(), 0,
+                              mr_conv->InterproceduralScratchRegister());
+  } else {
+    __ StoreStackPointerToThread32(Thread::TopOfManagedStackOffset<4>());
+    __ StoreImmediateToThread32(Thread::TopOfManagedStackPcOffset<4>(), 0,
+                              mr_conv->InterproceduralScratchRegister());
+  }
 
   // 5. Move frame down to allow space for out going args.
   const size_t main_out_arg_size = main_jni_conv->OutArgSize();
@@ -164,13 +187,14 @@
   const size_t max_out_arg_size = std::max(main_out_arg_size, end_out_arg_size);
   __ IncreaseFrameSize(max_out_arg_size);
 
-
   // 6. Call into appropriate JniMethodStart passing Thread* so that transition out of Runnable
   //    can occur. The result is the saved JNI local state that is restored by the exit call. We
   //    abuse the JNI calling convention here, that is guaranteed to support passing 2 pointer
   //    arguments.
-  ThreadOffset<4> jni_start = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStartSynchronized)
-                                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStart);
+  ThreadOffset<4> jni_start32 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStartSynchronized)
+                                                : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStart);
+  ThreadOffset<8> jni_start64 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(8, pJniMethodStartSynchronized)
+                                                : QUICK_ENTRYPOINT_OFFSET(8, pJniMethodStart);
   main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
   FrameOffset locked_object_sirt_offset(0);
   if (is_synchronized) {
@@ -192,12 +216,21 @@
   }
   if (main_jni_conv->IsCurrentParamInRegister()) {
     __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
-    __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start),
-            main_jni_conv->InterproceduralScratchRegister());
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start64),
+             main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start32),
+             main_jni_conv->InterproceduralScratchRegister());
+    }
   } else {
     __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset(),
                         main_jni_conv->InterproceduralScratchRegister());
-    __ CallFromThread32(jni_start, main_jni_conv->InterproceduralScratchRegister());
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ CallFromThread64(jni_start64, main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ CallFromThread32(jni_start32, main_jni_conv->InterproceduralScratchRegister());
+    }
   }
   if (is_synchronized) {  // Check for exceptions from monitor enter.
     __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), main_out_arg_size);
@@ -259,11 +292,20 @@
   if (main_jni_conv->IsCurrentParamInRegister()) {
     ManagedRegister jni_env = main_jni_conv->CurrentParamRegister();
     DCHECK(!jni_env.Equals(main_jni_conv->InterproceduralScratchRegister()));
-    __ LoadRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>());
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ LoadRawPtrFromThread64(jni_env, Thread::JniEnvOffset<8>());
+    } else {
+      __ LoadRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>());
+    }
   } else {
     FrameOffset jni_env = main_jni_conv->CurrentParamStackOffset();
-    __ CopyRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>(),
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ CopyRawPtrFromThread64(jni_env, Thread::JniEnvOffset<8>(),
                             main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ CopyRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>(),
+                            main_jni_conv->InterproceduralScratchRegister());
+    }
   }
 
   // 9. Plant call to native code associated with method.
@@ -295,19 +337,23 @@
     __ Store(return_save_location, main_jni_conv->ReturnRegister(), main_jni_conv->SizeOfReturnValue());
   }
 
-  // 12. Call into JNI method end possibly passing a returned reference, the method and the current
   //     thread.
   end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
-  ThreadOffset<4> jni_end(-1);
+  ThreadOffset<4> jni_end32(-1);
+  ThreadOffset<8> jni_end64(-1);
   if (reference_return) {
     // Pass result.
-    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReferenceSynchronized)
-                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReference);
+    jni_end32 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReferenceSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReference);
+    jni_end64 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEndWithReferenceSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEndWithReference);
     SetNativeParameter(jni_asm.get(), end_jni_conv.get(), end_jni_conv->ReturnRegister());
     end_jni_conv->Next();
   } else {
-    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndSynchronized)
-                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEnd);
+    jni_end32 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEnd);
+    jni_end64 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEndSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEnd);
   }
   // Pass saved local reference state.
   if (end_jni_conv->IsCurrentParamOnStack()) {
@@ -334,12 +380,21 @@
   }
   if (end_jni_conv->IsCurrentParamInRegister()) {
     __ GetCurrentThread(end_jni_conv->CurrentParamRegister());
-    __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end),
-            end_jni_conv->InterproceduralScratchRegister());
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end64),
+              end_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end32),
+              end_jni_conv->InterproceduralScratchRegister());
+    }
   } else {
     __ GetCurrentThread(end_jni_conv->CurrentParamStackOffset(),
                         end_jni_conv->InterproceduralScratchRegister());
-    __ CallFromThread32(ThreadOffset<4>(jni_end), end_jni_conv->InterproceduralScratchRegister());
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ CallFromThread64(ThreadOffset<8>(jni_end64), end_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ CallFromThread32(ThreadOffset<4>(jni_end32), end_jni_conv->InterproceduralScratchRegister());
+    }
   }
 
   // 13. Reload return value
@@ -360,6 +415,10 @@
   // 17. Finalize code generation
   __ EmitSlowPaths();
   size_t cs = __ CodeSize();
+  if (instruction_set == kArm64) {
+    // Test that we do not exceed the buffer size.
+    CHECK(cs < arm64::kBufferSizeArm64);
+  }
   std::vector<uint8_t> managed_code(cs);
   MemoryRegion code(&managed_code[0], managed_code.size());
   __ FinalizeInstructions(code);
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index a11c2da..1d87eaa 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -50,11 +50,11 @@
 }
 
 void Arm64Assembler::GetCurrentThread(ManagedRegister tr) {
-  ___ Mov(reg_x(tr.AsArm64().AsCoreRegister()), reg_x(TR));
+  ___ Mov(reg_x(tr.AsArm64().AsCoreRegister()), reg_x(TR1));
 }
 
 void Arm64Assembler::GetCurrentThread(FrameOffset offset, ManagedRegister /* scratch */) {
-  StoreToOffset(TR, SP, offset.Int32Value());
+  StoreToOffset(TR1, SP, offset.Int32Value());
 }
 
 // See Arm64 PCS Section 5.2.2.1.
@@ -138,7 +138,8 @@
 void Arm64Assembler::StoreRef(FrameOffset offs, ManagedRegister m_src) {
   Arm64ManagedRegister src = m_src.AsArm64();
   CHECK(src.IsCoreRegister()) << src;
-  StoreToOffset(src.AsCoreRegister(), SP, offs.Int32Value());
+  StoreWToOffset(kStoreWord, src.AsOverlappingCoreRegisterLow(), SP,
+                 offs.Int32Value());
 }
 
 void Arm64Assembler::StoreRawPtr(FrameOffset offs, ManagedRegister m_src) {
@@ -152,30 +153,31 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadImmediate(scratch.AsCoreRegister(), imm);
-  StoreToOffset(scratch.AsCoreRegister(), SP, offs.Int32Value());
+  StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(), SP,
+                 offs.Int32Value());
 }
 
-void Arm64Assembler::StoreImmediateToThread32(ThreadOffset<4> offs, uint32_t imm,
+void Arm64Assembler::StoreImmediateToThread64(ThreadOffset<8> offs, uint32_t imm,
                                             ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadImmediate(scratch.AsCoreRegister(), imm);
-  StoreToOffset(scratch.AsCoreRegister(), TR, offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), TR1, offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackOffsetToThread32(ThreadOffset<4> tr_offs,
+void Arm64Assembler::StoreStackOffsetToThread64(ThreadOffset<8> tr_offs,
                                               FrameOffset fr_offs,
                                               ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   AddConstant(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackPointerToThread32(ThreadOffset<4> tr_offs) {
+void Arm64Assembler::StoreStackPointerToThread64(ThreadOffset<8> tr_offs) {
   // Arm64 does not support: "str sp, [dest]" therefore we use IP1 as a temp reg.
   ___ Mov(reg_x(IP1), reg_x(SP));
-  StoreToOffset(IP1, TR, tr_offs.Int32Value());
+  StoreToOffset(IP1, TR1, tr_offs.Int32Value());
 }
 
 void Arm64Assembler::StoreSpanning(FrameOffset dest_off, ManagedRegister m_source,
@@ -254,9 +256,13 @@
     CHECK_EQ(4u, size) << dest;
     ___ Ldr(reg_w(dest.AsWRegister()), MEM_OP(reg_x(base), offset));
   } else if (dest.IsCoreRegister()) {
-    CHECK_EQ(8u, size) << dest;
     CHECK_NE(dest.AsCoreRegister(), SP) << dest;
-    ___ Ldr(reg_x(dest.AsCoreRegister()), MEM_OP(reg_x(base), offset));
+    if (size == 4u) {
+      ___ Ldr(reg_w(dest.AsOverlappingCoreRegisterLow()), MEM_OP(reg_x(base), offset));
+    } else {
+      CHECK_EQ(8u, size) << dest;
+      ___ Ldr(reg_x(dest.AsCoreRegister()), MEM_OP(reg_x(base), offset));
+    }
   } else if (dest.IsSRegister()) {
     ___ Ldr(reg_s(dest.AsSRegister()), MEM_OP(reg_x(base), offset));
   } else {
@@ -269,14 +275,14 @@
   return Load(m_dst.AsArm64(), SP, src.Int32Value(), size);
 }
 
-void Arm64Assembler::LoadFromThread32(ManagedRegister m_dst, ThreadOffset<4> src, size_t size) {
-  return Load(m_dst.AsArm64(), TR, src.Int32Value(), size);
+void Arm64Assembler::LoadFromThread64(ManagedRegister m_dst, ThreadOffset<8> src, size_t size) {
+  return Load(m_dst.AsArm64(), TR1, src.Int32Value(), size);
 }
 
 void Arm64Assembler::LoadRef(ManagedRegister m_dst, FrameOffset offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
-  LoadFromOffset(dst.AsCoreRegister(), SP, offs.Int32Value());
+  LoadWFromOffset(kLoadWord, dst.AsOverlappingCoreRegisterLow(), SP, offs.Int32Value());
 }
 
 void Arm64Assembler::LoadRef(ManagedRegister m_dst, ManagedRegister m_base,
@@ -284,7 +290,8 @@
   Arm64ManagedRegister dst = m_dst.AsArm64();
   Arm64ManagedRegister base = m_base.AsArm64();
   CHECK(dst.IsCoreRegister() && base.IsCoreRegister());
-  LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
+  LoadWFromOffset(kLoadWord, dst.AsOverlappingCoreRegisterLow(), base.AsCoreRegister(),
+                  offs.Int32Value());
 }
 
 void Arm64Assembler::LoadRawPtr(ManagedRegister m_dst, ManagedRegister m_base, Offset offs) {
@@ -294,10 +301,10 @@
   LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
 }
 
-void Arm64Assembler::LoadRawPtrFromThread32(ManagedRegister m_dst, ThreadOffset<4> offs) {
+void Arm64Assembler::LoadRawPtrFromThread64(ManagedRegister m_dst, ThreadOffset<8> offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
-  LoadFromOffset(dst.AsCoreRegister(), TR, offs.Int32Value());
+  LoadFromOffset(dst.AsCoreRegister(), TR1, offs.Int32Value());
 }
 
 // Copying routines.
@@ -306,8 +313,16 @@
   Arm64ManagedRegister src = m_src.AsArm64();
   if (!dst.Equals(src)) {
     if (dst.IsCoreRegister()) {
-      CHECK(src.IsCoreRegister()) << src;
-      ___ Mov(reg_x(dst.AsCoreRegister()), reg_x(src.AsCoreRegister()));
+      if (size == 4) {
+        CHECK(src.IsWRegister());
+        ___ Mov(reg_x(dst.AsCoreRegister()), reg_w(src.AsWRegister()));
+      } else {
+        if (src.IsCoreRegister()) {
+          ___ Mov(reg_x(dst.AsCoreRegister()), reg_x(src.AsCoreRegister()));
+        } else {
+          ___ Mov(reg_x(dst.AsCoreRegister()), reg_w(src.AsWRegister()));
+        }
+      }
     } else if (dst.IsWRegister()) {
       CHECK(src.IsWRegister()) << src;
       ___ Mov(reg_w(dst.AsWRegister()), reg_w(src.AsWRegister()));
@@ -322,40 +337,42 @@
   }
 }
 
-void Arm64Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
-                                          ThreadOffset<4> tr_offs,
+void Arm64Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs,
+                                          ThreadOffset<8> tr_offs,
                                           ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
-  LoadFromOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
   StoreToOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
 }
 
-void Arm64Assembler::CopyRawPtrToThread32(ThreadOffset<4> tr_offs,
+void Arm64Assembler::CopyRawPtrToThread64(ThreadOffset<8> tr_offs,
                                         FrameOffset fr_offs,
                                         ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadFromOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
 }
 
 void Arm64Assembler::CopyRef(FrameOffset dest, FrameOffset src,
                              ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
-  LoadFromOffset(scratch.AsCoreRegister(), SP, src.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), SP, dest.Int32Value());
+  LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(),
+                  SP, src.Int32Value());
+  StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(),
+                 SP, dest.Int32Value());
 }
 
 void Arm64Assembler::Copy(FrameOffset dest, FrameOffset src,
                           ManagedRegister m_scratch, size_t size) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
-  CHECK(scratch.IsCoreRegister() || scratch.IsWRegister()) << scratch;
+  CHECK(scratch.IsCoreRegister()) << scratch;
   CHECK(size == 4 || size == 8) << size;
   if (size == 4) {
-    LoadWFromOffset(kLoadWord, scratch.AsWRegister(), SP, src.Int32Value());
-    StoreWToOffset(kStoreWord, scratch.AsWRegister(), SP, dest.Int32Value());
+    LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(), SP, src.Int32Value());
+    StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(), SP, dest.Int32Value());
   } else if (size == 8) {
     LoadFromOffset(scratch.AsCoreRegister(), SP, src.Int32Value());
     StoreToOffset(scratch.AsCoreRegister(), SP, dest.Int32Value());
@@ -418,10 +435,17 @@
   CHECK(scratch.IsCoreRegister() || scratch.IsWRegister()) << scratch;
   CHECK(size == 4 || size == 8) << size;
   if (size == 4) {
-    LoadWFromOffset(kLoadWord, scratch.AsWRegister(), src.AsCoreRegister(),
+    if (scratch.IsWRegister()) {
+      LoadWFromOffset(kLoadWord, scratch.AsWRegister(), src.AsCoreRegister(),
                     src_offset.Int32Value());
-    StoreWToOffset(kStoreWord, scratch.AsWRegister(), dest.AsCoreRegister(),
+      StoreWToOffset(kStoreWord, scratch.AsWRegister(), dest.AsCoreRegister(),
                    dest_offset.Int32Value());
+    } else {
+      LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(), src.AsCoreRegister(),
+                    src_offset.Int32Value());
+      StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(), dest.AsCoreRegister(),
+                   dest_offset.Int32Value());
+    }
   } else if (size == 8) {
     LoadFromOffset(scratch.AsCoreRegister(), src.AsCoreRegister(), src_offset.Int32Value());
     StoreToOffset(scratch.AsCoreRegister(), dest.AsCoreRegister(), dest_offset.Int32Value());
@@ -486,7 +510,7 @@
   ___ Blr(reg_x(scratch.AsCoreRegister()));
 }
 
-void Arm64Assembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*scratch*/) {
+void Arm64Assembler::CallFromThread64(ThreadOffset<8> /*offset*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL) << "Unimplemented Call() variant";
 }
 
@@ -502,10 +526,11 @@
     // the address in the SIRT holding the reference.
     // e.g. out_reg = (handle == 0) ? 0 : (SP+handle_offset)
     if (in_reg.IsNoRegister()) {
-      LoadFromOffset(out_reg.AsCoreRegister(), SP, sirt_offs.Int32Value());
+      LoadWFromOffset(kLoadWord, out_reg.AsOverlappingCoreRegisterLow(), SP,
+                      sirt_offs.Int32Value());
       in_reg = out_reg;
     }
-    ___ Cmp(reg_x(in_reg.AsCoreRegister()), 0);
+    ___ Cmp(reg_w(in_reg.AsOverlappingCoreRegisterLow()), 0);
     if (!out_reg.Equals(in_reg)) {
       LoadImmediate(out_reg.AsCoreRegister(), 0, EQ);
     }
@@ -520,11 +545,12 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   if (null_allowed) {
-    LoadFromOffset(scratch.AsCoreRegister(), SP, sirt_offset.Int32Value());
+    LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(), SP,
+                    sirt_offset.Int32Value());
     // Null values get a SIRT entry value of 0.  Otherwise, the sirt entry is
     // the address in the SIRT holding the reference.
     // e.g. scratch = (scratch == 0) ? 0 : (SP+sirt_offset)
-    ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
+    ___ Cmp(reg_w(scratch.AsOverlappingCoreRegisterLow()), 0);
     // Move this logic in add constants with flags.
     AddConstant(scratch.AsCoreRegister(), SP, sirt_offset.Int32Value(), NE);
   } else {
@@ -555,7 +581,7 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   Arm64Exception *current_exception = new Arm64Exception(scratch, stack_adjust);
   exception_blocks_.push_back(current_exception);
-  LoadFromOffset(scratch.AsCoreRegister(), TR, Thread::ExceptionOffset<4>().Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), TR1, Thread::ExceptionOffset<8>().Int32Value());
   ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
   ___ B(current_exception->Entry(), COND_OP(NE));
 }
@@ -569,7 +595,11 @@
   // Pass exception object as argument.
   // Don't care about preserving X0 as this won't return.
   ___ Mov(reg_x(X0), reg_x(exception->scratch_.AsCoreRegister()));
-  LoadFromOffset(IP1, TR, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
+  LoadFromOffset(IP1, TR1, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
+
+  // FIXME: Temporary fix for TR (XSELF).
+  ___ Mov(reg_x(TR), reg_x(TR1));
+
   ___ Blr(reg_x(IP1));
   // Call should never return.
   ___ Brk();
@@ -590,6 +620,9 @@
   CHECK_EQ(callee_save_regs.size(), kCalleeSavedRegsSize);
   ___ PushCalleeSavedRegisters();
 
+  // FIXME: Temporary fix for TR (XSELF).
+  ___ Mov(reg_x(TR1), reg_x(TR));
+
   // Increate frame to required size - must be at least space to push Method*.
   CHECK_GT(frame_size, kCalleeSavedRegsSize * kFramePointerSize);
   size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
@@ -598,11 +631,27 @@
   // Write Method*.
   StoreToOffset(X0, SP, 0);
 
-  // Write out entry spills, treated as X regs.
-  // TODO: we can implement a %2 STRP variant of StoreToOffset.
+  // Write out entry spills
+  int32_t offset = frame_size + kFramePointerSize;
   for (size_t i = 0; i < entry_spills.size(); ++i) {
-    Register reg = entry_spills.at(i).AsArm64().AsCoreRegister();
-    StoreToOffset(reg, SP, frame_size + kFramePointerSize + (i * kFramePointerSize));
+    Arm64ManagedRegister reg = entry_spills.at(i).AsArm64();
+    if (reg.IsNoRegister()) {
+      // only increment stack offset.
+      ManagedRegisterSpill spill = entry_spills.at(i);
+      offset += spill.getSize();
+    } else if (reg.IsCoreRegister()) {
+      StoreToOffset(reg.AsCoreRegister(), SP, offset);
+      offset += 8;
+    } else if (reg.IsWRegister()) {
+      StoreWToOffset(kStoreWord, reg.AsWRegister(), SP, offset);
+      offset += 4;
+    } else if (reg.IsDRegister()) {
+      StoreDToOffset(reg.AsDRegister(), SP, offset);
+      offset += 8;
+    } else if (reg.IsSRegister()) {
+      StoreSToOffset(reg.AsSRegister(), SP, offset);
+      offset += 4;
+    }
   }
 }
 
@@ -618,6 +667,9 @@
   size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
   DecreaseFrameSize(adjust);
 
+  // FIXME: Temporary fix for TR (XSELF).
+  ___ Mov(reg_x(TR), reg_x(TR1));
+
   // Pop callee saved and return to LR.
   ___ PopCalleeSavedRegisters();
   ___ Ret();
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 8acd1f9..97fb93a 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -81,8 +81,8 @@
 
 class Arm64Assembler FINAL : public Assembler {
  public:
-  Arm64Assembler() : vixl_buf_(new byte[BUF_SIZE]),
-  vixl_masm_(new vixl::MacroAssembler(vixl_buf_, BUF_SIZE)) {}
+  Arm64Assembler() : vixl_buf_(new byte[kBufferSizeArm64]),
+  vixl_masm_(new vixl::MacroAssembler(vixl_buf_, kBufferSizeArm64)) {}
 
   virtual ~Arm64Assembler() {
     delete[] vixl_buf_;
@@ -114,27 +114,27 @@
   void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
   void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
   void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
-  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+  void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm, ManagedRegister scratch)
       OVERRIDE;
-  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+  void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
                                   ManagedRegister scratch) OVERRIDE;
-  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
+  void StoreStackPointerToThread64(ThreadOffset<8> thr_offs) OVERRIDE;
   void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
                      ManagedRegister scratch) OVERRIDE;
 
   // Load routines.
   void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
-  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
+  void LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size) OVERRIDE;
   void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
   void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
   void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
-  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
+  void LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs) OVERRIDE;
 
   // Copying routines.
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
-  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+  void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
                               ManagedRegister scratch) OVERRIDE;
-  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+  void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
       OVERRIDE;
   void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
   void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
@@ -183,7 +183,7 @@
   // Call to address held at [base+offset].
   void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
   void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
-  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch) OVERRIDE;
 
   // Jump to address (not setting link register)
   void JumpTo(ManagedRegister m_base, Offset offs, ManagedRegister m_scratch);
@@ -234,9 +234,6 @@
   void AddConstant(Register rd, int32_t value, Condition cond = AL);
   void AddConstant(Register rd, Register rn, int32_t value, Condition cond = AL);
 
-  // Vixl buffer size.
-  static constexpr size_t BUF_SIZE = 4096;
-
   // Vixl buffer.
   byte* vixl_buf_;
 
diff --git a/compiler/utils/arm64/constants_arm64.h b/compiler/utils/arm64/constants_arm64.h
index ecf9fbe..2a08c95 100644
--- a/compiler/utils/arm64/constants_arm64.h
+++ b/compiler/utils/arm64/constants_arm64.h
@@ -31,6 +31,9 @@
 
 constexpr unsigned int kCalleeSavedRegsSize = 20;
 
+// Vixl buffer size.
+constexpr size_t kBufferSizeArm64 = 4096*2;
+
 }  // arm64
 }  // art
 
diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h
index f007d28..06ce3b4 100644
--- a/compiler/utils/managed_register.h
+++ b/compiler/utils/managed_register.h
@@ -89,6 +89,9 @@
   explicit ManagedRegisterSpill(const ManagedRegister& other)
       : ManagedRegister(other), size_(-1), spill_offset_(-1) { }
 
+  explicit ManagedRegisterSpill(const ManagedRegister& other, int32_t size)
+      : ManagedRegister(other), size_(size), spill_offset_(-1) { }
+
   int32_t getSpillOffset() {
     return spill_offset_;
   }
@@ -111,6 +114,11 @@
     std::vector<ManagedRegisterSpill>::push_back(spill);
   }
 
+  void push_back(ManagedRegister __x, int32_t __size) {
+    ManagedRegisterSpill spill(__x, __size);
+    std::vector<ManagedRegisterSpill>::push_back(spill);
+  }
+
   void push_back(ManagedRegisterSpill __x) {
     std::vector<ManagedRegisterSpill>::push_back(__x);
   }
diff --git a/runtime/arch/arm64/jni_entrypoints_arm64.S b/runtime/arch/arm64/jni_entrypoints_arm64.S
index d2ed692..ba783ab 100644
--- a/runtime/arch/arm64/jni_entrypoints_arm64.S
+++ b/runtime/arch/arm64/jni_entrypoints_arm64.S
@@ -20,11 +20,76 @@
      * Jni dlsym lookup stub.
      */
     .extern artFindNativeMethod
-UNIMPLEMENTED art_jni_dlsym_lookup_stub
+
+    // TODO: Add CFI directives.
+ENTRY art_jni_dlsym_lookup_stub
+  // spill regs.
+  stp   x29, x30, [sp, #-16]!
+  mov   x29, sp
+  stp   d6, d7,   [sp, #-16]!
+  stp   d4, d5,   [sp, #-16]!
+  stp   d2, d3,   [sp, #-16]!
+  stp   d0, d1,   [sp, #-16]!
+  stp   x6, x7,   [sp, #-16]!
+  stp   x4, x5,   [sp, #-16]!
+  stp   x2, x3,   [sp, #-16]!
+  stp   x0, x1,   [sp, #-16]!
+
+  bl  artFindNativeMethod
+  mov  x17, x0    // store result in scratch reg.
+
+  // load spill regs.
+  ldp   x0, x1,   [sp], #16
+  ldp   x2, x3,   [sp], #16
+  ldp   x4, x5,   [sp], #16
+  ldp   x6, x7,   [sp], #16
+  ldp   d0, d1,   [sp], #16
+  ldp   d2, d3,   [sp], #16
+  ldp   d4, d5,   [sp], #16
+  ldp   d6, d7,   [sp], #16
+  ldp   x29, x30, [sp], #16
+
+  cbz   x17, 1f   // is method code null ?
+  br    x17       // if non-null, tail call to method's code.
+
+1:
+  ret             // restore regs and return to caller to handle exception.
+END art_jni_dlsym_lookup_stub
 
     /*
      * Entry point of native methods when JNI bug compatibility is enabled.
      */
     .extern artWorkAroundAppJniBugs
-UNIMPLEMENTED art_work_around_app_jni_bugs
+ENTRY art_work_around_app_jni_bugs
+  // spill regs.
+  stp   x29, x30, [sp, #-16]!
+  mov   x29, sp
+  stp   d6, d7,   [sp, #-16]!
+  stp   d4, d5,   [sp, #-16]!
+  stp   d2, d3,   [sp, #-16]!
+  stp   d0, d1,   [sp, #-16]!
+  stp   x6, x7,   [sp, #-16]!
+  stp   x4, x5,   [sp, #-16]!
+  stp   x2, x3,   [sp, #-16]!
+  stp   x0, x1,   [sp, #-16]!
+
+  mov   x0, x19   // Thread::Current.
+  mov   x1, sp    // SP.
+  bl    artWorkAroundAppJniBugs   // (Thread*, SP).
+  mov   x17, x0   // save target return.
+
+  // load spill regs.
+  ldp   x0, x1,   [sp], #16
+  ldp   x2, x3,   [sp], #16
+  ldp   x4, x5,   [sp], #16
+  ldp   x6, x7,   [sp], #16
+  ldp   d0, d1,   [sp], #16
+  ldp   d2, d3,   [sp], #16
+  ldp   d4, d5,   [sp], #16
+  ldp   d6, d7,   [sp], #16
+  ldp   x29, x30, [sp], #16
+
+  // Tail call into JNI routine.
+  br    x17
+END art_work_around_app_jni_bugs
 
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index dd34583..6ce5d06 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -332,9 +332,14 @@
 END \c_name
 .endm
 
+// FIXME: Temporary fix for TR(XSELF).
 .macro ONE_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context.
+    mov x1, x19                       // pass Thread::Current.
+    mov x2, sp                        // pass SP.
+    b   \cxx_name                     // \cxx_name(Thread*, SP).
     brk 0
 END \c_name
 .endm
diff --git a/runtime/arch/arm64/registers_arm64.h b/runtime/arch/arm64/registers_arm64.h
index 43c0ad6..2503918 100644
--- a/runtime/arch/arm64/registers_arm64.h
+++ b/runtime/arch/arm64/registers_arm64.h
@@ -56,7 +56,8 @@
   X29 = 29,
   X30 = 30,
   X31 = 31,
-  TR  = 18,     // ART Thread Register.
+  TR  = 18,     // ART Thread Register - Needs to be one of the callee saved regs.
+  TR1 = 19,     // FIXME!
   IP0 = 16,     // Used as scratch by VIXL.
   IP1 = 17,     // Used as scratch by ART JNI Assembler.
   FP  = 29,
diff --git a/runtime/instruction_set.h b/runtime/instruction_set.h
index f4eecfc..a08becf 100644
--- a/runtime/instruction_set.h
+++ b/runtime/instruction_set.h
@@ -81,6 +81,10 @@
     return mask_ != peer.mask_;
   }
 
+  bool operator<=(const InstructionSetFeatures &peer) const {
+    return (mask_ & peer.mask_) == mask_;
+  }
+
  private:
   uint32_t mask_;
 };