Quick Compiler: fix Arm cts failures

Fixes move_wide_16#testN1, move_wide_16#testN2

Two bugs for the price of one (thanks CTS!)

First, the new stack overflow checking code was broken for very
large frames.  For Arm on method entry, we only have 1 available
temp register, r12, until argument registers are flushed.
Previously, for explicit checks on large frames,
r12 was immediately loaded with the stack_end value.  However,
later on when the frame is extended, if the frame size exceeds
the range of a reg-reg-imm subtract, the codegen utilities will
allocate a new temporary register to complete the operation. r12
was getting clobbered.

What we should always do when directly using fixed registers like
this is to lock them to prevent them from being allocated as a
temp.  The other half of the first bug is easily solved by delaying
the load of stack_end until after the new sp is computed.  We'll
increase the stall cost, but this is an uncommon case.

The second bug was likely a typo in LoadValueDisp().  I'm a bit
surprised we hadn't hit this one earlier - but perhaps it was
recently introduced.  The wrong base register was being used in
the non-float, wide, excessive offset case (which I suppose is also
somewhat uncommon).

Change-Id: I2c5074c9570b022af680f472deac9fe72a2e827e
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index d3477c9..4850c3e 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -356,10 +356,14 @@
                             (static_cast<size_t>(frame_size_) <
                             Thread::kStackOverflowReservedBytes));
   NewLIR0(kPseudoMethodEntry);
+  bool large_frame = (static_cast<size_t>(frame_size_) > Thread::kStackOverflowReservedUsableBytes);
   if (!skip_overflow_check) {
     if (Runtime::Current()->ExplicitStackOverflowChecks()) {
-      /* Load stack limit */
-      Load32Disp(rs_rARM_SELF, Thread::StackEndOffset<4>().Int32Value(), rs_r12);
+      if (!large_frame) {
+        /* Load stack limit */
+        LockTemp(rs_r12);
+        Load32Disp(rs_rARM_SELF, Thread::StackEndOffset<4>().Int32Value(), rs_r12);
+      }
     } else {
       // Implicit stack overflow check.
       // Generate a load from [sp, #-overflowsize].  If this is in the stack
@@ -420,8 +424,10 @@
         const bool restore_lr_;
         const size_t sp_displace_;
       };
-      if (static_cast<size_t>(frame_size_) > Thread::kStackOverflowReservedUsableBytes) {
+      if (large_frame) {
+        // Note: may need a temp reg, and we only have r12 free at this point.
         OpRegRegImm(kOpSub, rs_rARM_LR, rs_rARM_SP, frame_size_without_spills);
+        Load32Disp(rs_rARM_SELF, Thread::StackEndOffset<4>().Int32Value(), rs_r12);
         LIR* branch = OpCmpBranch(kCondUlt, rs_rARM_LR, rs_r12, nullptr);
         // Need to restore LR since we used it as a temp.
         AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, true, spill_size));
@@ -448,6 +454,7 @@
   FreeTemp(rs_r1);
   FreeTemp(rs_r2);
   FreeTemp(rs_r3);
+  FreeTemp(rs_r12);
 }
 
 void ArmMir2Lir::GenExitSequence() {
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index fe18ed9..b0211d6 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -850,7 +850,7 @@
         DCHECK(!r_dest.IsPair());
         load = NewLIR3(kThumb2Vldrd, r_dest.GetReg(), r_ptr.GetReg(), encoded_disp);
       } else {
-        load = NewLIR4(kThumb2LdrdI8, r_dest.GetLowReg(), r_dest.GetHighReg(), r_base.GetReg(),
+        load = NewLIR4(kThumb2LdrdI8, r_dest.GetLowReg(), r_dest.GetHighReg(), r_ptr.GetReg(),
                        encoded_disp);
       }
       if ((displacement & ~1020) != 0 && !r_dest.IsFloat()) {