Quick Compiler: fix Arm CTS failures

Fixes move_wide_16#testN1, move_wide_16#testN2

Two bugs for the price of one (thanks CTS!)

First, the new stack overflow checking code was broken for very
large frames.  For Arm on method entry, we only have one available
temp register, r12, until argument registers are flushed.
Previously, for explicit checks on large frames, r12 was immediately
loaded with the stack_end value.  However, later on when the frame
is extended, if the frame size exceeds the range of a reg-reg-imm
subtract, the codegen utilities will allocate a new temporary
register to complete the operation, so r12 was getting clobbered.
Similarly, for medium-large frames r12 could get clobbered during
frame creation.

When directly using fixed registers like this, we should always
lock them to prevent them from being allocated as temps.  The other
half of the first bug is easily solved by delaying the load of
stack_end until after the new sp is computed.  This increases the
stall cost, but it is an uncommon case.

The second bug was likely a typo in LoadValueDisp().  I'm a bit
surprised we hadn't hit this one earlier - but perhaps it was
recently introduced.  The wrong base register was being used in
the non-float, wide, excessive offset case (which I suppose is also
somewhat uncommon).

Cherry-pick of internal commit If5b30f729e31d86db604045dd7581fd4626e0b55

Change-Id: If5b30f729e31d86db604045dd7581fd4626e0b55
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index 435242a..bb84fb7 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -356,10 +356,14 @@
                             (static_cast<size_t>(frame_size_) <
                             Thread::kStackOverflowReservedBytes));
   NewLIR0(kPseudoMethodEntry);
+  bool large_frame = (static_cast<size_t>(frame_size_) > Thread::kStackOverflowReservedUsableBytes);
   if (!skip_overflow_check) {
     if (Runtime::Current()->ExplicitStackOverflowChecks()) {
-      /* Load stack limit */
-      Load32Disp(rs_rARM_SELF, Thread::StackEndOffset<4>().Int32Value(), rs_r12);
+      if (!large_frame) {
+        /* Load stack limit */
+        LockTemp(rs_r12);
+        Load32Disp(rs_rARM_SELF, Thread::StackEndOffset<4>().Int32Value(), rs_r12);
+      }
     } else {
       // Implicit stack overflow check.
       // Generate a load from [sp, #-overflowsize].  If this is in the stack
@@ -420,16 +424,26 @@
         const bool restore_lr_;
         const size_t sp_displace_;
       };
-      if (static_cast<size_t>(frame_size_) > Thread::kStackOverflowReservedUsableBytes) {
+      if (large_frame) {
+        // Note: may need a temp reg, and we only have r12 free at this point.
         OpRegRegImm(kOpSub, rs_rARM_LR, rs_rARM_SP, frame_size_without_spills);
+        Load32Disp(rs_rARM_SELF, Thread::StackEndOffset<4>().Int32Value(), rs_r12);
         LIR* branch = OpCmpBranch(kCondUlt, rs_rARM_LR, rs_r12, nullptr);
         // Need to restore LR since we used it as a temp.
         AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, true, spill_size));
         OpRegCopy(rs_rARM_SP, rs_rARM_LR);     // Establish stack
       } else {
-        // If the frame is small enough we are guaranteed to have enough space that remains to
-        // handle signals on the user stack.
+        /*
+         * If the frame is small enough we are guaranteed to have enough space that remains to
+         * handle signals on the user stack.  However, we may not have any free temp
+         * registers at this point, so we'll temporarily add LR to the temp pool.
+         */
+        DCHECK(!GetRegInfo(rs_rARM_LR)->IsTemp());
+        MarkTemp(rs_rARM_LR);
+        FreeTemp(rs_rARM_LR);
         OpRegRegImm(kOpSub, rs_rARM_SP, rs_rARM_SP, frame_size_without_spills);
+        Clobber(rs_rARM_LR);
+        UnmarkTemp(rs_rARM_LR);
         LIR* branch = OpCmpBranch(kCondUlt, rs_rARM_SP, rs_r12, nullptr);
         AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, false, frame_size_));
       }
@@ -448,6 +462,7 @@
   FreeTemp(rs_r1);
   FreeTemp(rs_r2);
   FreeTemp(rs_r3);
+  FreeTemp(rs_r12);
 }
 
 void ArmMir2Lir::GenExitSequence() {
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index fe18ed9..b0211d6 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -850,7 +850,7 @@
         DCHECK(!r_dest.IsPair());
         load = NewLIR3(kThumb2Vldrd, r_dest.GetReg(), r_ptr.GetReg(), encoded_disp);
       } else {
-        load = NewLIR4(kThumb2LdrdI8, r_dest.GetLowReg(), r_dest.GetHighReg(), r_base.GetReg(),
+        load = NewLIR4(kThumb2LdrdI8, r_dest.GetLowReg(), r_dest.GetHighReg(), r_ptr.GetReg(),
                        encoded_disp);
       }
       if ((displacement & ~1020) != 0 && !r_dest.IsFloat()) {