Optimize double/float immediate loading on arm.

Also reserve a D register (D31) as a scratch temporary, replacing the push/pop of D0/D1 in swaps.

Change-Id: I6584d9005b0f5685c3afcd8e9153b4c87b56aa8e
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index e5de2ab..2a79f82 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -55,6 +55,10 @@
 static constexpr SRegister kFpuCalleeSaves[] =
     { S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31 };
 
+// Scratch D register for swaps and stack-to-stack moves. D31 cannot be split into two S
+// registers, and the register allocator only works on S registers, so it need not be blocked.
+static constexpr DRegister DTMP = D31;
+
 class InvokeRuntimeCallingConvention : public CallingConvention<Register, SRegister> {
  public:
   InvokeRuntimeCallingConvention()
@@ -3361,10 +3365,8 @@
     }
   } else if (source.IsDoubleStackSlot()) {
     if (destination.IsDoubleStackSlot()) {
-      __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
-      __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
-      __ LoadFromOffset(kLoadWord, IP, SP, source.GetHighStackIndex(kArmWordSize));
-      __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
+      __ LoadDFromOffset(DTMP, SP, source.GetStackIndex());
+      __ StoreDToOffset(DTMP, SP, destination.GetStackIndex());
     } else if (destination.IsRegisterPair()) {
       DCHECK(ExpectedPairLayout(destination));
       __ LoadFromOffset(
@@ -3484,16 +3486,13 @@
     __ vmovs(source.AsFpuRegister<SRegister>(), destination.AsFpuRegister<SRegister>());
     __ vmovsr(destination.AsFpuRegister<SRegister>(), IP);
   } else if (source.IsRegisterPair() && destination.IsRegisterPair()) {
-    __ Mov(IP, source.AsRegisterPairLow<Register>());
+    __ vmovdrr(DTMP, source.AsRegisterPairLow<Register>(), source.AsRegisterPairHigh<Register>());
     __ Mov(source.AsRegisterPairLow<Register>(), destination.AsRegisterPairLow<Register>());
-    __ Mov(destination.AsRegisterPairLow<Register>(), IP);
-    __ Mov(IP, source.AsRegisterPairHigh<Register>());
     __ Mov(source.AsRegisterPairHigh<Register>(), destination.AsRegisterPairHigh<Register>());
-    __ Mov(destination.AsRegisterPairHigh<Register>(), IP);
+    __ vmovrrd(destination.AsRegisterPairLow<Register>(),
+               destination.AsRegisterPairHigh<Register>(),
+               DTMP);
   } else if (source.IsRegisterPair() || destination.IsRegisterPair()) {
-    // TODO: Find a D register available in the parallel moves,
-    // or reserve globally a D register.
-    DRegister tmp = D0;
     Register low_reg = source.IsRegisterPair()
         ? source.AsRegisterPairLow<Register>()
         : destination.AsRegisterPairLow<Register>();
@@ -3501,27 +3500,15 @@
         ? destination.GetStackIndex()
         : source.GetStackIndex();
     DCHECK(ExpectedPairLayout(source.IsRegisterPair() ? source : destination));
-    // Make room for the pushed DRegister.
-    mem += 8;
-    __ vpushd(tmp, 1);
-    __ vmovdrr(tmp, low_reg, static_cast<Register>(low_reg + 1));
+    __ vmovdrr(DTMP, low_reg, static_cast<Register>(low_reg + 1));
     __ LoadFromOffset(kLoadWordPair, low_reg, SP, mem);
-    __ StoreDToOffset(tmp, SP, mem);
-    __ vpopd(tmp, 1);
+    __ StoreDToOffset(DTMP, SP, mem);
   } else if (source.IsFpuRegisterPair() && destination.IsFpuRegisterPair()) {
-    // TODO: Find a D register available in the parallel moves,
-    // or reserve globally a D register.
-    DRegister tmp = D0;
     DRegister first = FromLowSToD(source.AsFpuRegisterPairLow<SRegister>());
     DRegister second = FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>());
-    while (tmp == first || tmp == second) {
-      tmp = static_cast<DRegister>(tmp + 1);
-    }
-    __ vpushd(tmp, 1);
-    __ vmovd(tmp, first);
+    __ vmovd(DTMP, first);
     __ vmovd(first, second);
-    __ vmovd(second, tmp);
-    __ vpopd(tmp, 1);
+    __ vmovd(second, DTMP);
   } else if (source.IsFpuRegisterPair() || destination.IsFpuRegisterPair()) {
     DRegister reg = source.IsFpuRegisterPair()
         ? FromLowSToD(source.AsFpuRegisterPairLow<SRegister>())
@@ -3529,15 +3516,9 @@
     int mem = source.IsFpuRegisterPair()
         ? destination.GetStackIndex()
         : source.GetStackIndex();
-    // TODO: Find or reserve a D register.
-    DRegister tmp = reg == D0 ? D1 : D0;
-    // Make room for the pushed DRegister.
-    mem += 8;
-    __ vpushd(tmp, 1);
-    __ vmovd(tmp, reg);
+    __ vmovd(DTMP, reg);
     __ LoadDFromOffset(reg, SP, mem);
-    __ StoreDToOffset(tmp, SP, mem);
-    __ vpopd(tmp, 1);
+    __ StoreDToOffset(DTMP, SP, mem);
   } else if (source.IsFpuRegister() || destination.IsFpuRegister()) {
     SRegister reg = source.IsFpuRegister() ? source.AsFpuRegister<SRegister>()
                                            : destination.AsFpuRegister<SRegister>();
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index 0d84ba7..8730f52 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -536,18 +536,44 @@
   virtual void LoadImmediate(Register rd, int32_t value, Condition cond = AL) = 0;
   void LoadSImmediate(SRegister sd, float value, Condition cond = AL) {
     if (!vmovs(sd, value, cond)) {
-      LoadImmediate(IP, bit_cast<int32_t, float>(value), cond);
-      vmovsr(sd, IP, cond);
+      int32_t int_value = bit_cast<int32_t, float>(value);
+      if (int_value == bit_cast<int32_t, float>(0.0f)) {
+        // 0.0 (positive zero) is quite common, so special-case it: load the
+        // immediate 2.0 into `sd`, then subtract `sd` from itself.
+        bool success = vmovs(sd, 2.0, cond);
+        CHECK(success);
+        vsubs(sd, sd, sd, cond);
+      } else {
+        LoadImmediate(IP, int_value, cond);
+        vmovsr(sd, IP, cond);
+      }
     }
   }
 
   void LoadDImmediate(DRegister sd, double value, Condition cond = AL) {
     if (!vmovd(sd, value, cond)) {
       uint64_t int_value = bit_cast<uint64_t, double>(value);
-      LoadSImmediate(
-          static_cast<SRegister>(sd << 1), bit_cast<float, uint32_t>(Low32Bits(int_value)));
-      LoadSImmediate(
-          static_cast<SRegister>((sd << 1) + 1), bit_cast<float, uint32_t>(High32Bits(int_value)));
+      if (int_value == bit_cast<uint64_t, double>(0.0)) {
+        // 0.0 (positive zero) is quite common, so special-case it: load the
+        // immediate 2.0 into `sd`, then subtract `sd` from itself.
+        bool success = vmovd(sd, 2.0, cond);
+        CHECK(success);
+        vsubd(sd, sd, sd, cond);
+      } else {
+        if (sd < 16) {  // The two halves are addressed as S(2*sd) and S(2*sd+1).
+          SRegister low = static_cast<SRegister>(sd << 1);
+          SRegister high = static_cast<SRegister>(low + 1);
+          LoadSImmediate(low, bit_cast<float, uint32_t>(Low32Bits(int_value)), cond);
+          if (High32Bits(int_value) == Low32Bits(int_value)) {
+            vmovs(high, low, cond);  // Identical halves: copy, honoring `cond`.
+          } else {
+            LoadSImmediate(high, bit_cast<float, uint32_t>(High32Bits(int_value)), cond);
+          }
+        } else {
+          LOG(FATAL) << "Unimplemented loading of double into a D register "
+                     << "that cannot be split into two S registers";
+        }
+      }
     }
   }