JIT: Performance fix for const doubles

Some recent Arm processors take a performance hit when
creating a floating point double by loading it as a pair of singles.
Legacy code to support soft floating point doubles as a pair of core
registers loaded double immediates in this way.

With this CL, we handle double immediates as a single unit.

(cherry-pick of c8129911e598ad0ca8d7b31012444ab6ce8bce45.)

Change-Id: Ic1512e34bfd233a6f5ffd58ce843965adbbad875
diff --git a/vm/compiler/codegen/arm/Assemble.cpp b/vm/compiler/codegen/arm/Assemble.cpp
index d1ecd97..7406d3e 100644
--- a/vm/compiler/codegen/arm/Assemble.cpp
+++ b/vm/compiler/codegen/arm/Assemble.cpp
@@ -952,6 +952,7 @@
         if (lir->opcode == kThumbLdrPcRel ||
             lir->opcode == kThumb2LdrPcRel12 ||
             lir->opcode == kThumbAddPcRel ||
+            ((lir->opcode == kThumb2Vldrd) && (lir->operands[1] == r15pc)) ||
             ((lir->opcode == kThumb2Vldrs) && (lir->operands[1] == r15pc))) {
             ArmLIR *lirTarget = (ArmLIR *) lir->generic.target;
             intptr_t pc = (lir->generic.offset + 4) & ~3;
@@ -976,7 +977,7 @@
                 }
                 return kRetryHalve;
             }
-            if (lir->opcode == kThumb2Vldrs) {
+            if ((lir->opcode == kThumb2Vldrs) || (lir->opcode == kThumb2Vldrd)) {
                 lir->operands[2] = delta >> 2;
             } else {
                 lir->operands[1] = (lir->opcode == kThumb2LdrPcRel12) ?
diff --git a/vm/compiler/codegen/arm/CodegenCommon.cpp b/vm/compiler/codegen/arm/CodegenCommon.cpp
index 07f3ac7..5c02678 100644
--- a/vm/compiler/codegen/arm/CodegenCommon.cpp
+++ b/vm/compiler/codegen/arm/CodegenCommon.cpp
@@ -368,6 +368,25 @@
     return NULL;
 }
 
+/* Search the existing constants in the literal pool for an exact wide match */
+ArmLIR* scanLiteralPoolWide(LIR* dataTarget, int valLo, int valHi)
+{
+  bool lowMatch = false;
+  ArmLIR* lowTarget = NULL;
+  while (dataTarget) {
+    if (lowMatch && (((ArmLIR *)dataTarget)->operands[0] == valHi)) {
+      return lowTarget;
+    }
+    lowMatch = false;
+    if (((ArmLIR *) dataTarget)->operands[0] == valLo) {
+      lowMatch = true;
+      lowTarget = (ArmLIR *) dataTarget;
+    }
+    dataTarget = dataTarget->next;
+  }
+  return NULL;
+}
+
 /*
  * The following are building blocks to insert constants into the pool or
  * instruction streams.
@@ -392,6 +411,14 @@
     return NULL;
 }
 
+/* Add a 64-bit constant to the literal pool or mixed with code */
+ArmLIR* addWideData(CompilationUnit* cUnit, LIR** constantListP,
+                 int valLo, int valHi)
+{
+    addWordData(cUnit, constantListP, valHi);
+    return addWordData(cUnit, constantListP, valLo);
+}
+
 static RegLocation inlinedTargetWide(CompilationUnit *cUnit, MIR *mir,
                                       bool fpHint)
 {
diff --git a/vm/compiler/codegen/arm/Thumb2/Factory.cpp b/vm/compiler/codegen/arm/Thumb2/Factory.cpp
index 9c9ce13..c3c3712 100644
--- a/vm/compiler/codegen/arm/Thumb2/Factory.cpp
+++ b/vm/compiler/codegen/arm/Thumb2/Factory.cpp
@@ -53,7 +53,14 @@
 {
     int encodedImm = encodeImmSingle(value);
     assert(SINGLEREG(rDest));
-    if (encodedImm >= 0) {
+    if (value == 0) {
+      // TODO: we need better info about the target CPU.  a vector exclusive or
+      //       would probably be better here if we could rely on its existence.
+      // Load an immediate +2.0 (which encodes to 0)
+      newLIR2(cUnit, kThumb2Vmovs_IMM8, rDest, 0);
+      // +0.0 = +2.0 - +2.0
+      return newLIR3(cUnit, kThumb2Vsubs, rDest, rDest, rDest);
+    } else if (encodedImm >= 0) {
         return newLIR2(cUnit, kThumb2Vmovs_IMM8, rDest, encodedImm);
     }
     ArmLIR *dataTarget = scanLiteralPool(cUnit->literalList, value, 0);
@@ -696,9 +703,34 @@
 {
     int encodedImm = encodeImmDouble(valLo, valHi);
     ArmLIR *res;
-    if (FPREG(rDestLo) && (encodedImm >= 0)) {
-        res = newLIR2(cUnit, kThumb2Vmovd_IMM8, S2D(rDestLo, rDestHi),
-                      encodedImm);
+    int targetReg = S2D(rDestLo, rDestHi);
+    if (FPREG(rDestLo)) {
+        if ((valLo == 0) && (valHi == 0)) {
+          // TODO: we need better info about the target CPU.  a vector
+          // exclusive or would probably be better here if we could rely on
+          // its existence.
+          // Load an immediate +2.0 (which encodes to 0)
+          newLIR2(cUnit, kThumb2Vmovd_IMM8, targetReg, 0);
+          // +0.0 = +2.0 - +2.0
+          res = newLIR3(cUnit, kThumb2Vsubd, targetReg, targetReg, targetReg);
+        } else if (encodedImm >= 0) {
+            res = newLIR2(cUnit, kThumb2Vmovd_IMM8, targetReg, encodedImm);
+        } else {
+            ArmLIR* dataTarget = scanLiteralPoolWide(cUnit->literalList, valLo, valHi);
+            if (dataTarget == NULL) {
+                dataTarget = addWideData(cUnit, &cUnit->literalList, valLo, valHi);
+            }
+            ArmLIR *loadPcRel = (ArmLIR *) dvmCompilerNew(sizeof(ArmLIR), true);
+            loadPcRel->opcode = kThumb2Vldrd;
+            loadPcRel->generic.target = (LIR *) dataTarget;
+            loadPcRel->operands[0] = targetReg;
+            loadPcRel->operands[1] = r15pc;
+            setupResourceMasks(loadPcRel);
+            setMemRefType(loadPcRel, true, kLiteral);
+            loadPcRel->aliasInfo = dataTarget->operands[0];
+            dvmCompilerAppendLIR(cUnit, (LIR *) loadPcRel);
+            res =  loadPcRel;
+        }
     } else {
         res = loadConstantNoClobber(cUnit, rDestLo, valLo);
         loadConstantNoClobber(cUnit, rDestHi, valHi);