ARM: Use 64-bit literals for LoadDImmediate().

And rewrite the medium-range long/fp literal to use
MOVW+ADD+LDRD/VLDR because the old instruction sequence
was broken if the "ADD ip, pc" was not 4-byte aligned.

Test: assembler_thumb2_test has been updated. Standard
      ART test suite has been run on host and Nexus 5.
Change-Id: I37c6a62aa6e77c6a9701b5a1fb4db2e666c1eae9
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index a571d14..9cf72a2 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -754,32 +754,7 @@
     }
   }
 
-  void LoadDImmediate(DRegister sd, double value, Condition cond = AL) {
-    if (!vmovd(sd, value, cond)) {
-      uint64_t int_value = bit_cast<uint64_t, double>(value);
-      if (int_value == bit_cast<uint64_t, double>(0.0)) {
-        // 0.0 is quite common, so we special case it by loading
-        // 2.0 in `sd` and then substracting it.
-        bool success = vmovd(sd, 2.0, cond);
-        CHECK(success);
-        vsubd(sd, sd, sd, cond);
-      } else {
-        if (sd < 16) {
-          SRegister low = static_cast<SRegister>(sd << 1);
-          SRegister high = static_cast<SRegister>(low + 1);
-          LoadSImmediate(low, bit_cast<float, uint32_t>(Low32Bits(int_value)), cond);
-          if (High32Bits(int_value) == Low32Bits(int_value)) {
-            vmovs(high, low);
-          } else {
-            LoadSImmediate(high, bit_cast<float, uint32_t>(High32Bits(int_value)), cond);
-          }
-        } else {
-          LOG(FATAL) << "Unimplemented loading of double into a D register "
-                     << "that cannot be split into two S registers";
-        }
-      }
-    }
-  }
+  virtual void LoadDImmediate(DRegister dd, double value, Condition cond = AL) = 0;
 
   virtual void MarkExceptionHandler(Label* label) = 0;
   virtual void LoadFromOffset(LoadOperandType type,
diff --git a/compiler/utils/arm/assembler_arm32.cc b/compiler/utils/arm/assembler_arm32.cc
index 6f7119d..c95dfa8 100644
--- a/compiler/utils/arm/assembler_arm32.cc
+++ b/compiler/utils/arm/assembler_arm32.cc
@@ -1486,6 +1486,34 @@
   }
 }
 
+void Arm32Assembler::LoadDImmediate(DRegister dd, double value, Condition cond) {
+  if (!vmovd(dd, value, cond)) {
+    uint64_t int_value = bit_cast<uint64_t, double>(value);
+    if (int_value == bit_cast<uint64_t, double>(0.0)) {
+      // 0.0 is quite common, so we special case it by loading
+      // 2.0 in `dd` and then subtracting it.
+      bool success = vmovd(dd, 2.0, cond);
+      CHECK(success);
+      vsubd(dd, dd, dd, cond);
+    } else {
+      if (dd < 16) {
+        // Note: Depending on the particular CPU, this may cause register
+        // forwarding hazard, negatively impacting the performance.
+        SRegister low = static_cast<SRegister>(dd << 1);
+        SRegister high = static_cast<SRegister>(low + 1);
+        LoadSImmediate(low, bit_cast<float, uint32_t>(Low32Bits(int_value)), cond);
+        if (High32Bits(int_value) == Low32Bits(int_value)) {
+          vmovs(high, low);
+        } else {
+          LoadSImmediate(high, bit_cast<float, uint32_t>(High32Bits(int_value)), cond);
+        }
+      } else {
+        LOG(FATAL) << "Unimplemented loading of double into a D register "
+                   << "that cannot be split into two S registers";
+      }
+    }
+  }
+}
 
 // Implementation note: this method must emit at most one instruction when
 // Address::CanHoldLoadOffsetArm.
diff --git a/compiler/utils/arm/assembler_arm32.h b/compiler/utils/arm/assembler_arm32.h
index 8726ac8..554dd23 100644
--- a/compiler/utils/arm/assembler_arm32.h
+++ b/compiler/utils/arm/assembler_arm32.h
@@ -270,6 +270,7 @@
 
   // Load and Store. May clobber IP.
   void LoadImmediate(Register rd, int32_t value, Condition cond = AL) OVERRIDE;
+  void LoadDImmediate(DRegister dd, double value, Condition cond = AL) OVERRIDE;
   void MarkExceptionHandler(Label* label) OVERRIDE;
   void LoadFromOffset(LoadOperandType type,
                       Register reg,
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index a72ea41..8747dad 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -1917,7 +1917,7 @@
 
     case kLongOrFPLiteral1KiB:
       return 4u;
-    case kLongOrFPLiteral256KiB:
+    case kLongOrFPLiteral64KiB:
       return 10u;
     case kLongOrFPLiteralFar:
       return 14u;
@@ -1989,7 +1989,7 @@
       break;
     case kLiteral1MiB:
     case kLiteral64KiB:
-    case kLongOrFPLiteral256KiB:
+    case kLongOrFPLiteral64KiB:
     case kLiteralAddr64KiB:
       DCHECK_GE(diff, 4);  // The target must be at least 4 bytes after the ADD rX, PC.
       diff -= 4;        // One extra 32-bit MOV.
@@ -2105,10 +2105,10 @@
       if (IsUint<10>(GetOffset(current_code_size))) {
         break;
       }
-      current_code_size += IncreaseSize(kLongOrFPLiteral256KiB);
+      current_code_size += IncreaseSize(kLongOrFPLiteral64KiB);
       FALLTHROUGH_INTENDED;
-    case kLongOrFPLiteral256KiB:
-      if (IsUint<18>(GetOffset(current_code_size))) {
+    case kLongOrFPLiteral64KiB:
+      if (IsUint<16>(GetOffset(current_code_size))) {
         break;
       }
       current_code_size += IncreaseSize(kLongOrFPLiteralFar);
@@ -2269,11 +2269,10 @@
       buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
-    case kLongOrFPLiteral256KiB: {
-      int32_t offset = GetOffset(code_size);
-      int32_t mov_encoding = MovModImmEncoding32(IP, offset & ~0x3ff);
+    case kLongOrFPLiteral64KiB: {
+      int32_t mov_encoding = MovwEncoding32(IP, GetOffset(code_size));
       int16_t add_pc_encoding = AddRdnRmEncoding16(IP, PC);
-      int32_t ldr_encoding = LoadWideOrFpEncoding(IP, offset & 0x3ff);    // DCHECKs type_.
+      int32_t ldr_encoding = LoadWideOrFpEncoding(IP, 0u);    // DCHECKs type_.
       buffer->Store<int16_t>(location_, mov_encoding >> 16);
       buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
       buffer->Store<int16_t>(location_ + 4u, add_pc_encoding);
@@ -3598,6 +3597,24 @@
   }
 }
 
+void Thumb2Assembler::LoadDImmediate(DRegister dd, double value, Condition cond) {
+  if (!vmovd(dd, value, cond)) {
+    uint64_t int_value = bit_cast<uint64_t, double>(value);
+    if (int_value == bit_cast<uint64_t, double>(0.0)) {
+      // 0.0 is quite common, so we special case it by loading
+      // 2.0 in `dd` and then subtracting it.
+      bool success = vmovd(dd, 2.0, cond);
+      CHECK(success);
+      vsubd(dd, dd, dd, cond);
+    } else {
+      Literal* literal = literal64_dedupe_map_.GetOrCreate(
+          int_value,
+          [this, int_value]() { return NewLiteral<uint64_t>(int_value); });
+      LoadLiteral(dd, literal);
+    }
+  }
+}
+
 int32_t Thumb2Assembler::GetAllowedLoadOffsetBits(LoadOperandType type) {
   switch (type) {
     case kLoadSignedByte:
diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h
index 2ca74fc..4ee23c0 100644
--- a/compiler/utils/arm/assembler_thumb2.h
+++ b/compiler/utils/arm/assembler_thumb2.h
@@ -43,6 +43,7 @@
         fixups_(arena->Adapter(kArenaAllocAssembler)),
         fixup_dependents_(arena->Adapter(kArenaAllocAssembler)),
         literals_(arena->Adapter(kArenaAllocAssembler)),
+        literal64_dedupe_map_(std::less<uint64_t>(), arena->Adapter(kArenaAllocAssembler)),
         jump_tables_(arena->Adapter(kArenaAllocAssembler)),
         last_position_adjustment_(0u),
         last_old_position_(0u),
@@ -319,6 +320,7 @@
 
   // Load and Store. May clobber IP.
   void LoadImmediate(Register rd, int32_t value, Condition cond = AL) OVERRIDE;
+  void LoadDImmediate(DRegister dd, double value, Condition cond = AL) OVERRIDE;
   void MarkExceptionHandler(Label* label) OVERRIDE;
   void LoadFromOffset(LoadOperandType type,
                       Register reg,
@@ -464,8 +466,8 @@
       // Load long or FP literal variants.
       // VLDR s/dX, label; 32-bit insn, up to 1KiB offset; 4 bytes.
       kLongOrFPLiteral1KiB,
-      // MOV ip, modimm + ADD ip, pc + VLDR s/dX, [IP, #imm8*4]; up to 256KiB offset; 10 bytes.
-      kLongOrFPLiteral256KiB,
+      // MOV ip, imm16 + ADD ip, pc + VLDR s/dX, [IP, #0]; up to 64KiB offset; 10 bytes.
+      kLongOrFPLiteral64KiB,
       // MOV ip, imm16 + MOVT ip, imm16 + ADD ip, pc + VLDR s/dX, [IP]; any offset; 14 bytes.
       kLongOrFPLiteralFar,
     };
@@ -500,7 +502,7 @@
     // Load wide literal.
     static Fixup LoadWideLiteral(uint32_t location, Register rt, Register rt2,
                                  Size size = kLongOrFPLiteral1KiB) {
-      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral256KiB ||
+      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral64KiB ||
              size == kLongOrFPLiteralFar);
       DCHECK(!IsHighRegister(rt) || (size != kLiteral1KiB && size != kLiteral64KiB));
       return Fixup(rt, rt2, kNoSRegister, kNoDRegister,
@@ -510,7 +512,7 @@
     // Load FP single literal.
     static Fixup LoadSingleLiteral(uint32_t location, SRegister sd,
                                    Size size = kLongOrFPLiteral1KiB) {
-      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral256KiB ||
+      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral64KiB ||
              size == kLongOrFPLiteralFar);
       return Fixup(kNoRegister, kNoRegister, sd, kNoDRegister,
                    AL, kLoadFPLiteralSingle, size, location);
@@ -519,7 +521,7 @@
     // Load FP double literal.
     static Fixup LoadDoubleLiteral(uint32_t location, DRegister dd,
                                    Size size = kLongOrFPLiteral1KiB) {
-      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral256KiB ||
+      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral64KiB ||
              size == kLongOrFPLiteralFar);
       return Fixup(kNoRegister, kNoRegister, kNoSRegister, dd,
                    AL, kLoadFPLiteralDouble, size, location);
@@ -870,6 +872,9 @@
   // without invalidating pointers and references to existing elements.
   ArenaDeque<Literal> literals_;
 
+  // Deduplication map for 64-bit literals, used for LoadDImmediate().
+  ArenaSafeMap<uint64_t, Literal*> literal64_dedupe_map_;
+
   // Jump table list.
   ArenaDeque<JumpTable> jump_tables_;
 
diff --git a/compiler/utils/arm/assembler_thumb2_test.cc b/compiler/utils/arm/assembler_thumb2_test.cc
index 7f1dc49..f3fa72c 100644
--- a/compiler/utils/arm/assembler_thumb2_test.cc
+++ b/compiler/utils/arm/assembler_thumb2_test.cc
@@ -869,10 +869,11 @@
   }
 
   std::string expected =
-      "mov.w ip, #((2f - 1f - 4) & ~0x3ff)\n"
+      // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw.
+      "movw ip, #(0x408 - 0x4 - 4)\n"
       "1:\n"
       "add ip, pc\n"
-      "ldrd r1, r3, [ip, #((2f - 1b - 4) & 0x3ff)]\n" +
+      "ldrd r1, r3, [ip, #0]\n" +
       RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") +
       ".align 2, 0\n"
       "2:\n"
@@ -884,48 +885,78 @@
             __ GetAdjustedPosition(label.Position()));
 }
 
-TEST_F(AssemblerThumb2Test, LoadLiteralSingleMax256KiB) {
+TEST_F(AssemblerThumb2Test, LoadLiteralSingleMax64KiB) {
   // The literal size must match but the type doesn't, so use an int32_t rather than float.
   arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678);
   __ LoadLiteral(arm::S3, literal);
   Label label;
   __ Bind(&label);
-  constexpr size_t kLdrR0R0Count = (1 << 17) - 3u;
-  for (size_t i = 0; i != kLdrR0R0Count; ++i) {
-    __ ldr(arm::R0, arm::Address(arm::R0));
-  }
-
-  std::string expected =
-      "mov.w ip, #((2f - 1f - 4) & ~0x3ff)\n"
-      "1:\n"
-      "add ip, pc\n"
-      "vldr s3, [ip, #((2f - 1b - 4) & 0x3ff)]\n" +
-      RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") +
-      ".align 2, 0\n"
-      "2:\n"
-      ".word 0x12345678\n";
-  DriverStr(expected, "LoadLiteralSingleMax256KiB");
-
-  EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 6u,
-            __ GetAdjustedPosition(label.Position()));
-}
-
-TEST_F(AssemblerThumb2Test, LoadLiteralDoubleBeyondMax256KiB) {
-  // The literal size must match but the type doesn't, so use an int64_t rather than double.
-  arm::Literal* literal = __ NewLiteral<int64_t>(INT64_C(0x1234567887654321));
-  __ LoadLiteral(arm::D3, literal);
-  Label label;
-  __ Bind(&label);
-  constexpr size_t kLdrR0R0Count = (1 << 17) - 2u;
+  constexpr size_t kLdrR0R0Count = (1 << 15) - 3u;
   for (size_t i = 0; i != kLdrR0R0Count; ++i) {
     __ ldr(arm::R0, arm::Address(arm::R0));
   }
 
   std::string expected =
       // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw.
-      "movw ip, #(0x40000 & 0xffff)\n"
+      "movw ip, #(0x10004 - 0x4 - 4)\n"
+      "1:\n"
+      "add ip, pc\n"
+      "vldr s3, [ip, #0]\n" +
+      RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") +
+      ".align 2, 0\n"
+      "2:\n"
+      ".word 0x12345678\n";
+  DriverStr(expected, "LoadLiteralSingleMax64KiB");
+
+  EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 6u,
+            __ GetAdjustedPosition(label.Position()));
+}
+
+TEST_F(AssemblerThumb2Test, LoadLiteralSingleMax64KiB_UnalignedPC) {
+  // The literal size must match but the type doesn't, so use an int32_t rather than float.
+  arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678);
+  __ ldr(arm::R0, arm::Address(arm::R0));
+  __ LoadLiteral(arm::S3, literal);
+  Label label;
+  __ Bind(&label);
+  constexpr size_t kLdrR0R0Count = (1 << 15) - 4u;
+  for (size_t i = 0; i != kLdrR0R0Count; ++i) {
+    __ ldr(arm::R0, arm::Address(arm::R0));
+  }
+
+  std::string expected =
+      "ldr r0, [r0]\n"
+      // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw.
+      "movw ip, #(0x10004 - 0x6 - 4)\n"
+      "1:\n"
+      "add ip, pc\n"
+      "vldr s3, [ip, #0]\n" +
+      RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") +
+      ".align 2, 0\n"
+      "2:\n"
+      ".word 0x12345678\n";
+  DriverStr(expected, "LoadLiteralSingleMax64KiB_UnalignedPC");
+
+  EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 6u,
+            __ GetAdjustedPosition(label.Position()));
+}
+
+TEST_F(AssemblerThumb2Test, LoadLiteralDoubleBeyondMax64KiB) {
+  // The literal size must match but the type doesn't, so use an int64_t rather than double.
+  arm::Literal* literal = __ NewLiteral<int64_t>(INT64_C(0x1234567887654321));
+  __ LoadLiteral(arm::D3, literal);
+  Label label;
+  __ Bind(&label);
+  constexpr size_t kLdrR0R0Count = (1 << 15) - 2u;
+  for (size_t i = 0; i != kLdrR0R0Count; ++i) {
+    __ ldr(arm::R0, arm::Address(arm::R0));
+  }
+
+  std::string expected =
+      // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw.
+      "movw ip, #((0x1000c - 0x8 - 4) & 0xffff)\n"
       // "as" does not consider ((2f - 1f - 4) >> 16) a constant expression for movt.
-      "movt ip, #(0x40000 >> 16)\n"
+      "movt ip, #((0x1000c - 0x8 - 4) >> 16)\n"
       "1:\n"
       "add ip, pc\n"
       "vldr d3, [ip, #0]\n" +
@@ -934,7 +965,7 @@
       "2:\n"
       ".word 0x87654321\n"
       ".word 0x12345678\n";
-  DriverStr(expected, "LoadLiteralDoubleBeyondMax256KiB");
+  DriverStr(expected, "LoadLiteralDoubleBeyondMax64KiB");
 
   EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 10u,
             __ GetAdjustedPosition(label.Position()));
@@ -946,16 +977,16 @@
   __ LoadLiteral(arm::D3, literal);
   Label label;
   __ Bind(&label);
-  constexpr size_t kLdrR0R0Count = (1 << 17) - 2u + 0x1234;
+  constexpr size_t kLdrR0R0Count = (1 << 15) - 2u + 0x1234;
   for (size_t i = 0; i != kLdrR0R0Count; ++i) {
     __ ldr(arm::R0, arm::Address(arm::R0));
   }
 
   std::string expected =
       // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw.
-      "movw ip, #((0x40000 + 2 * 0x1234) & 0xffff)\n"
+      "movw ip, #((0x1000c + 2 * 0x1234 - 0x8 - 4) & 0xffff)\n"
       // "as" does not consider ((2f - 1f - 4) >> 16) a constant expression for movt.
-      "movt ip, #((0x40000 + 2 * 0x1234) >> 16)\n"
+      "movt ip, #((0x1000c + 2 * 0x1234 - 0x8 - 4) >> 16)\n"
       "1:\n"
       "add ip, pc\n"
       "vldr d3, [ip, #0]\n" +