AArch64: fixes in A64 code generation.

- Disabled special method compilation, as it requires the hard-float ABI,
- Disabled suspend checks, as the runtime is not yet ready (e.g. the
  trampolines do not yet set the suspend register),
- Changed the definition of the zero register (the zero register now has
  0x3f as its register number),
- Fixed some issues with the handling of cmp instructions in the assembler:
  we now use the shifted-register rather than the extended-register variant
  of cmp and cmn (the two encodings are sketched below),
- Partially fixed the register setup (single-precision register sN is now
  mapped to dN),
- Fixed and completed the implementation of register spills/unspills (the
  pairwise spill loop is sketched below),
- Fixed LoadBaseDispBody() and StoreBaseDispBody() (the displacement
  handling is sketched below).
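
A minimal sketch of the cmp/cmn encoding issue, using only the skeleton
constants visible in the encoding map below; EncodeCmp() is an illustrative
helper, not an ART function:

  #include <cstdint>
  #include <cstdio>

  // Plug rn into bits [9-5] and rm into bits [20-16] of a 32-bit skeleton,
  // the way the ENCODING_MAP entries below position their operands.
  static uint32_t EncodeCmp(uint32_t skeleton, int rn, int rm) {
    return skeleton | (static_cast<uint32_t>(rn) << 5) |
           (static_cast<uint32_t>(rm) << 16);
  }

  int main() {
    // Old kA64Cmp3Rro skeleton (0x6b20001f): bit 21 selects the
    // extended-register form, so "cmp w0, w1" was emitted as the
    // encoding of "cmp w0, w1, uxtb".
    printf("extended: 0x%08x\n", EncodeCmp(0x6b20001fU, 0, 1));  // 0x6b21001f
    // New kA64Cmp3rro skeleton (0x6b00001f): shifted-register form,
    // i.e. a plain "cmp w0, w1".
    printf("shifted:  0x%08x\n", EncodeCmp(0x6b00001fU, 0, 1));  // 0x6b01001f
    return 0;
  }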

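The spill/unspill rewrite walks the callee-save mask pairwise, emitting
stp/ldp for register pairs and a single str/ldr for a leftover register.
A stand-alone sketch of that loop shape follows; PopPairFromMask() and
PrintSpillPlan() are illustrative stand-ins for GenPairWise() and the
NewLIR3()/NewLIR4() calls, and the real pairing order may differ:

  #include <cstdint>
  #include <cstdio>

  // Pop up to two set bits from the mask; reg2 stays -1 for a lone register.
  // (__builtin_ctz is the GCC/Clang count-trailing-zeros builtin.)
  static uint32_t PopPairFromMask(uint32_t mask, int* reg1, int* reg2) {
    *reg1 = *reg2 = -1;
    if (mask != 0) { *reg1 = __builtin_ctz(mask); mask &= mask - 1; }
    if (mask != 0) { *reg2 = __builtin_ctz(mask); mask &= mask - 1; }
    return mask;
  }

  // Same shape as SpillCoreRegs()/SpillFPRegs(): the byte offset is converted
  // to 8-byte slots up front and advances by two slots per iteration.
  static void PrintSpillPlan(uint32_t reg_mask, int byte_offset, char prefix) {
    const int reg_log2_size = 3;
    int reg1, reg2;
    for (int offset = byte_offset >> reg_log2_size; reg_mask != 0; offset += 2) {
      reg_mask = PopPairFromMask(reg_mask, &reg1, &reg2);
      if (reg2 < 0) {
        printf("str %c%d, [sp, #%d]\n", prefix, reg1, offset * 8);
      } else {
        printf("stp %c%d, %c%d, [sp, #%d]\n", prefix, reg1, prefix, reg2, offset * 8);
      }
    }
  }

  int main() {
    // E.g. an fp_spill_mask_ covering d8-d10, placed below the core saves.
    PrintSpillPlan((1u << 8) | (1u << 9) | (1u << 10), 32, 'd');
    return 0;
  }
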
Change-Id: Ia49ba48b6ca0f782380066345b7a198cb6c1dc1d
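
LoadBaseDispBody()/StoreBaseDispBody() now choose between a scaled 12-bit
immediate form, the unscaled 9-bit ldur/stur form, and a long sequence
through a temporary register. A stand-alone sketch of that selection,
assuming the IS_SIGNED_IMM9 range is [-256, 256); ChooseA64AddrForm() and
AddrForm are illustrative names rather than ART code (see the
utility_arm64.cc hunks below for the real implementation):

  #include <cstdio>

  enum class AddrForm { kScaledImm12, kUnscaledImm9, kLongSequence };

  // Prefer the scaled 12-bit form, fall back to the unscaled signed 9-bit
  // form when an ldur/stur variant exists, otherwise materialize the
  // displacement in a temp and use a register-offset access.
  static AddrForm ChooseA64AddrForm(int displacement, int scale,
                                    bool has_unscaled_variant) {
    bool aligned = (displacement & ((1 << scale) - 1)) == 0;
    int scaled_disp = displacement >> scale;
    if (aligned && scaled_disp >= 0 && scaled_disp < 4096) {
      return AddrForm::kScaledImm12;
    }
    bool fits_imm9 = displacement >= -256 && displacement < 256;
    if (has_unscaled_variant && fits_imm9) {
      return AddrForm::kUnscaledImm9;
    }
    return AddrForm::kLongSequence;
  }

  int main() {
    // 64-bit accesses (scale = 3): 32760 still fits the scaled form,
    // -8 needs ldur, 40000 needs the long sequence.
    printf("%d\n", static_cast<int>(ChooseA64AddrForm(32760, 3, true)));  // 0
    printf("%d\n", static_cast<int>(ChooseA64AddrForm(-8, 3, true)));     // 1
    printf("%d\n", static_cast<int>(ChooseA64AddrForm(40000, 3, true)));  // 2
    return 0;
  }
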
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index f98e366..3d28665 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -95,13 +95,8 @@
  * +========================+
  */
 
-// Offset to distinguish FP regs.
-#define ARM_FP_REG_OFFSET 32
 // First FP callee save.
-#define ARM_FP_CALLEE_SAVE_BASE 16
-
-// Mask to strip off fp flags.
-#define ARM_FP_REG_MASK (ARM_FP_REG_OFFSET - 1)
+#define A64_FP_CALLEE_SAVE_BASE 16
 
 // Temporary macros, used to mark code which wants to distinguish betweek zr/sp.
 #define A64_REG_IS_SP(reg_num) ((reg_num) == rwsp || (reg_num) == rsp)
@@ -147,14 +142,11 @@
 
   // TODO(Arm64): can we change the lines below such that rwzr != rwsp && rxzr != rsp?
   //   This would be desirable to allow detecting usage-errors in the assembler.
-  rwzr = rw31,
-  rxzr = rx31,
+  rwzr = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 0x3f,
+  rxzr = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 0x3f,
   rwsp = rw31,
   rsp = rx31,
-  // TODO: rx4 is an argument register in C ABI which is not a good idea,
-  // But we need to decide to use caller save register in C ABI or callee save register.
-  // Because it will result to different implementation in the trampoline.
-  rA64_SUSPEND = rx4,
+  rA64_SUSPEND = rx19,
   rA64_SELF = rx18,
   rA64_SP = rx31,
   rA64_LR = rx30
@@ -233,9 +225,11 @@
   kA64B1t,           // b   [00010100] offset_26[25-0].
   kA64Cbnz2rt,       // cbnz[00110101] imm_19[23-5] rt[4-0].
   kA64Cbz2rt,        // cbz [00110100] imm_19[23-5] rt[4-0].
-  kA64Cmn3Rro,       // cmn [s0101011001] rm[20-16] option[15-13] imm_3[12-10] rn[9-5] [11111].
+  kA64Cmn3rro,       // cmn [s0101011] shift[23-22] [0] rm[20-16] imm_6[15-10] rn[9-5] [11111].
+  kA64Cmn3Rre,       // cmn [s0101011001] rm[20-16] option[15-13] imm_3[12-10] rn[9-5] [11111].
   kA64Cmn3RdT,       // cmn [00110001] shift[23-22] imm_12[21-10] rn[9-5] [11111].
-  kA64Cmp3Rro,       // cmp [s1101011001] rm[20-16] option[15-13] imm_3[12-10] rn[9-5] [11111].
+  kA64Cmp3rro,       // cmp [s1101011] shift[23-22] [0] rm[20-16] imm_6[15-10] rn[9-5] [11111].
+  kA64Cmp3Rre,       // cmp [s1101011001] rm[20-16] option[15-13] imm_3[12-10] rn[9-5] [11111].
   kA64Cmp3RdT,       // cmp [01110001] shift[23-22] imm_12[21-10] rn[9-5] [11111].
   kA64Csel4rrrc,     // csel[s0011010100] rm[20-16] cond[15-12] [00] rn[9-5] rd[4-0].
   kA64Csinc4rrrc,    // csinc [s0011010100] rm[20-16] cond[15-12] [01] rn[9-5] rd[4-0].
@@ -279,6 +273,7 @@
   kA64Ldr4fXxG,      // ldr [1s111100011] rm[20-16] [011] S[12] [10] rn[9-5] rt[4-0].
   kA64Ldr4rXxG,      // ldr [1s111000011] rm[20-16] [011] S[12] [10] rn[9-5] rt[4-0].
   kA64LdrPost3rXd,   // ldr [1s111000010] imm_9[20-12] [01] rn[9-5] rt[4-0].
+  kA64Ldp4ffXD,      // ldp [0s10110101] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64Ldp4rrXD,      // ldp [s010100101] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64LdpPost4rrXD,  // ldp [s010100011] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64Ldur3fXd,      // ldur[1s111100010] imm_9[20-12] [00] rn[9-5] rt[4-0].
@@ -306,7 +301,8 @@
   kA64Scvtf2fx,      // scvtf  [100111100s100010000000] rn[9-5] rd[4-0].
   kA64Sdiv3rrr,      // sdiv[s0011010110] rm[20-16] [000011] rn[9-5] rd[4-0].
   kA64Smaddl4xwwx,   // smaddl [10011011001] rm[20-16] [0] ra[14-10] rn[9-5] rd[4-0].
-  kA64Stp4rrXD,      // stp [s010100101] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
+  kA64Stp4ffXD,      // stp [0s10110100] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
+  kA64Stp4rrXD,      // stp [s010100100] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64StpPost4rrXD,  // stp [s010100010] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64StpPre4rrXD,   // stp [s010100110] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64Str3fXD,       // str [1s11110100] imm_12[21-10] rn[9-5] rt[4-0].
@@ -355,9 +351,6 @@
 #define FUNWIDE UNWIDE
 #define IS_FWIDE IS_WIDE
 
-#define OP_KIND_UNWIDE(opcode) (opcode)
-#define OP_KIND_IS_WIDE(opcode) (false)
-
 enum ArmOpDmbOptions {
   kSY = 0xf,
   kST = 0xe,
@@ -390,6 +383,9 @@
   kFmtSkip,      // Unused field, but continue to next.
 };
 
+// TODO(Arm64): should we get rid of kFmtExtend?
+//   Note: the only opcodes that use it (the extended-register cmp/cmn variants) are themselves unused.
+
 // Struct used to define the snippet positions for each A64 opcode.
 struct ArmEncodingMap {
   uint32_t wskeleton;
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index 93caf89..01fcc0d 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -168,18 +168,26 @@
                  kFmtUnused, -1, -1,
                  IS_BINARY_OP | REG_USE0 | IS_BRANCH  | NEEDS_FIXUP,
                  "cbz", "!0r, !1t", kFixupCBxZ),
-    ENCODING_MAP(WIDE(kA64Cmn3Rro), SF_VARIANTS(0x6b20001f),
-                 kFmtRegROrSp, 9, 5, kFmtRegR, 20, 16, kFmtShift, -1, -1,
+    ENCODING_MAP(WIDE(kA64Cmn3rro), SF_VARIANTS(0x2b00001f),
+                 kFmtRegR, 9, 5, kFmtRegR, 20, 16, kFmtShift, -1, -1,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_USE01 | SETS_CCODES,
-                 "cmn", "!0R, !1r!2o", kFixupNone),
+                 "cmn", "!0r, !1r!2o", kFixupNone),
+    ENCODING_MAP(WIDE(kA64Cmn3Rre), SF_VARIANTS(0x2b20001f),
+                 kFmtRegROrSp, 9, 5, kFmtRegR, 20, 16, kFmtExtend, -1, -1,
+                 kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_USE01 | SETS_CCODES,
+                 "cmn", "!0R, !1r!2e", kFixupNone),
     ENCODING_MAP(WIDE(kA64Cmn3RdT), SF_VARIANTS(0x3100001f),
                  kFmtRegROrSp, 9, 5, kFmtBitBlt, 21, 10, kFmtBitBlt, 23, 22,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_USE0 | SETS_CCODES,
                  "cmn", "!0R, #!1d!2T", kFixupNone),
-    ENCODING_MAP(WIDE(kA64Cmp3Rro), SF_VARIANTS(0x6b00001f),
-                 kFmtRegROrSp, 9, 5, kFmtRegR, 20, 16, kFmtShift, -1, -1,
+    ENCODING_MAP(WIDE(kA64Cmp3rro), SF_VARIANTS(0x6b00001f),
+                 kFmtRegR, 9, 5, kFmtRegR, 20, 16, kFmtShift, -1, -1,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_USE01 | SETS_CCODES,
-                 "cmp", "!0R, !1r!2o", kFixupNone),
+                 "cmp", "!0r, !1r!2o", kFixupNone),
+    ENCODING_MAP(WIDE(kA64Cmp3Rre), SF_VARIANTS(0x6b20001f),
+                 kFmtRegROrSp, 9, 5, kFmtRegR, 20, 16, kFmtExtend, -1, -1,
+                 kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_USE01 | SETS_CCODES,
+                 "cmp", "!0R, !1r!2e", kFixupNone),
     ENCODING_MAP(WIDE(kA64Cmp3RdT), SF_VARIANTS(0x7100001f),
                  kFmtRegROrSp, 9, 5, kFmtBitBlt, 21, 10, kFmtBitBlt, 23, 22,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_USE0 | SETS_CCODES,
@@ -354,9 +362,13 @@
                  kFmtRegR, 4, 0, kFmtRegXOrSp, 9, 5, kFmtBitBlt, 20, 12,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF01 | REG_USE1 | IS_LOAD,
                  "ldr", "!0r, [!1X], #!2d", kFixupNone),
+    ENCODING_MAP(WIDE(kA64Ldp4ffXD), CUSTOM_VARIANTS(0x2d400000, 0x6d400000),
+                 kFmtRegF, 4, 0, kFmtRegF, 14, 10, kFmtRegXOrSp, 9, 5,
+                 kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_USE2 | REG_DEF01 | IS_LOAD,
+                 "ldp", "!0f, !1f, [!2X, #!3D]", kFixupNone),
     ENCODING_MAP(WIDE(kA64Ldp4rrXD), SF_VARIANTS(0x29400000),
                  kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
-                 kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_USE2 | REG_DEF012 | IS_LOAD,
+                 kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_USE2 | REG_DEF01 | IS_LOAD,
                  "ldp", "!0r, !1r, [!2X, #!3D]", kFixupNone),
     ENCODING_MAP(WIDE(kA64LdpPost4rrXD), CUSTOM_VARIANTS(0x28c00000, 0xa8c00000),
                  kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
@@ -462,9 +474,13 @@
                  kFmtRegX, 4, 0, kFmtRegW, 9, 5, kFmtRegW, 20, 16,
                  kFmtRegX, -1, -1, IS_QUAD_OP | REG_DEF0_USE123,
                  "smaddl", "!0x, !1w, !2w, !3x", kFixupNone),
+    ENCODING_MAP(WIDE(kA64Stp4ffXD), CUSTOM_VARIANTS(0x2d000000, 0x6d000000),
+                 kFmtRegF, 4, 0, kFmtRegF, 14, 10, kFmtRegXOrSp, 9, 5,
+                 kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_USE012 | IS_STORE,
+                 "stp", "!0f, !1f, [!2X, #!3D]", kFixupNone),
     ENCODING_MAP(WIDE(kA64Stp4rrXD), SF_VARIANTS(0x29000000),
                  kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
-                 kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
+                 kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_USE012 | IS_STORE,
                  "stp", "!0r, !1r, [!2X, #!3D]", kFixupNone),
     ENCODING_MAP(WIDE(kA64StpPost4rrXD), CUSTOM_VARIANTS(0x28800000, 0xa8800000),
                  kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index 136a04f..f7a0199 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -25,7 +25,10 @@
 
 bool Arm64Mir2Lir::GenSpecialCase(BasicBlock* bb, MIR* mir,
                                   const InlineMethod& special) {
-  return Mir2Lir::GenSpecialCase(bb, mir, special);
+  // TODO(Arm64): re-enable this, once hard-float ABI is implemented.
+  //   (this currently does not work, as GetArgMappingToPhysicalReg returns InvalidReg()).
+  // return Mir2Lir::GenSpecialCase(bb, mir, special);
+  return false;
 }
 
 /*
@@ -348,18 +351,16 @@
     OpRegImm64(kOpSub, rs_rA64_SP, frame_size_, /*is_wide*/true);
   }
 
-  /* Spill core callee saves */
-  if (core_spill_mask_) {
-    SpillCoreRegs(rs_rA64_SP, frame_size_, core_spill_mask_);
-  }
   /* Need to spill any FP regs? */
-  if (num_fp_spills_) {
-    /*
-     * NOTE: fp spills are a little different from core spills in that
-     * they are pushed as a contiguous block.  When promoting from
-     * the fp set, we must allocate all singles from s16..highest-promoted
-     */
-    // TODO(Arm64): SpillFPRegs(rA64_SP, frame_size_, core_spill_mask_);
+  if (fp_spill_mask_) {
+    int spill_offset = frame_size_ - kArm64PointerSize*(num_fp_spills_ + num_core_spills_);
+    SpillFPRegs(rs_rA64_SP, spill_offset, fp_spill_mask_);
+  }
+
+  /* Spill core callee saves. */
+  if (core_spill_mask_) {
+    int spill_offset = frame_size_ - kArm64PointerSize*num_core_spills_;
+    SpillCoreRegs(rs_rA64_SP, spill_offset, core_spill_mask_);
   }
 
   FlushIns(ArgLocs, rl_method);
@@ -379,12 +380,15 @@
   LockTemp(rs_x1);
 
   NewLIR0(kPseudoMethodExit);
+
   /* Need to restore any FP callee saves? */
-  if (num_fp_spills_) {
-    // TODO(Arm64): UnspillFPRegs(num_fp_spills_);
+  if (fp_spill_mask_) {
+    int spill_offset = frame_size_ - kArm64PointerSize*(num_fp_spills_ + num_core_spills_);
+    UnSpillFPRegs(rs_rA64_SP, spill_offset, fp_spill_mask_);
   }
   if (core_spill_mask_) {
-    UnSpillCoreRegs(rs_rA64_SP, frame_size_, core_spill_mask_);
+    int spill_offset = frame_size_ - kArm64PointerSize*num_core_spills_;
+    UnSpillCoreRegs(rs_rA64_SP, spill_offset, core_spill_mask_);
   }
 
   OpRegImm64(kOpAdd, rs_rA64_SP, frame_size_, /*is_wide*/true);
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 418a989..404138c 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -73,7 +73,6 @@
     void MarkPreservedSingle(int v_reg, RegStorage reg);
     void MarkPreservedDouble(int v_reg, RegStorage reg);
     void CompilerInitializeRegAlloc();
-    RegStorage AllocPreservedDouble(int s_reg);
 
     // Required for target - miscellaneous.
     void AssembleLIR();
@@ -157,6 +156,8 @@
     uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2);
     void UnSpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask);
     void SpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask);
+    void UnSpillFPRegs(RegStorage base, int offset, uint32_t reg_mask);
+    void SpillFPRegs(RegStorage base, int offset, uint32_t reg_mask);
 
     // Required for target - single operation generators.
     LIR* OpUnconditionalBranch(LIR* target);
@@ -195,7 +196,7 @@
     LIR* StoreBaseDispBody(RegStorage r_base, int displacement, RegStorage r_src, OpSize size);
     LIR* OpRegRegRegShift(OpKind op, int r_dest, int r_src1, int r_src2, int shift,
                           bool is_wide = false);
-    LIR* OpRegRegShift(OpKind op, int r_dest_src1, int r_src2, int shift, bool is_wide = false);
+    LIR* OpRegRegShift(OpKind op, RegStorage r_dest_src1, RegStorage r_src2, int shift);
     static const ArmEncodingMap EncodingMap[kA64Last];
     int EncodeShift(int code, int amount);
     int EncodeExtend(int extend_type, int amount);
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index f2a57e7..b0f5904 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -697,11 +697,19 @@
   GenDivZeroCheck(kCondEq);
 }
 
-// TODO(Arm64): the function below should go.
 // Test suspend flag, return target of taken suspend branch
 LIR* Arm64Mir2Lir::OpTestSuspend(LIR* target) {
+  // TODO(Arm64): re-enable suspend checks, once art_quick_test_suspend is implemented and
+  //   the suspend register is properly handled in the trampolines.
+#if 0
   NewLIR3(kA64Subs3rRd, rA64_SUSPEND, rA64_SUSPEND, 1);
   return OpCondBranch((target == NULL) ? kCondEq : kCondNe, target);
+#else
+  // TODO(Arm64): Fake suspend check. Will always fail to branch. Remove this.
+  LIR* branch = NewLIR2((target == NULL) ? kA64Cbnz2rt : kA64Cbz2rt, rwzr, 0);
+  branch->target = target;
+  return branch;
+#endif
 }
 
 // Decrement register and branch on condition
@@ -1199,34 +1207,61 @@
 
 void Arm64Mir2Lir::UnSpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
-  const int pop_log2_size = 3;
+  const int reg_log2_size = 3;
 
-  for (offset = (offset >> pop_log2_size) - 1; reg_mask; offset--) {
+  for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
      reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      // TODO(Arm64): replace Solo32 with Solo64, once rxN are defined properly.
-      NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo32(reg1).GetReg(), base.GetReg(), offset);
+      NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     } else {
-      // TODO(Arm64): replace Solo32 with Solo64 (twice below), once rxN are defined properly.
-      NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo32(reg1).GetReg(),
-              RegStorage::Solo32(reg2).GetReg(), base.GetReg(), offset);
+      NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+              RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
 void Arm64Mir2Lir::SpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
-  const int pop_log2_size = 3;
+  const int reg_log2_size = 3;
 
-  for (offset = (offset >> pop_log2_size) - 1; reg_mask; offset--) {
+  for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
     reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      // TODO(Arm64): replace Solo32 with Solo64, once rxN are defined properly.
-      NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo32(reg1).GetReg(), base.GetReg(), offset);
+      NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     } else {
-      // TODO(Arm64): replace Solo32 with Solo64 (twice below), once rxN are defined properly.
-      NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo32(reg1).GetReg(),
-              RegStorage::Solo32(reg2).GetReg(), base.GetReg(), offset);
+      NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+              RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+    }
+  }
+}
+
+void Arm64Mir2Lir::UnSpillFPRegs(RegStorage base, int offset, uint32_t reg_mask) {
+  int reg1 = -1, reg2 = -1;
+  const int reg_log2_size = 3;
+
+  for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
+     reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
+    if (UNLIKELY(reg2 < 0)) {
+      NewLIR3(FWIDE(kA64Ldr3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+    } else {
+      NewLIR4(WIDE(kA64Ldp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+              RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+    }
+  }
+}
+
+// TODO(Arm64): consider using ld1 and st1?
+void Arm64Mir2Lir::SpillFPRegs(RegStorage base, int offset, uint32_t reg_mask) {
+  int reg1 = -1, reg2 = -1;
+  const int reg_log2_size = 3;
+
+  for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
+    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
+    if (UNLIKELY(reg2 < 0)) {
+      NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+    } else {
+      NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+              RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index 10be0d6..c072ae3 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -30,7 +30,8 @@
     {rs_x0, rs_x1, rs_x2, rs_x3, rs_x4, rs_x5, rs_x6, rs_x7,
      rs_x8, rs_x9, rs_x10, rs_x11, rs_x12, rs_x13, rs_x14, rs_x15,
      rs_x16, rs_x17, rs_x18, rs_x19, rs_x20, rs_x21, rs_x22, rs_x23,
-     rs_x24, rs_x25, rs_x26, rs_x27, rs_x28, rs_x29, rs_x30, rs_x31};
+     rs_x24, rs_x25, rs_x26, rs_x27, rs_x28, rs_x29, rs_x30, rs_x31,
+     rs_xzr};
 static const RegStorage sp_regs_arr[] =
     {rs_f0, rs_f1, rs_f2, rs_f3, rs_f4, rs_f5, rs_f6, rs_f7,
      rs_f8, rs_f9, rs_f10, rs_f11, rs_f12, rs_f13, rs_f14, rs_f15,
@@ -42,8 +43,8 @@
      rs_d16, rs_d17, rs_d18, rs_d19, rs_d20, rs_d21, rs_d22, rs_d23,
      rs_d24, rs_d25, rs_d26, rs_d27, rs_d28, rs_d29, rs_d30, rs_d31};
 static const RegStorage reserved_regs_arr[] =
-    {rs_rA64_SUSPEND, rs_rA64_SELF, rs_rA64_SP, rs_rA64_LR};
-// TUING: Are there too many temp registers and too less promote target?
+    {rs_rA64_SUSPEND, rs_rA64_SELF, rs_rA64_SP, rs_rA64_LR, rs_xzr};
+// TUNING: Are there too many temp registers and too few promotion targets?
 // This definition need to be matched with runtime.cc, quick entry assembly and JNI compiler
 // Note: we are not able to call to C function directly if it un-match C ABI.
 // Currently, rs_rA64_SELF is not a callee save register which does not match C ABI.
@@ -377,14 +378,14 @@
              strcpy(tbuf, name);
              break;
            case 's':
-             snprintf(tbuf, arraysize(tbuf), "s%d", operand & ARM_FP_REG_MASK);
+             snprintf(tbuf, arraysize(tbuf), "s%d", operand & RegStorage::kRegNumMask);
              break;
            case 'S':
-             snprintf(tbuf, arraysize(tbuf), "d%d", operand & ARM_FP_REG_MASK);
+             snprintf(tbuf, arraysize(tbuf), "d%d", operand & RegStorage::kRegNumMask);
              break;
            case 'f':
              snprintf(tbuf, arraysize(tbuf), "%c%d", (IS_FWIDE(lir->opcode)) ? 'd' : 's',
-                      operand & ARM_FP_REG_MASK);
+                      operand & RegStorage::kRegNumMask);
              break;
            case 'l': {
                bool is_wide = IS_WIDE(lir->opcode);
@@ -463,7 +464,7 @@
              break;
            case 'R': {
                bool is_wide = IS_WIDE(lir->opcode);
-               if (LIKELY(operand != rwsp || operand != rsp)) {
+               if (LIKELY(operand != rwsp && operand != rsp)) {
                  snprintf(tbuf, arraysize(tbuf), "%c%d", (is_wide) ? 'x' : 'w',
                           operand & RegStorage::kRegNumMask);
                } else {
@@ -599,13 +600,11 @@
                                         core_temps, sp_temps, dp_temps);
 
   // Target-specific adjustments.
-
-  // Alias single precision floats to appropriate half of overlapping double.
-  GrowableArray<RegisterInfo*>::Iterator it(&reg_pool_->sp_regs_);
-  for (RegisterInfo* info = it.Next(); info != nullptr; info = it.Next()) {
-    int sp_reg_num = info->GetReg().GetRegNum();
-    int dp_reg_num = sp_reg_num >> 1;
-    RegStorage dp_reg = RegStorage::Solo64(RegStorage::kFloatingPoint | dp_reg_num);
+  // Alias single precision float registers to corresponding double registers.
+  GrowableArray<RegisterInfo*>::Iterator fp_it(&reg_pool_->sp_regs_);
+  for (RegisterInfo* info = fp_it.Next(); info != nullptr; info = fp_it.Next()) {
+    int fp_reg_num = info->GetReg().GetRegNum();
+    RegStorage dp_reg = RegStorage::Solo64(RegStorage::kFloatingPoint | fp_reg_num);
     RegisterInfo* dp_reg_info = GetRegInfo(dp_reg);
     // Double precision register's master storage should refer to itself.
     DCHECK_EQ(dp_reg_info, dp_reg_info->Master());
@@ -613,10 +612,6 @@
     info->SetMaster(dp_reg_info);
     // Singles should show a single 32-bit mask bit, at first referring to the low half.
     DCHECK_EQ(info->StorageMask(), 0x1U);
-    if (sp_reg_num & 1) {
-      // For odd singles, change to user the high word of the backing double.
-      info->SetStorageMask(0x2);
-    }
   }
 
   // TODO: re-enable this when we can safely save r4 over the suspension code path.
@@ -648,14 +643,11 @@
 }
 
 /*
- * Mark a callee-save fp register as promoted.  Note that
- * vpush/vpop uses contiguous register lists so we must
- * include any holes in the mask.  Associate holes with
- * Dalvik register INVALID_VREG (0xFFFFU).
+ * Mark a callee-save fp register as promoted.
  */
 void Arm64Mir2Lir::MarkPreservedSingle(int v_reg, RegStorage reg) {
-  DCHECK_GE(reg.GetRegNum(), ARM_FP_CALLEE_SAVE_BASE);
-  int adjusted_reg_num = reg.GetRegNum() - ARM_FP_CALLEE_SAVE_BASE;
+  DCHECK(reg.IsFloat());
+  int adjusted_reg_num = reg.GetRegNum() - A64_FP_CALLEE_SAVE_BASE;
   // Ensure fp_vmap_table is large enough
   int table_size = fp_vmap_table_.size();
   for (int i = table_size; i < (adjusted_reg_num + 1); i++) {
@@ -665,29 +657,36 @@
   fp_vmap_table_[adjusted_reg_num] = v_reg;
   // Size of fp_vmap_table is high-water mark, use to set mask
   num_fp_spills_ = fp_vmap_table_.size();
-  fp_spill_mask_ = ((1 << num_fp_spills_) - 1) << ARM_FP_CALLEE_SAVE_BASE;
+  fp_spill_mask_ = ((1 << num_fp_spills_) - 1) << A64_FP_CALLEE_SAVE_BASE;
 }
 
 void Arm64Mir2Lir::MarkPreservedDouble(int v_reg, RegStorage reg) {
-  // TEMP: perform as 2 singles.
-  int reg_num = reg.GetRegNum() << 1;
-  RegStorage lo = RegStorage::Solo32(RegStorage::kFloatingPoint | reg_num);
-  RegStorage hi = RegStorage::Solo32(RegStorage::kFloatingPoint | reg_num | 1);
-  MarkPreservedSingle(v_reg, lo);
-  MarkPreservedSingle(v_reg + 1, hi);
+  DCHECK(reg.IsDouble());
+  MarkPreservedSingle(v_reg, reg);
 }
 
 /* Clobber all regs that might be used by an external C call */
 void Arm64Mir2Lir::ClobberCallerSave() {
-  // TODO(Arm64): implement this.
-  UNIMPLEMENTED(WARNING);
-
   Clobber(rs_x0);
   Clobber(rs_x1);
   Clobber(rs_x2);
   Clobber(rs_x3);
+  Clobber(rs_x4);
+  Clobber(rs_x5);
+  Clobber(rs_x6);
+  Clobber(rs_x7);
+  Clobber(rs_x8);
+  Clobber(rs_x9);
+  Clobber(rs_x10);
+  Clobber(rs_x11);
   Clobber(rs_x12);
+  Clobber(rs_x13);
+  Clobber(rs_x14);
+  Clobber(rs_x15);
+  Clobber(rs_x16);
+  Clobber(rs_x17);
   Clobber(rs_x30);
+
   Clobber(rs_f0);
   Clobber(rs_f1);
   Clobber(rs_f2);
@@ -696,14 +695,22 @@
   Clobber(rs_f5);
   Clobber(rs_f6);
   Clobber(rs_f7);
-  Clobber(rs_f8);
-  Clobber(rs_f9);
-  Clobber(rs_f10);
-  Clobber(rs_f11);
-  Clobber(rs_f12);
-  Clobber(rs_f13);
-  Clobber(rs_f14);
-  Clobber(rs_f15);
+  Clobber(rs_f16);
+  Clobber(rs_f17);
+  Clobber(rs_f18);
+  Clobber(rs_f19);
+  Clobber(rs_f20);
+  Clobber(rs_f21);
+  Clobber(rs_f22);
+  Clobber(rs_f23);
+  Clobber(rs_f24);
+  Clobber(rs_f25);
+  Clobber(rs_f26);
+  Clobber(rs_f27);
+  Clobber(rs_f28);
+  Clobber(rs_f29);
+  Clobber(rs_f30);
+  Clobber(rs_f31);
 }
 
 RegLocation Arm64Mir2Lir::GetReturnWideAlt() {
@@ -776,61 +783,6 @@
   return Arm64Mir2Lir::EncodingMap[UNWIDE(opcode)].fmt;
 }
 
-/*
- * Somewhat messy code here.  We want to allocate a pair of contiguous
- * physical single-precision floating point registers starting with
- * an even numbered reg.  It is possible that the paired s_reg (s_reg+1)
- * has already been allocated - try to fit if possible.  Fail to
- * allocate if we can't meet the requirements for the pair of
- * s_reg<=sX[even] & (s_reg+1)<= sX+1.
- */
-// TODO: needs rewrite to support non-backed 64-bit float regs.
-RegStorage Arm64Mir2Lir::AllocPreservedDouble(int s_reg) {
-  RegStorage res;
-  int v_reg = mir_graph_->SRegToVReg(s_reg);
-  int p_map_idx = SRegToPMap(s_reg);
-  if (promotion_map_[p_map_idx+1].fp_location == kLocPhysReg) {
-    // Upper reg is already allocated.  Can we fit?
-    int high_reg = promotion_map_[p_map_idx+1].FpReg;
-    if ((high_reg & 1) == 0) {
-      // High reg is even - fail.
-      return res;  // Invalid.
-    }
-    // Is the low reg of the pair free?
-    // FIXME: rework.
-    RegisterInfo* p = GetRegInfo(RegStorage::FloatSolo32(high_reg - 1));
-    if (p->InUse() || p->IsTemp()) {
-      // Already allocated or not preserved - fail.
-      return res;  // Invalid.
-    }
-    // OK - good to go.
-    res = RegStorage::FloatSolo64(p->GetReg().GetRegNum() >> 1);
-    p->MarkInUse();
-    MarkPreservedSingle(v_reg, p->GetReg());
-  } else {
-    /*
-     * TODO: until runtime support is in, make sure we avoid promoting the same vreg to
-     * different underlying physical registers.
-     */
-    GrowableArray<RegisterInfo*>::Iterator it(&reg_pool_->dp_regs_);
-    for (RegisterInfo* info = it.Next(); info != nullptr; info = it.Next()) {
-      if (!info->IsTemp() && !info->InUse()) {
-        res = info->GetReg();
-        info->MarkInUse();
-        MarkPreservedDouble(v_reg, info->GetReg());
-        break;
-      }
-    }
-  }
-  if (res.Valid()) {
-    promotion_map_[p_map_idx].fp_location = kLocPhysReg;
-    promotion_map_[p_map_idx].FpReg = res.DoubleToLowSingle().GetReg();
-    promotion_map_[p_map_idx+1].fp_location = kLocPhysReg;
-    promotion_map_[p_map_idx+1].FpReg = res.DoubleToHighSingle().GetReg();
-  }
-  return res;
-}
-
 // TODO(Arm64): reuse info in QuickArgumentVisitor?
 static RegStorage GetArgPhysicalReg(RegLocation* loc, int* num_gpr_used, int* num_fpr_used,
                                     OpSize* op_size) {
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index 77e4c3c..39e9fad 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -360,18 +360,17 @@
   return NewLIR1(opcode, r_dest_src.GetReg());
 }
 
-LIR* Arm64Mir2Lir::OpRegRegShift(OpKind op, int r_dest_src1, int r_src2,
-                                 int shift, bool is_wide) {
-  ArmOpcode wide = (is_wide) ? WIDE(0) : UNWIDE(0);
+LIR* Arm64Mir2Lir::OpRegRegShift(OpKind op, RegStorage r_dest_src1, RegStorage r_src2, int shift) {
+  ArmOpcode wide = (r_dest_src1.Is64Bit()) ? WIDE(0) : UNWIDE(0);
+  CHECK_EQ(r_dest_src1.Is64Bit(), r_src2.Is64Bit());
   ArmOpcode opcode = kA64Brk1d;
 
-  switch (OP_KIND_UNWIDE(op)) {
+  switch (op) {
     case kOpCmn:
-      opcode = kA64Cmn3Rro;
+      opcode = kA64Cmn3rro;
       break;
     case kOpCmp:
-      // TODO(Arm64): check the instruction above: "cmp w0, w1" is rendered as "cmp w0, w1, uxtb".
-      opcode = kA64Cmp3Rro;
+      opcode = kA64Cmp3rro;
       break;
     case kOpMov:
       opcode = kA64Mov2rr;
@@ -388,39 +387,38 @@
     case kOpRev:
       DCHECK_EQ(shift, 0);
       // Binary, but rm is encoded twice.
-      return NewLIR3(kA64Rev2rr | wide, r_dest_src1, r_src2, r_src2);
+      return NewLIR3(kA64Rev2rr | wide, r_dest_src1.GetReg(), r_src2.GetReg(), r_src2.GetReg());
       break;
     case kOpRevsh:
       // Binary, but rm is encoded twice.
-      return NewLIR3(kA64Rev162rr | wide, r_dest_src1, r_src2, r_src2);
+      return NewLIR3(kA64Rev162rr | wide, r_dest_src1.GetReg(), r_src2.GetReg(), r_src2.GetReg());
       break;
     case kOp2Byte:
       DCHECK_EQ(shift, ENCODE_NO_SHIFT);
       // "sbfx r1, r2, #imm1, #imm2" is "sbfm r1, r2, #imm1, #(imm1 + imm2 - 1)".
       // For now we use sbfm directly.
-      return NewLIR4(kA64Sbfm4rrdd | wide, r_dest_src1, r_src2, 0, 7);
+      return NewLIR4(kA64Sbfm4rrdd | wide, r_dest_src1.GetReg(), r_src2.GetReg(), 0, 7);
     case kOp2Short:
       DCHECK_EQ(shift, ENCODE_NO_SHIFT);
       // For now we use sbfm rather than its alias, sbfx.
-      return NewLIR4(kA64Sbfm4rrdd | wide, r_dest_src1, r_src2, 0, 15);
+      return NewLIR4(kA64Sbfm4rrdd | wide, r_dest_src1.GetReg(), r_src2.GetReg(), 0, 15);
     case kOp2Char:
       // "ubfx r1, r2, #imm1, #imm2" is "ubfm r1, r2, #imm1, #(imm1 + imm2 - 1)".
       // For now we use ubfm directly.
       DCHECK_EQ(shift, ENCODE_NO_SHIFT);
-      return NewLIR4(kA64Ubfm4rrdd | wide, r_dest_src1, r_src2, 0, 15);
+      return NewLIR4(kA64Ubfm4rrdd | wide, r_dest_src1.GetReg(), r_src2.GetReg(), 0, 15);
     default:
-      return OpRegRegRegShift(op, r_dest_src1, r_dest_src1, r_src2, shift);
+      return OpRegRegRegShift(op, r_dest_src1.GetReg(), r_dest_src1.GetReg(), r_src2.GetReg(), shift);
   }
 
   DCHECK(!IsPseudoLirOp(opcode));
   if (EncodingMap[opcode].flags & IS_BINARY_OP) {
     DCHECK_EQ(shift, ENCODE_NO_SHIFT);
-    return NewLIR2(opcode | wide, r_dest_src1, r_src2);
+    return NewLIR2(opcode | wide, r_dest_src1.GetReg(), r_src2.GetReg());
   } else if (EncodingMap[opcode].flags & IS_TERTIARY_OP) {
     ArmEncodingKind kind = EncodingMap[opcode].field_loc[2].kind;
-    if (kind == kFmtExtend || kind == kFmtShift) {
-      DCHECK_EQ(kind == kFmtExtend, IsExtendEncoding(shift));
-      return NewLIR3(opcode | wide, r_dest_src1, r_src2, shift);
+    if (kind == kFmtShift) {
+      return NewLIR3(opcode | wide, r_dest_src1.GetReg(), r_src2.GetReg(), shift);
     }
   }
 
@@ -429,8 +427,7 @@
 }
 
 LIR* Arm64Mir2Lir::OpRegReg(OpKind op, RegStorage r_dest_src1, RegStorage r_src2) {
-  return OpRegRegShift(op, r_dest_src1.GetReg(), r_src2.GetReg(), ENCODE_NO_SHIFT,
-                       r_dest_src1.Is64Bit());
+  return OpRegRegShift(op, r_dest_src1, r_src2, ENCODE_NO_SHIFT);
 }
 
 LIR* Arm64Mir2Lir::OpMovRegMem(RegStorage r_dest, RegStorage r_base, int offset, MoveType move_type) {
@@ -452,7 +449,7 @@
                                     int r_src2, int shift, bool is_wide) {
   ArmOpcode opcode = kA64Brk1d;
 
-  switch (OP_KIND_UNWIDE(op)) {
+  switch (op) {
     case kOpAdd:
       opcode = kA64Add4rrro;
       break;
@@ -525,10 +522,10 @@
   ArmOpcode opcode = kA64Brk1d;
   ArmOpcode alt_opcode = kA64Brk1d;
   int32_t log_imm = -1;
-  bool is_wide = OP_KIND_IS_WIDE(op);
+  bool is_wide = r_dest.Is64Bit();
   ArmOpcode wide = (is_wide) ? WIDE(0) : UNWIDE(0);
 
-  switch (OP_KIND_UNWIDE(op)) {
+  switch (op) {
     case kOpLsl: {
       // "lsl w1, w2, #imm" is an alias of "ubfm w1, w2, #(-imm MOD 32), #(31-imm)"
       // and "lsl x1, x2, #imm" of "ubfm x1, x2, #(-imm MOD 32), #(31-imm)".
@@ -639,7 +636,7 @@
     return res;
   }
 
-  switch (OP_KIND_UNWIDE(op)) {
+  switch (op) {
     case kOpAdd:
       neg_opcode = kA64Sub4RRdT;
       opcode = kA64Add4RRdT;
@@ -828,99 +825,66 @@
                                     OpSize size) {
   LIR* load = NULL;
   ArmOpcode opcode = kA64Brk1d;
-  bool short_form = false;
-  int encoded_disp = displacement;
+  ArmOpcode alt_opcode = kA64Brk1d;
+  int scale = 0;
+
   switch (size) {
     case kDouble:     // Intentional fall-through.
     case kWord:       // Intentional fall-through.
     case k64:
-      DCHECK_EQ(encoded_disp & 0x3, 0);
+      scale = 3;
       if (r_dest.IsFloat()) {
-        // Currently double values may be misaligned.
-        if ((displacement & 0x7) == 0 && displacement >= 0 && displacement <= 32760) {
-          // Can use scaled load.
-          opcode = FWIDE(kA64Ldr3fXD);
-          encoded_disp >>= 3;
-          short_form = true;
-        } else if (IS_SIGNED_IMM9(displacement)) {
-          // Can use unscaled load.
-          opcode = FWIDE(kA64Ldur3fXd);
-          short_form = true;
-        } else {
-          short_form = false;
-        }
+        DCHECK(r_dest.IsDouble());
+        opcode = FWIDE(kA64Ldr3fXD);
+        alt_opcode = FWIDE(kA64Ldur3fXd);
       } else {
-        // Currently long values may be misaligned.
-        if ((displacement & 0x7) == 0 && displacement >= 0 && displacement <= 32760) {
-          // Can use scaled store.
-          opcode = FWIDE(kA64Ldr3rXD);
-          encoded_disp >>= 3;
-          short_form = true;
-        } else if (IS_SIGNED_IMM9(displacement)) {
-          // Can use unscaled store.
-          opcode = FWIDE(kA64Ldur3rXd);
-          short_form = true;
-        }  // else: use long sequence (short_form = false).
+        opcode = FWIDE(kA64Ldr3rXD);
+        alt_opcode = FWIDE(kA64Ldur3rXd);
       }
       break;
     case kSingle:     // Intentional fall-through.
     case k32:         // Intentional fall-trough.
     case kReference:
+      scale = 2;
       if (r_dest.IsFloat()) {
+        DCHECK(r_dest.IsSingle());
         opcode = kA64Ldr3fXD;
-        if (displacement <= 1020) {
-          short_form = true;
-          encoded_disp >>= 2;
-        }
-        break;
-      }
-      if (displacement <= 16380 && displacement >= 0) {
-        DCHECK_EQ((displacement & 0x3), 0);
-        short_form = true;
-        encoded_disp >>= 2;
+      } else {
         opcode = kA64Ldr3rXD;
       }
       break;
     case kUnsignedHalf:
-      if (displacement < 64 && displacement >= 0) {
-        DCHECK_EQ((displacement & 0x1), 0);
-        short_form = true;
-        encoded_disp >>= 1;
-        opcode = kA64Ldrh3wXF;
-      } else if (displacement < 4092 && displacement >= 0) {
-        short_form = true;
-        opcode = kA64Ldrh3wXF;
-      }
+      scale = 1;
+      opcode = kA64Ldrh3wXF;
       break;
     case kSignedHalf:
-      short_form = true;
+      scale = 1;
       opcode = kA64Ldrsh3rXF;
       break;
     case kUnsignedByte:
-      short_form = true;
       opcode = kA64Ldrb3wXd;
       break;
     case kSignedByte:
-      short_form = true;
       opcode = kA64Ldrsb3rXd;
       break;
     default:
       LOG(FATAL) << "Bad size: " << size;
   }
 
-  if (short_form) {
-    load = NewLIR3(opcode, r_dest.GetReg(), r_base.GetReg(), encoded_disp);
+  bool displacement_is_aligned = (displacement & ((1 << scale) - 1)) == 0;
+  int scaled_disp = displacement >> scale;
+  if (displacement_is_aligned && scaled_disp >= 0 && scaled_disp < 4096) {
+    // Can use scaled load.
+    load = NewLIR3(opcode, r_dest.GetReg(), r_base.GetReg(), scaled_disp);
+  } else if (alt_opcode != kA64Brk1d && IS_SIGNED_IMM9(displacement)) {
+    // Can use unscaled load.
+    load = NewLIR3(alt_opcode, r_dest.GetReg(), r_base.GetReg(), displacement);
   } else {
-    RegStorage reg_offset = AllocTemp();
-    LoadConstant(reg_offset, encoded_disp);
-    if (r_dest.IsFloat()) {
-      // No index ops - must use a long sequence.  Turn the offset into a direct pointer.
-      OpRegReg(kOpAdd, reg_offset, r_base);
-      load = LoadBaseDispBody(reg_offset, 0, r_dest, size);
-    } else {
-      load = LoadBaseIndexed(r_base, reg_offset, r_dest, 0, size);
-    }
-    FreeTemp(reg_offset);
+    // Use long sequence.
+    RegStorage r_scratch = AllocTemp();
+    LoadConstant(r_scratch, displacement);
+    load = LoadBaseIndexed(r_base, r_scratch, r_dest, 0, size);
+    FreeTemp(r_scratch);
   }
 
   // TODO: in future may need to differentiate Dalvik accesses w/ spills
@@ -947,92 +911,64 @@
                                      OpSize size) {
   LIR* store = NULL;
   ArmOpcode opcode = kA64Brk1d;
-  bool short_form = false;
-  int encoded_disp = displacement;
+  ArmOpcode alt_opcode = kA64Brk1d;
+  int scale = 0;
+
   switch (size) {
     case kDouble:     // Intentional fall-through.
     case kWord:       // Intentional fall-through.
     case k64:
-      DCHECK_EQ(encoded_disp & 0x3, 0);
+      scale = 3;
       if (r_src.IsFloat()) {
-        // Currently double values may be misaligned.
-        if ((displacement & 0x7) == 0 && displacement >= 0 && displacement <= 32760) {
-          // Can use scaled store.
-          opcode = FWIDE(kA64Str3fXD);
-          encoded_disp >>= 3;
-          short_form = true;
-        } else if (IS_SIGNED_IMM9(displacement)) {
-          // Can use unscaled store.
-          opcode = FWIDE(kA64Stur3fXd);
-          short_form = true;
-        }  // else: use long sequence (short_form = false).
+        DCHECK(r_src.IsDouble());
+        opcode = FWIDE(kA64Str3fXD);
+        alt_opcode = FWIDE(kA64Stur3fXd);
       } else {
-        // Currently long values may be misaligned.
-        if ((displacement & 0x7) == 0 && displacement >= 0 && displacement <= 32760) {
-          // Can use scaled store.
-          opcode = FWIDE(kA64Str3rXD);
-          encoded_disp >>= 3;
-          short_form = true;
-        } else if (IS_SIGNED_IMM9(displacement)) {
-          // Can use unscaled store.
-          opcode = FWIDE(kA64Stur3rXd);
-          short_form = true;
-        }  // else: use long sequence (short_form = false).
+        opcode = FWIDE(kA64Str3rXD);
+        alt_opcode = FWIDE(kA64Stur3rXd);
       }
       break;
     case kSingle:     // Intentional fall-through.
     case k32:         // Intentional fall-trough.
     case kReference:
+      scale = 2;
       if (r_src.IsFloat()) {
         DCHECK(r_src.IsSingle());
-        DCHECK_EQ(encoded_disp & 0x3, 0);
         opcode = kA64Str3fXD;
-        if (displacement <= 1020) {
-          short_form = true;
-          encoded_disp >>= 2;
-        }
-        break;
-      }
-
-      if (displacement <= 16380 && displacement >= 0) {
-        DCHECK_EQ((displacement & 0x3), 0);
-        short_form = true;
-        encoded_disp >>= 2;
+      } else {
         opcode = kA64Str3rXD;
       }
       break;
     case kUnsignedHalf:
     case kSignedHalf:
-      DCHECK_EQ((displacement & 0x1), 0);
-      short_form = true;
-      encoded_disp >>= 1;
+      scale = 1;
       opcode = kA64Strh3wXF;
       break;
     case kUnsignedByte:
     case kSignedByte:
-      short_form = true;
       opcode = kA64Strb3wXd;
       break;
     default:
       LOG(FATAL) << "Bad size: " << size;
   }
 
-  if (short_form) {
-    store = NewLIR3(opcode, r_src.GetReg(), r_base.GetReg(), encoded_disp);
+  bool displacement_is_aligned = (displacement & ((1 << scale) - 1)) == 0;
+  int scaled_disp = displacement >> scale;
+  if (displacement_is_aligned && scaled_disp >= 0 && scaled_disp < 4096) {
+    // Can use scaled store.
+    store = NewLIR3(opcode, r_src.GetReg(), r_base.GetReg(), scaled_disp);
+  } else if (alt_opcode != kA64Brk1d && IS_SIGNED_IMM9(displacement)) {
+    // Can use unscaled store.
+    store = NewLIR3(alt_opcode, r_src.GetReg(), r_base.GetReg(), displacement);
   } else {
+    // Use long sequence.
     RegStorage r_scratch = AllocTemp();
-    LoadConstant(r_scratch, encoded_disp);
-    if (r_src.IsFloat()) {
-      // No index ops - must use a long sequence.  Turn the offset into a direct pointer.
-      OpRegReg(kOpAdd, r_scratch, r_base);
-      store = StoreBaseDispBody(r_scratch, 0, r_src, size);
-    } else {
-      store = StoreBaseIndexed(r_base, r_scratch, r_src, 0, size);
-    }
+    LoadConstant(r_scratch, displacement);
+    store = StoreBaseIndexed(r_base, r_scratch, r_src, 0, size);
     FreeTemp(r_scratch);
   }
 
-  // TODO: In future, may need to differentiate Dalvik & spill accesses
+  // TODO: In future, may need to differentiate Dalvik & spill accesses.
   if (r_base == rs_rA64_SP) {
     AnnotateDalvikRegAccess(store, displacement >> 2, false /* is_load */, r_src.Is64Bit());
   }