AArch64: improve 64-bit immediate loads.

Improve the quick backend's loading of 64-bit immediates by choosing
the best of the following strategies:

  - use wzr, xzr to load 0 (via mov) or -1 (via mvn),
  - use logical immediates (orr),
  - use one movz/movn optionally followed by one or more movk,
  - use the literal pool.
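
For example (illustrative values; register choice is arbitrary):

  0x0000000000000000  ->  mov  x0, xzr
  0xffffffffffffffff  ->  mvn  x0, xzr
  0x00ff00ff00ff00ff  ->  orr  x0, xzr, #0x00ff00ff00ff00ff
  0x00000000f00dfeed  ->  movz x0, #0xfeed
                          movk x0, #0xf00d, lsl #16
  0x1234567843218765  ->  ldr  x0, <literal pool entry>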

Change-Id: I8e46e6d9eaf46b717761dd9d60e63ee3f2a5422b
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index a79c4fa..b1b83f0 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -306,8 +306,8 @@
       return ret_val;
     }
 
-    LIR* LoadFPConstantValue(int r_dest, int32_t value);
-    LIR* LoadFPConstantValueWide(int r_dest, int64_t value);
+    LIR* LoadFPConstantValue(RegStorage r_dest, int32_t value);
+    LIR* LoadFPConstantValueWide(RegStorage r_dest, int64_t value);
     void ReplaceFixup(LIR* prev_lir, LIR* orig_lir, LIR* new_lir);
     void InsertFixupBefore(LIR* prev_lir, LIR* orig_lir, LIR* new_lir);
     void AssignDataOffsets();
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index f384293..2254b8b 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -86,14 +86,14 @@
   return (bit7 | bit6 | bit5_to_0);
 }
 
-LIR* Arm64Mir2Lir::LoadFPConstantValue(int r_dest, int32_t value) {
-  DCHECK(RegStorage::IsSingle(r_dest));
+LIR* Arm64Mir2Lir::LoadFPConstantValue(RegStorage r_dest, int32_t value) {
+  DCHECK(r_dest.IsSingle());
   if (value == 0) {
-    return NewLIR2(kA64Fmov2sw, r_dest, rwzr);
+    return NewLIR2(kA64Fmov2sw, r_dest.GetReg(), rwzr);
   } else {
     int32_t encoded_imm = EncodeImmSingle((uint32_t)value);
     if (encoded_imm >= 0) {
-      return NewLIR2(kA64Fmov2fI, r_dest, encoded_imm);
+      return NewLIR2(kA64Fmov2fI, r_dest.GetReg(), encoded_imm);
     }
   }
 
@@ -104,19 +104,19 @@
 
   ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral);
   LIR* load_pc_rel = RawLIR(current_dalvik_offset_, kA64Ldr2fp,
-                            r_dest, 0, 0, 0, 0, data_target);
+                            r_dest.GetReg(), 0, 0, 0, 0, data_target);
   AppendLIR(load_pc_rel);
   return load_pc_rel;
 }
 
-LIR* Arm64Mir2Lir::LoadFPConstantValueWide(int r_dest, int64_t value) {
-  DCHECK(RegStorage::IsDouble(r_dest));
+LIR* Arm64Mir2Lir::LoadFPConstantValueWide(RegStorage r_dest, int64_t value) {
+  DCHECK(r_dest.IsDouble());
   if (value == 0) {
-    return NewLIR2(kA64Fmov2Sx, r_dest, rxzr);
+    return NewLIR2(kA64Fmov2Sx, r_dest.GetReg(), rxzr);
   } else {
     int32_t encoded_imm = EncodeImmDouble(value);
     if (encoded_imm >= 0) {
-      return NewLIR2(FWIDE(kA64Fmov2fI), r_dest, encoded_imm);
+      return NewLIR2(FWIDE(kA64Fmov2fI), r_dest.GetReg(), encoded_imm);
     }
   }
 
@@ -128,20 +128,19 @@
     data_target = AddWideData(&literal_list_, val_lo, val_hi);
   }
 
-  DCHECK(RegStorage::IsFloat(r_dest));
   ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral);
   LIR* load_pc_rel = RawLIR(current_dalvik_offset_, FWIDE(kA64Ldr2fp),
-                            r_dest, 0, 0, 0, 0, data_target);
+                            r_dest.GetReg(), 0, 0, 0, 0, data_target);
   AppendLIR(load_pc_rel);
   return load_pc_rel;
 }
 
 static int CountLeadingZeros(bool is_wide, uint64_t value) {
-  return (is_wide) ? __builtin_clzl(value) : __builtin_clz((uint32_t)value);
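+  // Use the 'll' builtins: unsigned long may be only 32 bits wide on the host.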
+  return (is_wide) ? __builtin_clzll(value) : __builtin_clz((uint32_t)value);
 }
 
 static int CountTrailingZeros(bool is_wide, uint64_t value) {
-  return (is_wide) ? __builtin_ctzl(value) : __builtin_ctz((uint32_t)value);
+  return (is_wide) ? __builtin_ctzll(value) : __builtin_ctz((uint32_t)value);
 }
 
 static int CountSetBits(bool is_wide, uint64_t value) {
@@ -276,12 +275,16 @@
   LIR* res;
 
   if (r_dest.IsFloat()) {
-    return LoadFPConstantValue(r_dest.GetReg(), value);
+    return LoadFPConstantValue(r_dest, value);
+  }
+
+  if (r_dest.Is64Bit()) {
+    return LoadConstantWide(r_dest, value);
   }
 
   // Loading SP/ZR with an immediate is not supported.
-  DCHECK_NE(r_dest.GetReg(), rwsp);
-  DCHECK_NE(r_dest.GetReg(), rwzr);
+  DCHECK(!A64_REG_IS_SP(r_dest.GetReg()));
+  DCHECK(!A64_REG_IS_ZR(r_dest.GetReg()));
 
   // Compute how many movk, movz instructions are needed to load the value.
   uint16_t high_bits = High16Bits(value);
@@ -331,6 +334,98 @@
   return res;
 }
 
+// TODO: clean up the names. LoadConstantWide() should really be LoadConstantNoClobberWide().
+LIR* Arm64Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) {
+  // Maximum number of instructions to use for encoding the immediate.
+  const int max_num_ops = 2;
+
+  if (r_dest.IsFloat()) {
+    return LoadFPConstantValueWide(r_dest, value);
+  }
+
+  DCHECK(r_dest.Is64Bit());
+
+  // Loading SP/ZR with an immediate is not supported.
+  DCHECK(!A64_REG_IS_SP(r_dest.GetReg()));
+  DCHECK(!A64_REG_IS_ZR(r_dest.GetReg()));
+
+  if (LIKELY(value == INT64_C(0) || value == INT64_C(-1))) {
+    // value is either 0 or -1: we can just use xzr.
+    ArmOpcode opcode = LIKELY(value == 0) ? WIDE(kA64Mov2rr) : WIDE(kA64Mvn2rr);
+    return NewLIR2(opcode, r_dest.GetReg(), rxzr);
+  }
+
+  // Not all halfwords are 0x0000, and not all are 0xffff: count each kind.
+  int num_0000_halfwords = 0;
+  int num_ffff_halfwords = 0;
+  uint64_t uvalue = static_cast<uint64_t>(value);
+  for (int shift = 0; shift < 64; shift += 16) {
+    uint16_t halfword = static_cast<uint16_t>(uvalue >> shift);
+    if (halfword == 0) {
+      num_0000_halfwords++;
+    } else if (halfword == UINT16_C(0xffff)) {
+      num_ffff_halfwords++;
+    }
+  }
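+  // Halfwords matching the background (0x0000 for movz, 0xffff for movn) come for free.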
+  int num_fast_halfwords = std::max(num_0000_halfwords, num_ffff_halfwords);
+
+  if (num_fast_halfwords < 3) {
+    // A single movz/movn is not enough. Try the logical immediate route.
+    int log_imm = EncodeLogicalImmediate(/*is_wide=*/true, value);
+    if (log_imm >= 0) {
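+      // orr-ing the immediate with the zero register materializes it in r_dest.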
+      return NewLIR3(WIDE(kA64Orr3Rrl), r_dest.GetReg(), rxzr, log_imm);
+    }
+  }
+
+  if (num_fast_halfwords >= 4 - max_num_ops) {
+    // We can encode the value using a movz/movn optionally followed by movk instructions.
+    ArmOpcode op;
+    uint16_t background;
+    LIR* res = nullptr;
+
+    // Decide whether to use a movz or a movn.
+    if (num_0000_halfwords >= num_ffff_halfwords) {
+      op = WIDE(kA64Movz3rdM);
+      background = 0;
+    } else {
+      op = WIDE(kA64Movn3rdM);
+      background = 0xffff;
+    }
+
+    // Emit the first instruction (movz, movn).
+    int shift;
+    for (shift = 0; shift < 4; shift++) {
+      uint16_t halfword = static_cast<uint16_t>(uvalue >> (shift << 4));
+      if (halfword != background) {
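+        // The movn encoding holds the inverted halfword; xor-ing with the
+        // 0xffff background computes that (and is a no-op for movz).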
+        res = NewLIR3(op, r_dest.GetReg(), halfword ^ background, shift);
+        break;
+      }
+    }
+
+    // Emit the movk instructions.
+    for (shift++; shift < 4; shift++) {
+      uint16_t halfword = static_cast<uint16_t>(uvalue >> (shift << 4));
+      if (halfword != background) {
+        NewLIR3(WIDE(kA64Movk3rdM), r_dest.GetReg(), halfword, shift);
+      }
+    }
+    return res;
+  }
+
+  // Use the literal pool.
+  int32_t val_lo = Low32Bits(value);
+  int32_t val_hi = High32Bits(value);
+  LIR* data_target = ScanLiteralPoolWide(literal_list_, val_lo, val_hi);
+  if (data_target == nullptr) {
+    data_target = AddWideData(&literal_list_, val_lo, val_hi);
+  }
+
+  ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral);
+  LIR* res = RawLIR(current_dalvik_offset_, WIDE(kA64Ldr2rp),
+                    r_dest.GetReg(), 0, 0, 0, 0, data_target);
+  AppendLIR(res);
+  return res;
+}
+
 LIR* Arm64Mir2Lir::OpUnconditionalBranch(LIR* target) {
   LIR* res = NewLIR1(kA64B1t, 0 /* offset to be patched  during assembly */);
   res->target = target;
@@ -738,29 +833,6 @@
     return NewLIR3(opcode | wide, r_dest_src1.GetReg(), abs_value, (shift) ? 1 : 0);
 }
 
-LIR* Arm64Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) {
-  if (r_dest.IsFloat()) {
-    return LoadFPConstantValueWide(r_dest.GetReg(), value);
-  } else {
-    // TODO(Arm64): check whether we can load the immediate with a short form.
-    //   e.g. via movz, movk or via logical immediate.
-
-    // No short form - load from the literal pool.
-    int32_t val_lo = Low32Bits(value);
-    int32_t val_hi = High32Bits(value);
-    LIR* data_target = ScanLiteralPoolWide(literal_list_, val_lo, val_hi);
-    if (data_target == NULL) {
-      data_target = AddWideData(&literal_list_, val_lo, val_hi);
-    }
-
-    ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral);
-    LIR* res = RawLIR(current_dalvik_offset_, WIDE(kA64Ldr2rp),
-                      r_dest.GetReg(), 0, 0, 0, 0, data_target);
-    AppendLIR(res);
-    return res;
-  }
-}
-
 int Arm64Mir2Lir::EncodeShift(int shift_type, int amount) {
   return ((shift_type & 0x3) << 7) | (amount & 0x1f);
 }