AArch64: Implement InexpensiveConstant methods.

Implement IsInexpensiveConstant() and friends for A64. Also extend the
methods to take the opcode with respect to which the constant is
inexpensive. Additionally, logical operations (i.e. and, or, xor) can
now handle the immediates 0 and ~0, which are not encodable as logical
immediates.
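
For example (a sketch; the exact registers depend on the surrounding
code), a logical operation whose immediate is 0 or ~0 is now reduced to
a single move instead of materializing the constant in a scratch
register:

  orr w0, w1, #0    ->  mov w0, w1
  and w0, w1, #0    ->  mov w0, wzr
  eor w0, w1, #~0   ->  mvn w0, w1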

Change-Id: I46ce1287703765c5ab54983d13c1b3a1f5838622
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 2c587b8..4680f8f 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -240,6 +240,7 @@
   void OpRegCopyWide(RegStorage dest, RegStorage src) OVERRIDE;
 
   bool InexpensiveConstantInt(int32_t value) OVERRIDE;
+  bool InexpensiveConstantInt(int32_t value, Instruction::Code opcode) OVERRIDE;
   bool InexpensiveConstantFloat(int32_t value) OVERRIDE;
   bool InexpensiveConstantLong(int64_t value) OVERRIDE;
   bool InexpensiveConstantDouble(int64_t value) OVERRIDE;
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index 9403d5e..cc0d967 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -1192,22 +1192,7 @@
 
 void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                                      RegLocation rl_src1, RegLocation rl_src2) {
-  if ((opcode == Instruction::SUB_LONG) || (opcode == Instruction::SUB_LONG_2ADDR)) {
-    if (!rl_src2.is_const) {
-      return GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
-    }
-  } else {
-    // Associativity.
-    if (!rl_src2.is_const) {
-      DCHECK(rl_src1.is_const);
-      std::swap(rl_src1, rl_src2);
-    }
-  }
-  DCHECK(rl_src2.is_const);
-
   OpKind op = kOpBkpt;
-  int64_t val = mir_graph_->ConstantValueWide(rl_src2);
-
   switch (opcode) {
     case Instruction::ADD_LONG:
     case Instruction::ADD_LONG_2ADDR:
@@ -1233,6 +1218,20 @@
       LOG(FATAL) << "Unexpected opcode";
   }
 
+  if (op == kOpSub) {
+    if (!rl_src2.is_const) {
+      return GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
+    }
+  } else {
+    // Commutativity.
+    if (!rl_src2.is_const) {
+      DCHECK(rl_src1.is_const);
+      std::swap(rl_src1, rl_src2);
+    }
+  }
+  DCHECK(rl_src2.is_const);
+  int64_t val = mir_graph_->ConstantValueWide(rl_src2);
+
   rl_src1 = LoadValueWide(rl_src1, kCoreReg);
   RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
   OpRegRegImm64(op, rl_result.reg, rl_src1.reg, val);
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index cd1840a..5326e74 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -269,8 +269,47 @@
   return (n << 12 | imm_r << 6 | imm_s);
 }
 
+// Maximum number of instructions to use for encoding the immediate.
+static const int max_num_ops_per_const_load = 2;
+
+/**
+ * @brief Return the number of fast halfwords in the given uint64_t integer.
+ * @details The input integer is split into 4 halfwords (bits 0-15, 16-31, 32-47, 48-63). The
+ *   number of fast halfwords (halfwords that are either 0 or 0xffff) is returned; see @return
+ *   below for the exact encoding.
+ * @param value The input 64-bit integer.
+ * @return A value @c retval such that (retval & 0x7) is the maximum of n and m, where n is the
+ *   number of halfwords with all bits unset (0) and m is the number of halfwords with all bits
+ *   set (0xffff). Additionally, (retval & 0x8) is set when m > n.
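+ *
+ *   For example, GetNumFastHalfWords(0x0000ffff00000000) returns 3 (three 0x0000 halfwords),
+ *   while GetNumFastHalfWords(0xffffffff0000ffff) returns (3 | 0x8) (three 0xffff halfwords).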
+ */
+static int GetNumFastHalfWords(uint64_t value) {
+  unsigned int num_0000_halfwords = 0;
+  unsigned int num_ffff_halfwords = 0;
+  for (int shift = 0; shift < 64; shift += 16) {
+    uint16_t halfword = static_cast<uint16_t>(value >> shift);
+    if (halfword == 0)
+      num_0000_halfwords++;
+    else if (halfword == UINT16_C(0xffff))
+      num_ffff_halfwords++;
+  }
+  if (num_0000_halfwords >= num_ffff_halfwords) {
+    DCHECK_LE(num_0000_halfwords, 4U);
+    return num_0000_halfwords;
+  } else {
+    DCHECK_LE(num_ffff_halfwords, 4U);
+    return num_ffff_halfwords | 0x8;
+  }
+}
+
+// The InexpensiveConstantXXX variants below are used by the promotion algorithm to decide how
+// strongly a constant should be considered for promotion. If the constant is "inexpensive", the
+// promotion algorithm gives it a low priority, even when it is referenced many times in the code.
+
 bool Arm64Mir2Lir::InexpensiveConstantInt(int32_t value) {
-  return false;  // (ModifiedImmediate(value) >= 0) || (ModifiedImmediate(~value) >= 0);
+  // A 32-bit int can always be loaded with 2 instructions (and without using the literal pool).
+  // We therefore return true and give it a low priority for promotion.
+  return true;
 }
 
 bool Arm64Mir2Lir::InexpensiveConstantFloat(int32_t value) {
@@ -278,13 +317,70 @@
 }
 
 bool Arm64Mir2Lir::InexpensiveConstantLong(int64_t value) {
-  return InexpensiveConstantInt(High32Bits(value)) && InexpensiveConstantInt(Low32Bits(value));
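+  // A 64-bit constant is inexpensive if it can be materialized in at most two movz/movn/movk
+  // instructions (e.g. 0x0000123400005678 needs a movz plus one movk) or if it is a valid
+  // logical immediate.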
+  int num_slow_halfwords = 4 - (GetNumFastHalfWords(value) & 0x7);
+  if (num_slow_halfwords <= max_num_ops_per_const_load) {
+    return true;
+  }
+  return (EncodeLogicalImmediate(/*is_wide=*/true, value) >= 0);
 }
 
 bool Arm64Mir2Lir::InexpensiveConstantDouble(int64_t value) {
   return EncodeImmDouble(value) >= 0;
 }
 
+// The InexpensiveConstantXXX variants below are used to determine which A64 instructions to use
+// when one of the operands is an immediate (e.g. the immediate form vs. the register form of add).
+
+bool Arm64Mir2Lir::InexpensiveConstantInt(int32_t value, Instruction::Code opcode) {
+  switch (opcode) {
+  case Instruction::IF_EQ:
+  case Instruction::IF_NE:
+  case Instruction::IF_LT:
+  case Instruction::IF_GE:
+  case Instruction::IF_GT:
+  case Instruction::IF_LE:
+  case Instruction::ADD_INT:
+  case Instruction::ADD_INT_2ADDR:
+  case Instruction::SUB_INT:
+  case Instruction::SUB_INT_2ADDR:
+    // The code below is consistent with the implementation of OpRegRegImm().
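+    // A64 add/sub/cmp immediates are 12 bits wide, optionally shifted left by 12, so e.g. 4095
+    // and 0xfff000 are encodable while 0x1001 is not.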
+    {
+      int32_t abs_value = std::abs(value);
+      if (abs_value < 0x1000) {
+        return true;
+      } else if ((abs_value & UINT64_C(0xfff)) == 0 && ((abs_value >> 12) < 0x1000)) {
+        return true;
+      }
+      return false;
+    }
+  case Instruction::SHL_INT:
+  case Instruction::SHL_INT_2ADDR:
+  case Instruction::SHR_INT:
+  case Instruction::SHR_INT_2ADDR:
+  case Instruction::USHR_INT:
+  case Instruction::USHR_INT_2ADDR:
+    return true;
+  case Instruction::AND_INT:
+  case Instruction::AND_INT_2ADDR:
+  case Instruction::AND_INT_LIT16:
+  case Instruction::AND_INT_LIT8:
+  case Instruction::OR_INT:
+  case Instruction::OR_INT_2ADDR:
+  case Instruction::OR_INT_LIT16:
+  case Instruction::OR_INT_LIT8:
+  case Instruction::XOR_INT:
+  case Instruction::XOR_INT_2ADDR:
+  case Instruction::XOR_INT_LIT16:
+  case Instruction::XOR_INT_LIT8:
+    if (value == 0 || value == INT32_C(-1)) {
+      return true;
+    }
+    return (EncodeLogicalImmediate(/*is_wide=*/false, value) >= 0);
+  default:
+    return false;
+  }
+}
+
 /*
  * Load a immediate using one single instruction when possible; otherwise
  * use a pair of movz and movk instructions.
@@ -358,9 +454,6 @@
 
 // TODO: clean up the names. LoadConstantWide() should really be LoadConstantNoClobberWide().
 LIR* Arm64Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) {
-  // Maximum number of instructions to use for encoding the immediate.
-  const int max_num_ops = 2;
-
   if (r_dest.IsFloat()) {
     return LoadFPConstantValueWide(r_dest, value);
   }
@@ -378,19 +471,12 @@
   }
 
   // At least one in value's halfwords is not 0x0, nor 0xffff: find out how many.
-  int num_0000_halfwords = 0;
-  int num_ffff_halfwords = 0;
   uint64_t uvalue = static_cast<uint64_t>(value);
-  for (int shift = 0; shift < 64; shift += 16) {
-    uint16_t halfword = static_cast<uint16_t>(uvalue >> shift);
-    if (halfword == 0)
-      num_0000_halfwords++;
-    else if (halfword == UINT16_C(0xffff))
-      num_ffff_halfwords++;
-  }
-  int num_fast_halfwords = std::max(num_0000_halfwords, num_ffff_halfwords);
+  int num_fast_halfwords = GetNumFastHalfWords(uvalue);
+  int num_slow_halfwords = 4 - (num_fast_halfwords & 0x7);
+  bool more_ffff_halfwords = (num_fast_halfwords & 0x8) != 0;
 
-  if (num_fast_halfwords < 3) {
+  if (num_slow_halfwords > 1) {
     // A single movz/movn is not enough. Try the logical immediate route.
     int log_imm = EncodeLogicalImmediate(/*is_wide=*/true, value);
     if (log_imm >= 0) {
@@ -398,19 +484,19 @@
     }
   }
 
-  if (num_fast_halfwords >= 4 - max_num_ops) {
+  if (num_slow_halfwords <= max_num_ops_per_const_load) {
     // We can encode the number using a movz/movn followed by one or more movk.
     ArmOpcode op;
     uint16_t background;
     LIR* res = nullptr;
 
     // Decide whether to use a movz or a movn.
-    if (num_0000_halfwords >= num_ffff_halfwords) {
-      op = WIDE(kA64Movz3rdM);
-      background = 0;
-    } else {
+    if (more_ffff_halfwords) {
       op = WIDE(kA64Movn3rdM);
       background = 0xffff;
+    } else {
+      op = WIDE(kA64Movz3rdM);
+      background = 0;
     }
 
     // Emit the first instruction (movz, movn).
@@ -726,7 +812,7 @@
   int64_t abs_value = (neg) ? -value : value;
   ArmOpcode opcode = kA64Brk1d;
   ArmOpcode alt_opcode = kA64Brk1d;
-  int32_t log_imm = -1;
+  bool is_logical = false;
   bool is_wide = r_dest.Is64Bit();
   ArmOpcode wide = (is_wide) ? WIDE(0) : UNWIDE(0);
   int info = 0;
@@ -761,65 +847,89 @@
         opcode = (neg) ? kA64Add4RRdT : kA64Sub4RRdT;
         return NewLIR4(opcode | wide, r_dest.GetReg(), r_src1.GetReg(), abs_value >> 12, 1);
       } else {
-        log_imm = -1;
         alt_opcode = (op == kOpAdd) ? kA64Add4RRre : kA64Sub4RRre;
         info = EncodeExtend(is_wide ? kA64Uxtx : kA64Uxtw, 0);
       }
       break;
-    // case kOpRsub:
-    //   opcode = kThumb2RsubRRI8M;
-    //   alt_opcode = kThumb2RsubRRR;
-    //   break;
     case kOpAdc:
-      log_imm = -1;
       alt_opcode = kA64Adc3rrr;
       break;
     case kOpSbc:
-      log_imm = -1;
       alt_opcode = kA64Sbc3rrr;
       break;
     case kOpOr:
-      log_imm = EncodeLogicalImmediate(is_wide, value);
+      is_logical = true;
       opcode = kA64Orr3Rrl;
       alt_opcode = kA64Orr4rrro;
       break;
     case kOpAnd:
-      log_imm = EncodeLogicalImmediate(is_wide, value);
+      is_logical = true;
       opcode = kA64And3Rrl;
       alt_opcode = kA64And4rrro;
       break;
     case kOpXor:
-      log_imm = EncodeLogicalImmediate(is_wide, value);
+      is_logical = true;
       opcode = kA64Eor3Rrl;
       alt_opcode = kA64Eor4rrro;
       break;
     case kOpMul:
       // TUNING: power of 2, shift & add
-      log_imm = -1;
       alt_opcode = kA64Mul3rrr;
       break;
     default:
       LOG(FATAL) << "Bad opcode: " << op;
   }
 
-  if (log_imm >= 0) {
-    return NewLIR3(opcode | wide, r_dest.GetReg(), r_src1.GetReg(), log_imm);
-  } else {
-    RegStorage r_scratch;
-    if (is_wide) {
-      r_scratch = AllocTempWide();
-      LoadConstantWide(r_scratch, value);
+  if (is_logical) {
+    int log_imm = EncodeLogicalImmediate(is_wide, value);
+    if (log_imm >= 0) {
+      return NewLIR3(opcode | wide, r_dest.GetReg(), r_src1.GetReg(), log_imm);
     } else {
-      r_scratch = AllocTemp();
-      LoadConstant(r_scratch, value);
+      // When the immediate is either 0 or ~0, the logical operation can be trivially reduced
+      // to a (possibly negated) assignment.
+      if (value == 0) {
+        switch (op) {
+          case kOpOr:
+          case kOpXor:
+            // Or/Xor by zero reduces to an assignment.
+            return NewLIR2(kA64Mov2rr | wide, r_dest.GetReg(), r_src1.GetReg());
+          default:
+            // And by zero reduces to a `mov rdest, xzr'.
+            DCHECK(op == kOpAnd);
+            return NewLIR2(kA64Mov2rr | wide, r_dest.GetReg(), (is_wide) ? rxzr : rwzr);
+        }
+      } else if (value == INT64_C(-1)
+                 || (!is_wide && static_cast<uint32_t>(value) == ~UINT32_C(0))) {
+        switch (op) {
+          case kOpAnd:
+            // And by -1 reduces to an assignment.
+            return NewLIR2(kA64Mov2rr | wide, r_dest.GetReg(), r_src1.GetReg());
+          case kOpXor:
+            // Xor by -1 reduces to an `mvn rdest, rsrc'.
+            return NewLIR2(kA64Mvn2rr | wide, r_dest.GetReg(), r_src1.GetReg());
+          default:
+            // Or by -1 reduces to a `mvn rdest, xzr'.
+            DCHECK(op == kOpOr);
+            return NewLIR2(kA64Mvn2rr | wide, r_dest.GetReg(), (is_wide) ? rxzr : rwzr);
+        }
+      }
     }
-    if (EncodingMap[alt_opcode].flags & IS_QUAD_OP)
-      res = NewLIR4(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg(), info);
-    else
-      res = NewLIR3(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg());
-    FreeTemp(r_scratch);
-    return res;
   }
+
+  RegStorage r_scratch;
+  if (is_wide) {
+    r_scratch = AllocTempWide();
+    LoadConstantWide(r_scratch, value);
+  } else {
+    r_scratch = AllocTemp();
+    LoadConstant(r_scratch, value);
+  }
+  if (EncodingMap[alt_opcode].flags & IS_QUAD_OP)
+    res = NewLIR4(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg(), info);
+  else
+    res = NewLIR3(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg());
+  FreeTemp(r_scratch);
+  return res;
 }
 
 LIR* Arm64Mir2Lir::OpRegImm(OpKind op, RegStorage r_dest_src1, int value) {
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index aae9155..3e03750 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -256,7 +256,7 @@
     RegLocation rl_temp = UpdateLoc(rl_src2);
     int32_t constant_value = mir_graph_->ConstantValue(rl_src2);
     if ((rl_temp.location == kLocDalvikFrame) &&
-        InexpensiveConstantInt(constant_value)) {
+        InexpensiveConstantInt(constant_value, opcode)) {
       // OK - convert this to a compare immediate and branch
       OpCmpImmBranch(cond, rl_src1.reg, mir_graph_->ConstantValue(rl_src2), taken);
       return;
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index 4d8b91e..e519011 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -926,11 +926,11 @@
     case Instruction::XOR_INT:
     case Instruction::XOR_INT_2ADDR:
       if (rl_src[0].is_const &&
-          InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[0]))) {
+          InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[0]), opcode)) {
         GenArithOpIntLit(opcode, rl_dest, rl_src[1],
                              mir_graph_->ConstantValue(rl_src[0].orig_sreg));
       } else if (rl_src[1].is_const &&
-          InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]))) {
+                 InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]), opcode)) {
         GenArithOpIntLit(opcode, rl_dest, rl_src[0],
                              mir_graph_->ConstantValue(rl_src[1].orig_sreg));
       } else {
@@ -951,7 +951,7 @@
     case Instruction::USHR_INT:
     case Instruction::USHR_INT_2ADDR:
       if (rl_src[1].is_const &&
-          InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]))) {
+          InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]), opcode)) {
         GenArithOpIntLit(opcode, rl_dest, rl_src[0], mir_graph_->ConstantValue(rl_src[1]));
       } else {
         GenArithOpInt(opcode, rl_dest, rl_src[0], rl_src[1]);
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 70c17da..d8b3083 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -1445,6 +1445,9 @@
     virtual bool InexpensiveConstantFloat(int32_t value) = 0;
     virtual bool InexpensiveConstantLong(int64_t value) = 0;
     virtual bool InexpensiveConstantDouble(int64_t value) = 0;
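+    // Whether the given constant is cheap with respect to the specific opcode that consumes it.
+    // By default the opcode is ignored; backends (e.g. Arm64) can override this to refine the
+    // answer per instruction.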
+    virtual bool InexpensiveConstantInt(int32_t value, Instruction::Code opcode) {
+      return InexpensiveConstantInt(value);
+    }
 
     // May be optimized by targets.
     virtual void GenMonitorEnter(int opt_flags, RegLocation rl_src);
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index 45244e1..be966e1 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -1171,12 +1171,13 @@
       } else {
         counts[p_map_idx].count += use_count;
       }
-    } else if (!IsInexpensiveConstant(loc)) {
+    } else {
       if (loc.wide && WideGPRsAreAliases()) {
-        // Longs and doubles can be counted together.
         i++;
       }
-      counts[p_map_idx].count += use_count;
+      if (!IsInexpensiveConstant(loc)) {
+        counts[p_map_idx].count += use_count;
+      }
     }
   }
 }
@@ -1185,9 +1186,10 @@
 static int SortCounts(const void *val1, const void *val2) {
   const Mir2Lir::RefCounts* op1 = reinterpret_cast<const Mir2Lir::RefCounts*>(val1);
   const Mir2Lir::RefCounts* op2 = reinterpret_cast<const Mir2Lir::RefCounts*>(val2);
-  // Note that we fall back to sorting on reg so we get stable output
-  // on differing qsort implementations (such as on host and target or
-  // between local host and build servers).
+  // Note that we fall back to sorting on reg so we get stable output on differing qsort
+  // implementations (such as on host and target or between local host and build servers).
+  // Note also that if a wide val1 and a non-wide val2 have the same count, then val1 always
+  // "loses" (since STARTING_WIDE_SREG is or-ed into val1->s_reg, making it sort later).
   return (op1->count == op2->count)
           ? (op1->s_reg - op2->s_reg)
           : (op1->count < op2->count ? 1 : -1);
@@ -1230,8 +1232,8 @@
    * TUNING: replace with linear scan once we have the ability
    * to describe register live ranges for GC.
    */
-  size_t core_reg_count_size = cu_->target64 ? num_regs * 2 : num_regs;
-  size_t fp_reg_count_size = num_regs * 2;
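+  // When wide registers do not alias the narrow ones, wide uses are counted separately, which
+  // requires twice as many slots in the count arrays.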
+  size_t core_reg_count_size = WideGPRsAreAliases() ? num_regs : num_regs * 2;
+  size_t fp_reg_count_size = WideFPRsAreAliases() ? num_regs : num_regs * 2;
   RefCounts *core_regs =
       static_cast<RefCounts*>(arena_->Alloc(sizeof(RefCounts) * core_reg_count_size,
                                             kArenaAllocRegAlloc));
@@ -1261,7 +1263,6 @@
   // Sum use counts of SSA regs by original Dalvik vreg.
   CountRefs(core_regs, fp_regs, num_regs);
 
-
   // Sort the count arrays
   qsort(core_regs, core_reg_count_size, sizeof(RefCounts), SortCounts);
   qsort(fp_regs, fp_reg_count_size, sizeof(RefCounts), SortCounts);