ART: Implement loop full unrolling.

Performs whole loop unrolling for small loops with small
trip count to eliminate the loop check overhead, to have
more opportunities for inter-iteration optimizations.

caffeinemark/FloatAtom: 1.2x performance on arm64 Cortex-A57.

Test: 530-checker-peel-unroll.
Test: test-art-host, test-art-target.
Change-Id: Idf3fe3cb611376935d176c60db8c49907222e28a
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index efb23e7..d355ced 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -84,6 +84,8 @@
   static constexpr uint32_t kScalarHeuristicMaxBodySizeInstr = 17;
   // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
   static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6;
+  // Maximum number of instructions to be created as a result of full unrolling.
+  static constexpr uint32_t kScalarHeuristicFullyUnrolledMaxInstrThreshold = 35;
 
   bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* analysis_info) const OVERRIDE {
     return analysis_info->HasLongTypeInstructions() ||
@@ -108,6 +110,14 @@
 
   bool IsLoopPeelingEnabled() const OVERRIDE { return true; }
 
+  bool IsFullUnrollingBeneficial(LoopAnalysisInfo* analysis_info) const OVERRIDE {
+    int64_t trip_count = analysis_info->GetTripCount();
+    // We assume that trip count is known.
+    DCHECK_NE(trip_count, LoopAnalysisInfo::kUnknownTripCount);
+    size_t instr_num = analysis_info->GetNumberOfInstructions();
+    return (trip_count * instr_num < kScalarHeuristicFullyUnrolledMaxInstrThreshold);
+  }
+
  protected:
   bool IsLoopTooBig(LoopAnalysisInfo* loop_analysis_info,
                     size_t instr_threshold,
diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h
index bcb7b70..57509ee 100644
--- a/compiler/optimizing/loop_analysis.h
+++ b/compiler/optimizing/loop_analysis.h
@@ -160,6 +160,13 @@
   // Returns 'false' by default, should be overridden by particular target loop helper.
   virtual bool IsLoopPeelingEnabled() const { return false; }
 
+  // Returns whether it is beneficial to fully unroll the loop.
+  //
+  // Returns 'false' by default, should be overridden by particular target loop helper.
+  virtual bool IsFullUnrollingBeneficial(LoopAnalysisInfo* analysis_info ATTRIBUTE_UNUSED) const {
+    return false;
+  }
+
   // Returns optimal SIMD unrolling factor for the loop.
   //
   // Returns kNoUnrollingFactor by default, should be overridden by particular target loop helper.
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 440cd33..7d66155 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -422,6 +422,15 @@
   }
 }
 
+// Peel the first 'count' iterations of the loop.
+static void PeelByCount(HLoopInformation* loop_info, int count) {
+  for (int i = 0; i < count; i++) {
+    // Perform peeling.
+    PeelUnrollSimpleHelper helper(loop_info);
+    helper.DoPeeling();
+  }
+}
+
 //
 // Public methods.
 //
@@ -811,6 +820,45 @@
   return true;
 }
 
+bool HLoopOptimization::TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code) {
+  // Fully unroll loops with a known and small trip count.
+  int64_t trip_count = analysis_info->GetTripCount();
+  if (!arch_loop_helper_->IsLoopPeelingEnabled() ||
+      trip_count == LoopAnalysisInfo::kUnknownTripCount ||
+      !arch_loop_helper_->IsFullUnrollingBeneficial(analysis_info)) {
+    return false;
+  }
+
+  if (generate_code) {
+    // Peeling of the N first iterations (where N equals to the trip count) will effectively
+    // eliminate the loop: after peeling we will have N sequential iterations copied into the loop
+    // preheader and the original loop. The trip count of this loop will be 0 as the sequential
+    // iterations are executed first and there are exactly N of them. Thus we can statically
+    // evaluate the loop exit condition to 'false' and fully eliminate it.
+    //
+    // Here is an example of full unrolling of a loop with a trip count 2:
+    //
+    //                                           loop_cond_1
+    //                                           loop_body_1        <- First iteration.
+    //                                               |
+    //                             \                 v
+    //                            ==\            loop_cond_2
+    //                            ==/            loop_body_2        <- Second iteration.
+    //                             /                 |
+    //               <-                              v     <-
+    //     loop_cond   \                         loop_cond   \      <- This cond is always false.
+    //     loop_body  _/                         loop_body  _/
+    //
+    HLoopInformation* loop_info = analysis_info->GetLoopInfo();
+    PeelByCount(loop_info, trip_count);
+    HIf* loop_hif = loop_info->GetHeader()->GetLastInstruction()->AsIf();
+    int32_t constant = loop_info->Contains(*loop_hif->IfTrueSuccessor()) ? 0 : 1;
+    loop_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
+  }
+
+  return true;
+}
+
 bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) {
   // Don't run peeling/unrolling if compiler_options_ is nullptr (i.e., running under tests)
   // as InstructionSet is needed.
@@ -828,7 +876,8 @@
     return false;
   }
 
-  if (!TryPeelingForLoopInvariantExitsElimination(&analysis_info, /*generate_code*/ false) &&
+  if (!TryFullUnrolling(&analysis_info, /*generate_code*/ false) &&
+      !TryPeelingForLoopInvariantExitsElimination(&analysis_info, /*generate_code*/ false) &&
       !TryUnrollingForBranchPenaltyReduction(&analysis_info, /*generate_code*/ false)) {
     return false;
   }
@@ -838,7 +887,8 @@
     return false;
   }
 
-  return TryPeelingForLoopInvariantExitsElimination(&analysis_info) ||
+  return TryFullUnrolling(&analysis_info) ||
+         TryPeelingForLoopInvariantExitsElimination(&analysis_info) ||
          TryUnrollingForBranchPenaltyReduction(&analysis_info);
 }
 
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index bc47924..644b740 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -155,6 +155,12 @@
   bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
                                                   bool generate_code = true);
 
+  // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
+  // the loop check overhead and to have more opportunities for inter-iteration optimizations.
+  // Returns whether transformation happened. 'generate_code' determines whether the optimization
+  // should be actually applied.
+  bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);
+
   // Tries to apply scalar loop peeling and unrolling.
   bool TryPeelingAndUnrolling(LoopNode* node);
 
diff --git a/test/527-checker-array-access-split/src/Main.java b/test/527-checker-array-access-split/src/Main.java
index a5caa7b..935b378 100644
--- a/test/527-checker-array-access-split/src/Main.java
+++ b/test/527-checker-array-access-split/src/Main.java
@@ -400,7 +400,7 @@
   /// CHECK:                                    ArraySet [<<Address>>,<<Index>>,<<Div>>]
 
   public static int canMergeAfterBCE1() {
-    int[] array = {0, 7, 14, 21};
+    int[] array = {0, 7, 14, 21, 28, 35, 42};
     for (int i = 0; i < array.length; i++) {
       array[i] = array[i] / 7;
     }
@@ -513,7 +513,7 @@
   /// CHECK-NOT:                                IntermediateAddress
 
   public static int canMergeAfterBCE2() {
-    int[] array = {64, 8, 4, 2 };
+    int[] array = {128, 64, 32, 8, 4, 2 };
     for (int i = 0; i < array.length - 1; i++) {
       array[i + 1] = array[i] << array[i + 1];
     }
@@ -571,8 +571,8 @@
     accrossGC(array, 0);
     assertIntEquals(125, array[0]);
 
-    assertIntEquals(3, canMergeAfterBCE1());
-    assertIntEquals(1048576, canMergeAfterBCE2());
+    assertIntEquals(6, canMergeAfterBCE1());
+    assertIntEquals(2097152, canMergeAfterBCE2());
 
     assertIntEquals(18, checkLongFloatDouble());
   }
diff --git a/test/530-checker-peel-unroll/src/Main.java b/test/530-checker-peel-unroll/src/Main.java
index 11c2964..4d81440 100644
--- a/test/530-checker-peel-unroll/src/Main.java
+++ b/test/530-checker-peel-unroll/src/Main.java
@@ -1067,6 +1067,46 @@
     }
   }
 
+  /// CHECK-START: void Main.unrollingFull(int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Param:l\d+>>     ParameterValue                          loop:none
+  /// CHECK-DAG: <<Const0:i\d+>>    IntConstant 0                           loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>    IntConstant 1                           loop:none
+  /// CHECK-DAG: <<Limit:i\d+>>     IntConstant 2                           loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>       Phi [<<Const0>>,{{i\d+}}]               loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:                    ArrayGet                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                    ArrayGet                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                    ArraySet                                loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-NOT:                    ArrayGet
+  /// CHECK-NOT:                    ArraySet
+
+  /// CHECK-START: void Main.unrollingFull(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Param:l\d+>>     ParameterValue                          loop:none
+  /// CHECK-DAG: <<Const0:i\d+>>    IntConstant 0                           loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>    IntConstant 1                           loop:none
+  /// CHECK-DAG: <<Limit:i\d+>>     IntConstant 2                           loop:none
+  //            Two peeled iterations
+  /// CHECK-DAG:                    ArrayGet                                loop:none
+  /// CHECK-DAG:                    ArrayGet                                loop:none
+  /// CHECK-DAG:                    ArraySet                                loop:none
+  /// CHECK-DAG:                    ArrayGet                                loop:none
+  /// CHECK-DAG:                    ArrayGet                                loop:none
+  /// CHECK-DAG:                    ArraySet                                loop:none
+  //            Loop
+  /// CHECK-DAG: <<Phi:i\d+>>       Phi [{{i\d+}},{{i\d+}}]                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:                    ArrayGet                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                    ArrayGet                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                    ArraySet                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                    If [<<Const1>>]                         loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-NOT:                    ArrayGet
+  /// CHECK-NOT:                    ArraySet
+  private static final void unrollingFull(int[] a) {
+    for (int i = 0; i < 2; i++) {
+      a[i] += a[i + 1];
+    }
+  }
+
   private static void expectEquals(int expected, int result) {
     if (expected != result) {
       throw new Error("Expected: " + expected + ", found: " + result);