ART: Implement loop full unrolling.
Performs whole loop unrolling for small loops with small
trip count to eliminate the loop check overhead, to have
more opportunities for inter-iteration optimizations.
caffeinemark/FloatAtom: 1.2x performance on arm64 Cortex-A57.
Test: 530-checker-peel-unroll.
Test: test-art-host, test-art-target.
Change-Id: Idf3fe3cb611376935d176c60db8c49907222e28a
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index efb23e7..d355ced 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -84,6 +84,8 @@
static constexpr uint32_t kScalarHeuristicMaxBodySizeInstr = 17;
// Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6;
+ // Maximum number of instructions to be created as a result of full unrolling.
+ static constexpr uint32_t kScalarHeuristicFullyUnrolledMaxInstrThreshold = 35;
bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* analysis_info) const OVERRIDE {
return analysis_info->HasLongTypeInstructions() ||
@@ -108,6 +110,14 @@
bool IsLoopPeelingEnabled() const OVERRIDE { return true; }
+ bool IsFullUnrollingBeneficial(LoopAnalysisInfo* analysis_info) const OVERRIDE {
+ int64_t trip_count = analysis_info->GetTripCount();
+ // We assume that trip count is known.
+ DCHECK_NE(trip_count, LoopAnalysisInfo::kUnknownTripCount);
+ size_t instr_num = analysis_info->GetNumberOfInstructions();
+ return (trip_count * instr_num < kScalarHeuristicFullyUnrolledMaxInstrThreshold);
+ }
+
protected:
bool IsLoopTooBig(LoopAnalysisInfo* loop_analysis_info,
size_t instr_threshold,
diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h
index bcb7b70..57509ee 100644
--- a/compiler/optimizing/loop_analysis.h
+++ b/compiler/optimizing/loop_analysis.h
@@ -160,6 +160,13 @@
// Returns 'false' by default, should be overridden by particular target loop helper.
virtual bool IsLoopPeelingEnabled() const { return false; }
+ // Returns whether it is beneficial to fully unroll the loop.
+ //
+ // Returns 'false' by default, should be overridden by particular target loop helper.
+ virtual bool IsFullUnrollingBeneficial(LoopAnalysisInfo* analysis_info ATTRIBUTE_UNUSED) const {
+ return false;
+ }
+
// Returns optimal SIMD unrolling factor for the loop.
//
// Returns kNoUnrollingFactor by default, should be overridden by particular target loop helper.
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 440cd33..7d66155 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -422,6 +422,15 @@
}
}
+// Peel the first 'count' iterations of the loop.
+static void PeelByCount(HLoopInformation* loop_info, int count) {
+ for (int i = 0; i < count; i++) {
+ // Perform peeling.
+ PeelUnrollSimpleHelper helper(loop_info);
+ helper.DoPeeling();
+ }
+}
+
//
// Public methods.
//
@@ -811,6 +820,45 @@
return true;
}
+bool HLoopOptimization::TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code) {
+ // Fully unroll loops with a known and small trip count.
+ int64_t trip_count = analysis_info->GetTripCount();
+ if (!arch_loop_helper_->IsLoopPeelingEnabled() ||
+ trip_count == LoopAnalysisInfo::kUnknownTripCount ||
+ !arch_loop_helper_->IsFullUnrollingBeneficial(analysis_info)) {
+ return false;
+ }
+
+ if (generate_code) {
+ // Peeling of the N first iterations (where N equals to the trip count) will effectively
+ // eliminate the loop: after peeling we will have N sequential iterations copied into the loop
+ // preheader and the original loop. The trip count of this loop will be 0 as the sequential
+ // iterations are executed first and there are exactly N of them. Thus we can statically
+ // evaluate the loop exit condition to 'false' and fully eliminate it.
+ //
+ // Here is an example of full unrolling of a loop with a trip count 2:
+ //
+ // loop_cond_1
+ // loop_body_1 <- First iteration.
+ // |
+ // \ v
+ // ==\ loop_cond_2
+ // ==/ loop_body_2 <- Second iteration.
+ // / |
+ // <- v <-
+ // loop_cond \ loop_cond \ <- This cond is always false.
+ // loop_body _/ loop_body _/
+ //
+ HLoopInformation* loop_info = analysis_info->GetLoopInfo();
+ PeelByCount(loop_info, trip_count);
+ HIf* loop_hif = loop_info->GetHeader()->GetLastInstruction()->AsIf();
+ int32_t constant = loop_info->Contains(*loop_hif->IfTrueSuccessor()) ? 0 : 1;
+ loop_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
+ }
+
+ return true;
+}
+
bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) {
// Don't run peeling/unrolling if compiler_options_ is nullptr (i.e., running under tests)
// as InstructionSet is needed.
@@ -828,7 +876,8 @@
return false;
}
- if (!TryPeelingForLoopInvariantExitsElimination(&analysis_info, /*generate_code*/ false) &&
+ if (!TryFullUnrolling(&analysis_info, /*generate_code*/ false) &&
+ !TryPeelingForLoopInvariantExitsElimination(&analysis_info, /*generate_code*/ false) &&
!TryUnrollingForBranchPenaltyReduction(&analysis_info, /*generate_code*/ false)) {
return false;
}
@@ -838,7 +887,8 @@
return false;
}
- return TryPeelingForLoopInvariantExitsElimination(&analysis_info) ||
+ return TryFullUnrolling(&analysis_info) ||
+ TryPeelingForLoopInvariantExitsElimination(&analysis_info) ||
TryUnrollingForBranchPenaltyReduction(&analysis_info);
}
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index bc47924..644b740 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -155,6 +155,12 @@
bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
bool generate_code = true);
+ // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
+ // the loop check overhead and to have more opportunities for inter-iteration optimizations.
+ // Returns whether transformation happened. 'generate_code' determines whether the optimization
+ // should be actually applied.
+ bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);
+
// Tries to apply scalar loop peeling and unrolling.
bool TryPeelingAndUnrolling(LoopNode* node);
diff --git a/test/527-checker-array-access-split/src/Main.java b/test/527-checker-array-access-split/src/Main.java
index a5caa7b..935b378 100644
--- a/test/527-checker-array-access-split/src/Main.java
+++ b/test/527-checker-array-access-split/src/Main.java
@@ -400,7 +400,7 @@
/// CHECK: ArraySet [<<Address>>,<<Index>>,<<Div>>]
public static int canMergeAfterBCE1() {
- int[] array = {0, 7, 14, 21};
+ int[] array = {0, 7, 14, 21, 28, 35, 42};
for (int i = 0; i < array.length; i++) {
array[i] = array[i] / 7;
}
@@ -513,7 +513,7 @@
/// CHECK-NOT: IntermediateAddress
public static int canMergeAfterBCE2() {
- int[] array = {64, 8, 4, 2 };
+ int[] array = {128, 64, 32, 8, 4, 2 };
for (int i = 0; i < array.length - 1; i++) {
array[i + 1] = array[i] << array[i + 1];
}
@@ -571,8 +571,8 @@
accrossGC(array, 0);
assertIntEquals(125, array[0]);
- assertIntEquals(3, canMergeAfterBCE1());
- assertIntEquals(1048576, canMergeAfterBCE2());
+ assertIntEquals(6, canMergeAfterBCE1());
+ assertIntEquals(2097152, canMergeAfterBCE2());
assertIntEquals(18, checkLongFloatDouble());
}
diff --git a/test/530-checker-peel-unroll/src/Main.java b/test/530-checker-peel-unroll/src/Main.java
index 11c2964..4d81440 100644
--- a/test/530-checker-peel-unroll/src/Main.java
+++ b/test/530-checker-peel-unroll/src/Main.java
@@ -1067,6 +1067,46 @@
}
}
+ /// CHECK-START: void Main.unrollingFull(int[]) loop_optimization (before)
+ /// CHECK-DAG: <<Param:l\d+>> ParameterValue loop:none
+ /// CHECK-DAG: <<Const0:i\d+>> IntConstant 0 loop:none
+ /// CHECK-DAG: <<Const1:i\d+>> IntConstant 1 loop:none
+ /// CHECK-DAG: <<Limit:i\d+>> IntConstant 2 loop:none
+ /// CHECK-DAG: <<Phi:i\d+>> Phi [<<Const0>>,{{i\d+}}] loop:<<Loop:B\d+>> outer_loop:none
+ /// CHECK-DAG: ArrayGet loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: ArrayGet loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: ArraySet loop:<<Loop>> outer_loop:none
+ //
+ /// CHECK-NOT: ArrayGet
+ /// CHECK-NOT: ArraySet
+
+ /// CHECK-START: void Main.unrollingFull(int[]) loop_optimization (after)
+ /// CHECK-DAG: <<Param:l\d+>> ParameterValue loop:none
+ /// CHECK-DAG: <<Const0:i\d+>> IntConstant 0 loop:none
+ /// CHECK-DAG: <<Const1:i\d+>> IntConstant 1 loop:none
+ /// CHECK-DAG: <<Limit:i\d+>> IntConstant 2 loop:none
+ // Two peeled iterations
+ /// CHECK-DAG: ArrayGet loop:none
+ /// CHECK-DAG: ArrayGet loop:none
+ /// CHECK-DAG: ArraySet loop:none
+ /// CHECK-DAG: ArrayGet loop:none
+ /// CHECK-DAG: ArrayGet loop:none
+ /// CHECK-DAG: ArraySet loop:none
+ // Loop
+ /// CHECK-DAG: <<Phi:i\d+>> Phi [{{i\d+}},{{i\d+}}] loop:<<Loop:B\d+>> outer_loop:none
+ /// CHECK-DAG: ArrayGet loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: ArrayGet loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: ArraySet loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: If [<<Const1>>] loop:<<Loop>> outer_loop:none
+ //
+ /// CHECK-NOT: ArrayGet
+ /// CHECK-NOT: ArraySet
+ private static final void unrollingFull(int[] a) {
+ for (int i = 0; i < 2; i++) {
+ a[i] += a[i + 1];
+ }
+ }
+
private static void expectEquals(int expected, int result) {
if (expected != result) {
throw new Error("Expected: " + expected + ", found: " + result);