Unrolling and dynamic loop peeling framework in vectorizer.

Rationale:
This CL introduces the basic framework for dynamically peeling
(to obtain aligned access) and unrolling the vector loop (to reduce
looping overhead and allow more target-specific optimizations
on, e.g., SIMD loads and stores).
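
As a rough sketch (hypothetical scalar loop; ptc and vtc denote the
peeling and vector trip counts computed by the framework), the
generated loop structure looks like:

  for (int i = 0; i < ptc; i++)           // peeling loop (scalar, for alignment)
    a[i] = b[i] * 2.5f;
  for (int i = ptc; i < vtc; i += chunk)  // vector loop (chunk = VL * unroll)
    { /* vectorized body, emitted 'unroll' times */ }
  for (int i = vtc; i < stc; i++)         // cleanup loop (scalar remainder)
    a[i] = b[i] * 2.5f;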

NOTE:
The current heuristics are "bogus" and merely meant to exercise
the new framework. This CL focuses on introducing correct code for
the vectorizer. Heuristics and the memory computations for alignment
are to be implemented later.

Test: test-art-target, test-art-host

Change-Id: I010af1475f42f92fd1daa6a967d7a85922beace8
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index d249313..d39bc16 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -31,6 +31,9 @@
 // Enables vectorization (SIMDization) in the loop optimizer.
 static constexpr bool kEnableVectorization = true;
 
+// All current SIMD targets want 16-byte alignment.
+static constexpr size_t kAlignedBase = 16;
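+// (For the targets handled so far this matches the 128-bit SIMD registers,
+// e.g. ARM NEON Q registers and x86 XMM registers.)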
+
 // Remove the instruction from the graph. A bit more elaborate than the usual
 // instruction removal, since there may be a cycle in the use structure.
 static void RemoveFromCycle(HInstruction* instruction) {
@@ -283,6 +286,9 @@
       simplified_(false),
       vector_length_(0),
       vector_refs_(nullptr),
+      vector_peeling_candidate_(nullptr),
+      vector_runtime_test_a_(nullptr),
+      vector_runtime_test_b_(nullptr),
       vector_map_(nullptr) {
 }
 
@@ -422,23 +428,6 @@
 // Optimization.
 //
 
-bool HLoopOptimization::CanRemoveCycle() {
-  for (HInstruction* i : *iset_) {
-    // We can never remove instructions that have environment
-    // uses when we compile 'debuggable'.
-    if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
-      return false;
-    }
-    // A deoptimization should never have an environment input removed.
-    for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
-      if (use.GetUser()->GetHolder()->IsDeoptimize()) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
 void HLoopOptimization::SimplifyInduction(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
@@ -565,7 +554,7 @@
   if (kEnableVectorization) {
     iset_->clear();  // prepare phi induction
     if (TrySetSimpleLoopHeader(header) &&
-        CanVectorize(node, body, trip_count) &&
+        ShouldVectorize(node, body, trip_count) &&
         TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) {
       Vectorize(node, body, exit, trip_count);
       graph_->SetHasSIMD(true);  // flag SIMD usage
@@ -580,10 +569,11 @@
 // Intel Press, June, 2004 (http://www.aartbik.com/).
 //
 
-bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
+bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
   // Reset vector bookkeeping.
   vector_length_ = 0;
   vector_refs_->clear();
+  vector_peeling_candidate_ = nullptr;
   vector_runtime_test_a_ =
   vector_runtime_test_b_ = nullptr;
 
@@ -600,12 +590,9 @@
     }
   }
 
-  // Heuristics. Does vectorization seem profitable?
-  // TODO: refine
-  if (vector_length_ == 0) {
-    return false;  // nothing found
-  } else if (0 < trip_count && trip_count < vector_length_) {
-    return false;  // insufficient iterations
+  // Does vectorization seem profitable?
+  if (!IsVectorizationProfitable(trip_count)) {
+    return false;
   }
 
   // Data dependence analysis. Find each pair of references with the same type, where
@@ -645,6 +632,9 @@
     }
   }
 
+  // Consider dynamic loop peeling for alignment.
+  SetPeelingCandidate(trip_count);
+
   // Success!
   return true;
 }
@@ -657,28 +647,52 @@
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
-  // A cleanup is needed for any unknown trip count or for a known trip count
-  // with remainder iterations after vectorization.
-  bool needs_cleanup = trip_count == 0 || (trip_count % vector_length_) != 0;
+  // Pick a loop unrolling factor for the vector loop.
+  uint32_t unroll = GetUnrollingFactor(block, trip_count);
+  uint32_t chunk = vector_length_ * unroll;
+
+  // A cleanup loop is needed, at least, for any unknown trip count or
+  // for a known trip count with remainder iterations after vectorization.
+  bool needs_cleanup = trip_count == 0 || (trip_count % chunk) != 0;
 
   // Adjust vector bookkeeping.
   iset_->clear();  // prepare phi induction
   bool is_simple_loop_header = TrySetSimpleLoopHeader(header);  // fills iset_
   DCHECK(is_simple_loop_header);
+  vector_header_ = header;
+  vector_body_ = block;
 
-  // Generate preheader:
+  // Generate dynamic loop peeling trip count, if needed:
+  // ptc = <peeling-needed-for-candidate>
+  HInstruction* ptc = nullptr;
+  if (vector_peeling_candidate_ != nullptr) {
+    DCHECK_LT(vector_length_, trip_count) << "dynamic peeling currently requires known trip count";
+    //
+    // TODO: Implement this. Compute address of first access memory location and
+    //       compute peeling factor to obtain kAlignedBase alignment.
+    //
+    needs_cleanup = true;
+  }
+
+  // Generate loop control:
   // stc = <trip-count>;
-  // vtc = stc - stc % VL;
+  // vtc = stc - (stc - ptc) % chunk;
+  // i = 0;
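+  // For example (hypothetical values): with stc = 100, no peeling (ptc
+  // absent) and chunk = 8 (vector length 4 unrolled by 2), this yields
+  // vtc = 100 - (100 & 7) = 96, leaving 4 iterations to the cleanup loop.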
   HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
   HInstruction* vtc = stc;
   if (needs_cleanup) {
-    DCHECK(IsPowerOfTwo(vector_length_));
+    DCHECK(IsPowerOfTwo(chunk));
+    HInstruction* diff = stc;
+    if (ptc != nullptr) {
+      diff = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, ptc));
+    }
     HInstruction* rem = Insert(
         preheader, new (global_allocator_) HAnd(induc_type,
-                                                stc,
-                                                graph_->GetIntConstant(vector_length_ - 1)));
+                                                diff,
+                                                graph_->GetIntConstant(chunk - 1)));
     vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
   }
+  vector_index_ = graph_->GetIntConstant(0);
 
   // Generate runtime disambiguation test:
   // vtc = a != b ? vtc : 0;
@@ -691,16 +705,31 @@
     needs_cleanup = true;
   }
 
-  // Generate vector loop:
-  // for (i = 0; i < vtc; i += VL)
+  // Generate dynamic peeling loop for alignment, if needed:
+  // for ( ; i < ptc; i += 1)
+  //    <loop-body>
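+  // (These scalar iterations advance the peeling candidate's first memory
+  // access to a kAlignedBase boundary before the vector loop is entered.)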
+  if (ptc != nullptr) {
+    vector_mode_ = kSequential;
+    GenerateNewLoop(node,
+                    block,
+                    graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+                    vector_index_,
+                    ptc,
+                    graph_->GetIntConstant(1),
+                    /*unroll*/ 1);
+  }
+
+  // Generate vector loop, possibly further unrolled:
+  // for ( ; i < vtc; i += chunk)
   //    <vectorized-loop-body>
   vector_mode_ = kVector;
   GenerateNewLoop(node,
                   block,
-                  graph_->TransformLoopForVectorization(header, block, exit),
-                  graph_->GetIntConstant(0),
+                  graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+                  vector_index_,
                   vtc,
-                  graph_->GetIntConstant(vector_length_));
+                  graph_->GetIntConstant(vector_length_),  // increment per unroll
+                  unroll);
   HLoopInformation* vloop = vector_header_->GetLoopInformation();
 
   // Generate cleanup loop, if needed:
@@ -711,9 +740,10 @@
     GenerateNewLoop(node,
                     block,
                     graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
-                    vector_phi_,
+                    vector_index_,
                     stc,
-                    graph_->GetIntConstant(1));
+                    graph_->GetIntConstant(1),
+                    /*unroll*/ 1);
   }
 
   // Remove the original loop by disconnecting the body block
@@ -722,8 +752,9 @@
   while (!header->GetFirstInstruction()->IsGoto()) {
     header->RemoveInstruction(header->GetFirstInstruction());
   }
-  // Update loop hierarchy: the old header now resides in the
-  // same outer loop as the old preheader.
+  // Update loop hierarchy: the old header now resides in the same outer loop
+  // as the old preheader. Note that we don't bother putting sequential
+  // loops back in the hierarchy at this point.
   header->SetLoopInformation(preheader->GetLoopInformation());  // outward
   node->loop_info = vloop;
 }
@@ -733,44 +764,64 @@
                                         HBasicBlock* new_preheader,
                                         HInstruction* lo,
                                         HInstruction* hi,
-                                        HInstruction* step) {
+                                        HInstruction* step,
+                                        uint32_t unroll) {
+  DCHECK(unroll == 1 || vector_mode_ == kVector);
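+  // Synthesize 'for (i = lo; i < hi; i += step)' around the mapped loop body;
+  // with unroll > 1, the body and its 'i += step' update are generated that
+  // many times per header iteration (vector mode only).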
   Primitive::Type induc_type = Primitive::kPrimInt;
   // Prepare new loop.
-  vector_map_->clear();
   vector_preheader_ = new_preheader;
   vector_header_ = vector_preheader_->GetSingleSuccessor();
   vector_body_ = vector_header_->GetSuccessors()[1];
-  vector_phi_ = new (global_allocator_) HPhi(global_allocator_,
-                                             kNoRegNumber,
-                                             0,
-                                             HPhi::ToPhiType(induc_type));
+  HPhi* phi = new (global_allocator_) HPhi(global_allocator_,
+                                           kNoRegNumber,
+                                           0,
+                                           HPhi::ToPhiType(induc_type));
   // Generate header and prepare body.
   // for (i = lo; i < hi; i += step)
   //    <loop-body>
-  HInstruction* cond = new (global_allocator_) HAboveOrEqual(vector_phi_, hi);
-  vector_header_->AddPhi(vector_phi_);
+  HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+  vector_header_->AddPhi(phi);
   vector_header_->AddInstruction(cond);
   vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
-  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-    bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
-    DCHECK(vectorized_def);
-  }
-  // Generate body from the instruction map, but in original program order.
-  HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
-  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-    auto i = vector_map_->find(it.Current());
-    if (i != vector_map_->end() && !i->second->IsInBlock()) {
-      Insert(vector_body_, i->second);
-      // Deal with instructions that need an environment, such as the scalar intrinsics.
-      if (i->second->NeedsEnvironment()) {
-        i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+  vector_index_ = phi;
+  for (uint32_t u = 0; u < unroll; u++) {
+    // Clear map, leaving loop invariants set up during unrolling.
+    if (u == 0) {
+      vector_map_->clear();
+    } else {
+      for (auto i = vector_map_->begin(); i != vector_map_->end(); ) {
+        if (i->second->IsVecReplicateScalar()) {
+          DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first));
+          ++i;
+        } else {
+          i = vector_map_->erase(i);
+        }
       }
     }
+    // Generate instruction map.
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
+      DCHECK(vectorized_def);
+    }
+    // Generate body from the instruction map, but in original program order.
+    HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      auto i = vector_map_->find(it.Current());
+      if (i != vector_map_->end() && !i->second->IsInBlock()) {
+        Insert(vector_body_, i->second);
+        // Deal with instructions that need an environment, such as the scalar intrinsics.
+        if (i->second->NeedsEnvironment()) {
+          i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+        }
+      }
+    }
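+    // Advance the loop index past this copy of the body; the next unrolled
+    // copy (if any) then addresses elements one 'step' further.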
+    vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step);
+    Insert(vector_body_, vector_index_);
   }
-  // Finalize increment and phi.
-  HInstruction* inc = new (global_allocator_) HAdd(induc_type, vector_phi_, step);
-  vector_phi_->AddInput(lo);
-  vector_phi_->AddInput(Insert(vector_body_, inc));
+  // Finalize phi for the loop index.
+  phi->AddInput(lo);
+  phi->AddInput(vector_index_);
+  vector_index_ = phi;
 }
 
 // TODO: accept reductions at left-hand-side, mixed-type store idioms, etc.
@@ -795,7 +846,7 @@
         VectorizeUse(node, value, generate_code, type, restrictions)) {
       if (generate_code) {
         GenerateVecSub(index, offset);
-        GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), type);
+        GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), offset, type);
       } else {
         vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ true));
       }
@@ -852,7 +903,7 @@
         induction_range_.IsUnitStride(instruction, index, &offset)) {
       if (generate_code) {
         GenerateVecSub(index, offset);
-        GenerateVecMem(instruction, vector_map_->Get(index), nullptr, type);
+        GenerateVecMem(instruction, vector_map_->Get(index), nullptr, offset, type);
       } else {
         vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ false));
       }
@@ -1164,7 +1215,7 @@
 
 void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) {
   if (vector_map_->find(org) == vector_map_->end()) {
-    HInstruction* subscript = vector_phi_;
+    HInstruction* subscript = vector_index_;
     if (offset != nullptr) {
       subscript = new (global_allocator_) HAdd(Primitive::kPrimInt, subscript, offset);
       if (org->IsPhi()) {
@@ -1178,17 +1229,27 @@
 void HLoopOptimization::GenerateVecMem(HInstruction* org,
                                        HInstruction* opa,
                                        HInstruction* opb,
+                                       HInstruction* offset,
                                        Primitive::Type type) {
   HInstruction* vector = nullptr;
   if (vector_mode_ == kVector) {
     // Vector store or load.
+    HInstruction* base = org->InputAt(0);
     if (opb != nullptr) {
       vector = new (global_allocator_) HVecStore(
-          global_allocator_, org->InputAt(0), opa, opb, type, vector_length_);
+          global_allocator_, base, opa, opb, type, vector_length_);
     } else {
       bool is_string_char_at = org->AsArrayGet()->IsStringCharAt();
       vector = new (global_allocator_) HVecLoad(
-          global_allocator_, org->InputAt(0), opa, type, vector_length_, is_string_char_at);
+          global_allocator_, base, opa, type, vector_length_, is_string_char_at);
+    }
+    // Known dynamically enforced alignment?
+    // TODO: detect offset + constant differences.
+    // TODO: long run, static alignment analysis?
+    if (vector_peeling_candidate_ != nullptr &&
+        vector_peeling_candidate_->base == base &&
+        vector_peeling_candidate_->offset == offset) {
+      vector->AsVecMemoryOperation()->SetAlignment(Alignment(kAlignedBase, 0));
     }
   } else {
     // Scalar store or load.
@@ -1444,6 +1505,47 @@
 }
 
 //
+// Vectorization heuristics.
+//
+
+bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) {
+  // Current heuristic: non-empty body with sufficient number
+  // of iterations (if known).
+  // TODO: refine by looking at e.g. operation count, alignment, etc.
+  if (vector_length_ == 0) {
+    return false;  // nothing found
+  } else if (0 < trip_count && trip_count < vector_length_) {
+    return false;  // insufficient iterations
+  }
+  return true;
+}
+
+void HLoopOptimization::SetPeelingCandidate(int64_t trip_count ATTRIBUTE_UNUSED) {
+  // Current heuristic: none.
+  // TODO: implement
+}
+
+uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
+  // Current heuristic: unroll by 2 on ARM64/X86 for large known trip
+  // counts and small loop bodies.
+  // TODO: refine with operation count, remaining iterations, etc.
+  switch (compiler_driver_->GetInstructionSet()) {
+    case kArm64:
+    case kX86:
+    case kX86_64: {
+      size_t num_instructions = block->GetInstructions().CountSize();
+      if (num_instructions <= 10 && trip_count >= 4 * vector_length_) {
+        return 2;
+      }
+      return 1;
+    }
+    default:
+      return 1;
+  }
+}
+
+//
 // Helpers.
 //
 
@@ -1576,8 +1678,8 @@
       size_t index = it->GetIndex();
       ++it;  // increment before replacing
       if (iset_->find(user->GetHolder()) == iset_->end()) {  // not excluded?
-        HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
         // Only update environment uses after the loop.
+        HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
         if (other_loop_info == nullptr || !other_loop_info->IsIn(*loop_info)) {
           user->RemoveAsUserOfInput(index);
           user->SetRawEnvAt(index, replacement);
@@ -1614,4 +1716,21 @@
   }
 }
 
+bool HLoopOptimization::CanRemoveCycle() {
+  for (HInstruction* i : *iset_) {
+    // We can never remove instructions that have environment
+    // uses when we compile 'debuggable'.
+    if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
+      return false;
+    }
+    // A deoptimization should never have an environment input removed.
+    for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
+      if (use.GetUser()->GetHolder()->IsDeoptimize()) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index cc6343a..de4bd85 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -116,14 +116,15 @@
   void OptimizeInnerLoop(LoopNode* node);
 
   // Vectorization analysis and synthesis.
-  bool CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
+  bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
   void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
   void GenerateNewLoop(LoopNode* node,
                        HBasicBlock* block,
                        HBasicBlock* new_preheader,
                        HInstruction* lo,
                        HInstruction* hi,
-                       HInstruction* step);
+                       HInstruction* step,
+                       uint32_t unroll);
   bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
   bool VectorizeUse(LoopNode* node,
                     HInstruction* instruction,
@@ -133,10 +134,11 @@
   bool TrySetVectorType(Primitive::Type type, /*out*/ uint64_t* restrictions);
   bool TrySetVectorLength(uint32_t length);
   void GenerateVecInv(HInstruction* org, Primitive::Type type);
-  void GenerateVecSub(HInstruction* org, HInstruction* off);
+  void GenerateVecSub(HInstruction* org, HInstruction* offset);
   void GenerateVecMem(HInstruction* org,
                       HInstruction* opa,
                       HInstruction* opb,
+                      HInstruction* offset,
                       Primitive::Type type);
   void GenerateVecOp(HInstruction* org,
                      HInstruction* opa,
@@ -151,6 +153,11 @@
                                 Primitive::Type type,
                                 uint64_t restrictions);
 
+  // Vectorization heuristics.
+  bool IsVectorizationProfitable(int64_t trip_count);
+  void SetPeelingCandidate(int64_t trip_count);
+  uint32_t GetUnrollingFactor(HBasicBlock* block, int64_t trip_count);
+
   // Helpers.
   bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
   bool TrySetSimpleLoopHeader(HBasicBlock* block);
@@ -208,20 +215,25 @@
   // Contents reside in phase-local heap memory.
   ArenaSet<ArrayReference>* vector_refs_;
 
+  // Dynamic loop peeling candidate for alignment.
+  const ArrayReference* vector_peeling_candidate_;
+
+  // Dynamic data dependence test of the form a != b.
+  HInstruction* vector_runtime_test_a_;
+  HInstruction* vector_runtime_test_b_;
+
   // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
-  // loop (simd_ is false) and the actual vector loop (simd_ is true). The data
+  // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
   // structure maps original instructions into the new instructions.
   // Contents reside in phase-local heap memory.
   ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
 
   // Temporary vectorization bookkeeping.
+  VectorMode vector_mode_;  // synthesis mode
   HBasicBlock* vector_preheader_;  // preheader of the new loop
   HBasicBlock* vector_header_;  // header of the new loop
   HBasicBlock* vector_body_;  // body of the new loop
-  HInstruction* vector_runtime_test_a_;
-  HInstruction* vector_runtime_test_b_;  // defines a != b runtime test
-  HPhi* vector_phi_;  // the Phi representing the normalized loop index
-  VectorMode vector_mode_;  // selects synthesis mode
+  HInstruction* vector_index_;  // normalized index of the new loop
 
   friend class LoopOptimizationTest;
 
diff --git a/test/656-checker-simd-opt/expected.txt b/test/656-checker-simd-opt/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/656-checker-simd-opt/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/656-checker-simd-opt/info.txt b/test/656-checker-simd-opt/info.txt
new file mode 100644
index 0000000..185d2b1
--- /dev/null
+++ b/test/656-checker-simd-opt/info.txt
@@ -0,0 +1 @@
+Tests around optimizations of SIMD code.
diff --git a/test/656-checker-simd-opt/src/Main.java b/test/656-checker-simd-opt/src/Main.java
new file mode 100644
index 0000000..0d0885c
--- /dev/null
+++ b/test/656-checker-simd-opt/src/Main.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for SIMD related optimizations.
+ */
+public class Main {
+
+  /// CHECK-START: void Main.unroll(float[], float[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons:f\d+>> FloatConstant 2.5                   loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:f\d+>>  ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:f\d+>>  Mul [<<Get>>,<<Cons>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Mul>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.unroll(float[], float[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons:f\d+>> FloatConstant 2.5                    loop:none
+  /// CHECK-DAG: <<Incr:i\d+>> IntConstant 4                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Cons>>]        loop:none
+  /// CHECK-NOT:               VecReplicateScalar
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad [{{l\d+}},<<Phi>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul1:d\d+>> VecMul [<<Get1>>,<<Repl>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Mul1>>] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add:i\d+>>  Add [<<Phi>>,<<Incr>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Add>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul2:d\d+>> VecMul [<<Get2>>,<<Repl>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Add>>,<<Mul2>>] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               Add [<<Add>>,<<Incr>>]               loop:<<Loop>>      outer_loop:none
+  private static void unroll(float[] x, float[] y) {
+    for (int i = 0; i < 100; i++) {
+      x[i] = y[i] * 2.5f;
+    }
+  }
+
+  public static void main(String[] args) {
+    float[] x = new float[100];
+    float[] y = new float[100];
+    for (int i = 0; i < 100; i++) {
+      x[i] = 0.0f;
+      y[i] = 2.0f;
+    }
+    unroll(x, y);
+    for (int i = 0; i < 100; i++) {
+      expectEquals(5.0f, x[i]);
+      expectEquals(2.0f, y[i]);
+    }
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(float expected, float result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}