//===- Vectorize.cpp - Vectorize Pass Impl ----------------------*- C++ -*-===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements vectorization of loops, operations and data types to
// a target-independent, n-D virtual vector abstraction.
//
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/MLFunctionMatcher.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLValue.h"
#include "mlir/IR/SSAValue.h"
#include "mlir/IR/Types.h"
#include "mlir/Pass.h"
#include "mlir/StandardOps/StandardOps.h"
#include "mlir/Support/Functional.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace mlir;
/// This pass implements a high-level vectorization strategy at the MLFunction
/// level. It proceeds by:
///   1. matching arbitrarily nested loop patterns that are vectorizable;
///   2. analyzing those patterns for profitability;
///   3. applying those patterns iteratively by coarsening the loops, inserting
///      a single explicit vector element AllocOp and an unaligned load/store
///      operation. The full semantics of this unaligned load/store is still
///      TBD.
///
/// Loop transformation:
/// ====================
/// The choice of loop transformation to apply for coarsening vectorized loops
/// is still subject to exploratory tradeoffs. In particular, say we want to
/// vectorize by a factor of 128; then we want to transform the following
/// input:
///   for %i = %M to %N {
///     %a = load(f(i))
///   }
///
/// Traditionally, one would vectorize late (after scheduling, tiling, memory
/// promotion, etc.), say after stripmining (and potentially unrolling in the
/// case of LLVM's SLP vectorizer):
///   for %i = floor(%M, 128) to ceil(%N, 128) {
///     for %ii = max(%M, 128 * %i) to min(%N, 128 * %i + 127) {
///       load/store(f(ii))
///     }
///   }
///
/// We seek to vectorize early and freeze vector types before scheduling, so
/// we want to generate a pattern that resembles:
///   for %i = ? to ? step ? {
///     unaligned_load/unaligned_store(g(i))
///   }
///
/// There are at least two ways to get there, each with tradeoffs:
///   i. simply dividing the lower / upper bounds by 128 creates issues
///      with representing expressions such as ii + 1 because now we only
///      have access to original values that have been divided. Additional
///      information is needed to specify accesses at below-128 granularity;
///   ii. another alternative is to coarsen the loop step, but this may have
///       consequences on dependence analysis and fusability of loops: fusable
///       loops probably need to have the same step (because we don't want to
///       stripmine/unroll to enable fusion).
/// As a consequence, we choose to represent the coarsening using the loop
/// step for now and reevaluate in the future. Note that we can renormalize
/// loop steps later if/when we have evidence that they are problematic.
///
/// For the simple strawman example above, vectorizing for a 1-D vector
/// abstraction of size 128 returns code similar to:
///   %c0 = constant 0 : index
///   for %i = %M to %N + 127 step 128 {
///     %a = alloc() : memref<1xvector<128xf32>>
///     %r = "n_d_unaligned_load"(%tensor, %i, %a, %c0)
///     %16 = load %a[%c0] : memref<1xvector<128xf32>>
///   }
///
/// Note that this is still a work in progress and not yet functional.
#define DEBUG_TYPE "early-vect"
static cl::list<int> clVirtualVectorSize(
    "virtual-vector-size",
    cl::desc("Specify n-D virtual vector size for vectorization"),
    cl::ZeroOrMore);
static cl::list<int> clFastestVaryingPattern(
    "test-fastest-varying",
    cl::desc("Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "
             "dimensions to match. See defaultPatterns in Vectorize.cpp for a "
             "description and examples. This is used for testing purposes"),
    cl::ZeroOrMore);
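// For example, assuming the standard mlir-opt pass driver, a 2-D virtual
// vector size of 32x256 can be requested with (illustrative invocation):
//   mlir-opt -vectorize -virtual-vector-size=32 -virtual-vector-size=256 f.mlir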
/// Forward declaration.
static FilterFunctionType
isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension);
/// Builds a bunch of predetermined patterns that will be traversed in order.
/// Due to the recursive nature of MLFunctionMatchers, this captures
/// arbitrarily nested pairs of loops at any position in the tree.
/// Note that this currently only matches fixed-depth nests (up to 3 loops)
/// and will be extended.
// TODO(ntv): support 3-D loop patterns with a common reduction loop that can
// be matched to GEMMs.
static std::vector<MLFunctionMatcher> defaultPatterns() {
  using matcher::For;
  return std::vector<MLFunctionMatcher>{
      // 3-D patterns
      For(isVectorizableLoopPtrFactory(2),
          For(isVectorizableLoopPtrFactory(1),
              For(isVectorizableLoopPtrFactory(0)))),
      // for i { for j { A[??f(not i, not j), f(i, not j), f(not i, j)];}}
      // test independently with:
      //   --test-fastest-varying=1 --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(1),
          For(isVectorizableLoopPtrFactory(0))),
      // for i { for j { A[??f(not i, not j), f(i, not j), ?, f(not i, j)];}}
      // test independently with:
      //   --test-fastest-varying=2 --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(2),
          For(isVectorizableLoopPtrFactory(0))),
      // for i { for j { A[??f(not i, not j), f(i, not j), ?, ?, f(not i, j)];}}
      // test independently with:
      //   --test-fastest-varying=3 --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(3),
          For(isVectorizableLoopPtrFactory(0))),
      // for i { for j { A[??f(not i, not j), f(not i, j), f(i, not j)];}}
      // test independently with:
      //   --test-fastest-varying=0 --test-fastest-varying=1
      For(isVectorizableLoopPtrFactory(0),
          For(isVectorizableLoopPtrFactory(1))),
      // for i { for j { A[??f(not i, not j), f(not i, j), ?, f(i, not j)];}}
      // test independently with:
      //   --test-fastest-varying=0 --test-fastest-varying=2
      For(isVectorizableLoopPtrFactory(0),
          For(isVectorizableLoopPtrFactory(2))),
      // for i { for j { A[??f(not i, not j), f(not i, j), ?, ?, f(i, not j)];}}
      // test independently with:
      //   --test-fastest-varying=0 --test-fastest-varying=3
      For(isVectorizableLoopPtrFactory(0),
          For(isVectorizableLoopPtrFactory(3))),
      // for i { A[??f(not i), f(i)];}
      // test independently with: --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(0)),
      // for i { A[??f(not i), f(i), ?];}
      // test independently with: --test-fastest-varying=1
      For(isVectorizableLoopPtrFactory(1)),
      // for i { A[??f(not i), f(i), ?, ?];}
      // test independently with: --test-fastest-varying=2
      For(isVectorizableLoopPtrFactory(2)),
      // for i { A[??f(not i), f(i), ?, ?, ?];}
      // test independently with: --test-fastest-varying=3
      For(isVectorizableLoopPtrFactory(3))};
}
static std::vector<MLFunctionMatcher> makePatterns() {
  using matcher::For;
  if (clFastestVaryingPattern.empty()) {
    return defaultPatterns();
  }
  switch (clFastestVaryingPattern.size()) {
  case 1:
    return {For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]))};
  case 2:
    return {For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]),
                For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[1])))};
  case 3:
    return {For(
        isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]),
        For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[1]),
            For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[2]))))};
  default:
    assert(false && "Only up to 3-D fastest varying pattern supported atm");
  }
  return std::vector<MLFunctionMatcher>();
}
namespace {
struct Vectorize : public FunctionPass {
  PassResult runOnMLFunction(MLFunction *f) override;
  // Thread-safe RAII contexts local to pass, BumpPtrAllocator freed on exit.
  MLFunctionMatcherContext MLContext;
  static char passID;
};
} // end anonymous namespace
char Vectorize::passID = 0;
/////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate. //////
namespace {
struct Strategy {
  // Vector dimension each matched loop maps to; -1 means the loop is not
  // vectorized. Stored as int (rather than unsigned) so the -1 sentinel is
  // explicit.
  DenseMap<ForStmt *, int> loopToVectorDim;
};
} // end anonymous namespace
/// Implements a simple strawman strategy for vectorization.
/// Given a matched pattern `matches` of depth `patternDepth`, this strategy
/// greedily assigns the fastest varying dimension **of the vector** to the
/// innermost loop in the pattern.
/// When coupled with a pattern that looks for the fastest varying dimension in
/// load/store MemRefs, this creates a generic vectorization strategy that works
/// for any loop in a hierarchy (outermost, innermost or intermediate).
///
/// TODO(ntv): In the future we should additionally increase the power of the
/// profitability analysis along 3 directions:
///   1. account for loop extents (both static and parametric + annotations);
///   2. account for data layout permutations;
///   3. account for impact of vectorization on maximal loop fusion.
/// Then we can quantify the above to build a cost model and search over
/// strategies.
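///
/// For example, with patternDepth = 3 and a 2-D virtual vector size, the
/// outermost matched loop is not vectorized (-1), the middle loop maps to
/// vector dimension 0 and the innermost loop to vector dimension 1.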
static bool analyzeProfitability(MLFunctionMatches matches,
                                 unsigned depthInPattern, unsigned patternDepth,
                                 Strategy *strategy) {
  for (auto m : matches) {
    auto *loop = cast<ForStmt>(m.first);
    LLVM_DEBUG(dbgs() << "[early-vect][profitability] patternDepth: "
                      << patternDepth << " depthInPattern: " << depthInPattern
                      << " loop ");
    LLVM_DEBUG(loop->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    bool fail = analyzeProfitability(m.second, depthInPattern + 1,
                                     patternDepth, strategy);
    if (fail) {
      return fail;
    }
    assert(patternDepth > depthInPattern && "pattern depth mismatch");
    if (patternDepth - depthInPattern <= clVirtualVectorSize.size()) {
      strategy->loopToVectorDim[loop] =
          clVirtualVectorSize.size() - (patternDepth - depthInPattern);
      LLVM_DEBUG(dbgs() << "[early-vect][profitability] vectorize @ "
                        << strategy->loopToVectorDim[loop] << " loop ");
      LLVM_DEBUG(loop->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
    } else {
      // Don't vectorize: the pattern is deeper than the virtual vector rank.
      strategy->loopToVectorDim[loop] = -1;
    }
  }
  return false;
}
///// end TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate /////
////// TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. ////
/// Gets a MemRefType of 1 vector with the same elemental type as `tmpl` and
/// sizes specified by `vectorSizes`. The MemRef lives in the same memory space
/// as `tmpl`. The MemRef should be promoted to a closer memory address space
/// in a later pass.
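///
/// For example (an illustrative sketch), with vectorSizes = {32, 256} this
/// maps:
///   memref<42x64xf32, 1>  ~~>  memref<1xvector<32x256xf32>, 1>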
static MemRefType getVectorizedMemRefType(MemRefType tmpl,
                                          ArrayRef<int> vectorSizes) {
  auto elementType = tmpl.getElementType();
  assert(!elementType.dyn_cast<VectorType>() &&
         "Can't vectorize an already vector type");
  assert(tmpl.getAffineMaps().empty() &&
         "Unsupported non-implicit identity map");
  return MemRefType::get({1}, VectorType::get(vectorSizes, elementType), {},
                         tmpl.getMemorySpace());
}
/// Creates an unaligned load with the following semantics:
/// 1. TODO(ntv): apply a `srcMap` to a `srcIndex` to represent a `srcMemRef`
///    slice + permutations for loading from non-fastest varying dimensions.
///    Note that generally, the fastest varying dimension should be part of the
///    map otherwise global layout changes are likely needed to obtain an
///    efficient load. This is an orthogonal cost model consideration;
/// 2. load from the `srcMemRef` resulting from 1.;
/// 3. store into a `dstMemRef` starting at offset `dstIndex`;
/// 4. copy sizeof(dstMemRef) bytes with adjustments for boundaries;
/// 5. TODO(ntv): broadcast along `broadcastMap` inside the `dstMemRef` to
///    support patterns like scalar-to-vector and N-k dim MemRef slice.
/// The copy may overflow on the src side but not on the dst side. If the copy
/// overflows on the src side, the `dstMemRef` will be padded with enough values
/// to fill it completely.
///
/// Usage:
/// This n_d_unaligned_load op will be implemented as a PseudoOp for different
/// backends. In its current form it is only used to load into a <1xvector>,
/// where the vector may have any shape that is some multiple of the
/// hardware-specific vector size used to implement the PseudoOp efficiently.
/// This is used to implement "non-effecting padding" for early vectorization
/// and allows higher-level passes in the codegen to not worry about
/// hardware-specific implementation details.
///
/// TODO(ntv):
/// 1. implement this end-to-end for some backend;
/// 2. support operation-specific padding values to properly implement
///    "non-effecting padding";
/// 3. support input map for on-the-fly transpositions (point 1 above);
/// 4. support broadcast map (point 5 above).
///
/// TODO(andydavis,bondhugula,ntv):
/// 1. generalize to support padding semantics and offsets within vector type.
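///
/// For instance, for a 2-D source access this emits an op resembling (types
/// and result elided):
///   "n_d_unaligned_load"(%src, %i, %j, %dst, %c0)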
static void createUnalignedLoad(MLFuncBuilder *b, Location *loc,
                                SSAValue *srcMemRef,
                                ArrayRef<SSAValue *> srcIndices,
                                SSAValue *dstMemRef,
                                ArrayRef<SSAValue *> dstIndices) {
  SmallVector<SSAValue *, 8> operands;
  operands.reserve(1 + srcIndices.size() + 1 + dstIndices.size());
  operands.insert(operands.end(), srcMemRef);
  operands.insert(operands.end(), srcIndices.begin(), srcIndices.end());
  operands.insert(operands.end(), dstMemRef);
  operands.insert(operands.end(), dstIndices.begin(), dstIndices.end());
  using functional::map;
  std::function<Type(SSAValue *)> getType = [](SSAValue *v) -> Type {
    return v->getType();
  };
  auto types = map(getType, operands);
  OperationState opState(b->getContext(), loc, "n_d_unaligned_load", operands,
                         types);
  b->createOperation(opState);
}
/// Creates an unaligned store with the following semantics:
/// 1. TODO(ntv): apply a `srcMap` to a `srcIndex` to represent a `srcMemRef`
///    slice to support patterns like vector-to-scalar and N-k dim MemRef
///    slice. This is used as the counterpart to the broadcast map in the
///    UnalignedLoad;
/// 2. load from the `srcMemRef` resulting from 1.;
/// 3. store into a `dstMemRef` starting at offset `dstIndex`;
/// 4. TODO(ntv): apply a `dstMap` to a `dstIndex` to represent a `dstMemRef`
///    slice + permutations for storing into non-fastest varying dimensions.
///    Note that generally, the fastest varying dimension should be part of the
///    map otherwise global layout changes are likely needed to obtain an
///    efficient store. This is an orthogonal cost model consideration;
/// 5. copy sizeof(srcMemRef) bytes with adjustments for boundaries.
/// The copy may overflow on the dst side but not on the src side. If the copy
/// overflows on the dst side, the underlying implementation needs to resolve
/// potential races.
///
/// Usage:
/// This n_d_unaligned_store op will be implemented as a PseudoOp for
/// different backends. In its current form it is only used to store from a
/// <1xvector>, where the vector may have any shape that is some multiple of
/// the hardware-specific vector size used to implement the PseudoOp
/// efficiently. This is used to implement "non-effecting padding" for early
/// vectorization and allows higher-level passes in the codegen to not worry
/// about hardware-specific implementation details.
///
/// TODO(ntv):
/// 1. implement this end-to-end for some backend;
/// 2. support write-back in the presence of races;
/// 3. support input map for counterpart of broadcast (point 1 above);
/// 4. support dstMap for writing back in non-contiguous memory regions
///    (point 4 above).
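///
/// For instance, storing a <1xvector> back into a 2-D destination emits an op
/// resembling (types elided):
///   "n_d_unaligned_store"(%src, %c0, %dst, %i, %j)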
static void createUnalignedStore(MLFuncBuilder *b, Location *loc,
                                 SSAValue *srcMemRef,
                                 ArrayRef<SSAValue *> srcIndices,
                                 SSAValue *dstMemRef,
                                 ArrayRef<SSAValue *> dstIndices) {
  SmallVector<SSAValue *, 8> operands;
  operands.reserve(1 + srcIndices.size() + 1 + dstIndices.size());
  operands.insert(operands.end(), srcMemRef);
  operands.insert(operands.end(), srcIndices.begin(), srcIndices.end());
  operands.insert(operands.end(), dstMemRef);
  operands.insert(operands.end(), dstIndices.begin(), dstIndices.end());
  using functional::map;
  std::function<Type(SSAValue *)> getType = [](SSAValue *v) -> Type {
    return v->getType();
  };
  auto types = map(getType, operands);
  OperationState opState(b->getContext(), loc, "n_d_unaligned_store", operands,
                         types);
  b->createOperation(opState);
}
/// The current implementation of vectorization materializes an AllocOp of
/// MemRef<1 x vector_type> + a custom unaligned load/store PseudoOp.
/// The vector load/store accessing this MemRef always accesses element 0, so
/// we just memoize a single 0 SSAValue, once upon function entry, to avoid
/// clutter. We create one such SSAValue per function.
static SSAValue *insertZeroIndex(MLFunction *f) {
  static thread_local DenseMap<MLFunction *, SSAValue *> zeros;
  if (zeros.find(f) == zeros.end()) {
    MLFuncBuilder b(f);
    auto zero = b.create<ConstantIndexOp>(b.getUnknownLoc(), 0);
    zeros.insert(std::make_pair(f, zero));
  }
  return zeros.lookup(f);
}
/// Unwraps a pointer type to another type (possibly the same).
/// Used in particular to allow easier compositions of
/// llvm::iterator_range<ForStmt::operand_iterator> types.
template <typename T, typename ToType = T>
static std::function<ToType *(T *)> unwrapPtr() {
  return [](T *val) { return dyn_cast<ToType>(val); };
}
/// Materializes the n-D vector into unpromoted temporary storage and
/// explicitly copies into it. Materialization occurs in a MemRef containing 1
/// vector that lives in the same memory space as the base MemRef. Later passes
/// should make the decision to promote this materialization to a faster
/// address space.
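///
/// As a sketch, for a scalar %v = load %A[%i, %j], this emits a companion
/// buffer and copy resembling:
///   %a = alloc() : memref<1xvector<...>>
///   "n_d_unaligned_load"(%A, %i, %j, %a, %c0)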
template <typename LoadOrStoreOpPointer>
static MLValue *materializeVector(MLValue *iv, LoadOrStoreOpPointer memoryOp,
                                  ArrayRef<int> vectorSize) {
  auto memRefType =
      memoryOp->getMemRef()->getType().template cast<MemRefType>();
  auto vectorMemRefType = getVectorizedMemRefType(memRefType, vectorSize);
  // Materialize a MemRef with 1 vector.
  auto *opStmt = cast<OperationStmt>(memoryOp->getOperation());
  MLFuncBuilder b(opStmt);
  // Create an AllocOp to apply the new shape.
  auto allocOp = b.create<AllocOp>(opStmt->getLoc(), vectorMemRefType,
                                   ArrayRef<SSAValue *>{});
  // The MemRef of the original scalar access.
  auto *originalMemRef = memoryOp->getMemRef();
  using namespace functional;
  if (opStmt->template isa<LoadOp>()) {
    createUnalignedLoad(&b, opStmt->getLoc(), originalMemRef,
                        map(unwrapPtr<SSAValue>(), memoryOp->getIndices()),
                        allocOp->getResult(),
                        {insertZeroIndex(iv->getFunction())});
  } else {
    createUnalignedStore(&b, opStmt->getLoc(), allocOp->getResult(),
                         {insertZeroIndex(iv->getFunction())}, originalMemRef,
                         map(unwrapPtr<SSAValue>(), memoryOp->getIndices()));
  }
  return cast<MLValue>(allocOp->getResult());
}
/// end TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. ///
namespace {
struct VectorizationState {
  // `vectorizedByThisPattern` keeps track of statements that have already been
  // vectorized by this pattern. This allows distinguishing between the
  // load/store ops newly created by this pattern and the remaining ops still
  // to be processed.
  DenseSet<OperationStmt *> vectorizedByThisPattern;
  DenseSet<ForStmt *> vectorized;
  const Strategy *strategy;
};
} // end anonymous namespace
/// Terminal function for creating a LoadOp.
static OpPointer<LoadOp> createLoad(MLFuncBuilder *b, Location *loc,
                                    MLValue *memRef) {
  return b->create<LoadOp>(
      loc, memRef,
      ArrayRef<SSAValue *>{insertZeroIndex(memRef->getFunction())});
}
/// Terminal function for creating a StoreOp.
static OpPointer<StoreOp> createStore(MLFuncBuilder *b, Location *loc,
                                      MLValue *memRef,
                                      OpPointer<StoreOp> store) {
  return b->create<StoreOp>(
      loc, store->getValueToStore(), memRef,
      ArrayRef<SSAValue *>{insertZeroIndex(memRef->getFunction())});
}
/// Vectorizes the `memoryOp` of type LoadOp or StoreOp along loop `iv` by
/// factor `vectorSize`.
/// In a first implementation, this triggers materialization of a vector Alloc.
// TODO(ntv): this could be a view that changes the underlying element type.
// Materialization of this view may or may not happen before legalization.
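/// After this rewrite, the scalar access is replaced by a whole-vector access
/// to element 0 of the materialized buffer, e.g.:
///   %v = load %a[%c0] : memref<1xvector<128xf32>>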
template <typename LoadOrStoreOpPointer>
static bool vectorize(MLValue *iv, LoadOrStoreOpPointer memoryOp,
                      ArrayRef<int> vectorSize, VectorizationState *state) {
  auto *materializedMemRef = materializeVector(iv, memoryOp, vectorSize);
  auto *opStmt = cast<OperationStmt>(memoryOp->getOperation());
  MLFuncBuilder b(opStmt);
  Operation *resultOperation;
  if (auto load = opStmt->template dyn_cast<LoadOp>()) {
    auto res = createLoad(&b, opStmt->getLoc(), materializedMemRef);
    resultOperation = res->getOperation();
  } else {
    auto store = opStmt->template dyn_cast<StoreOp>();
    auto res = createStore(&b, opStmt->getLoc(), materializedMemRef, store);
    resultOperation = res->getOperation();
  }
  state->vectorizedByThisPattern.insert(cast<OperationStmt>(resultOperation));
  return false;
}
// result == true => failure.
// TODO(ntv): Status enum.
static bool vectorizeForStmt(ForStmt *loop, AffineMap upperBound,
                             ArrayRef<int> vectorSize, int64_t step,
                             VectorizationState *state) {
  LLVM_DEBUG(dbgs() << "[early-vect] vectorize loop ");
  LLVM_DEBUG(loop->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");
  using namespace functional;
  loop->setUpperBound(map(unwrapPtr<MLValue>(), loop->getUpperBoundOperands()),
                      upperBound);
  loop->setStep(step);
  FilterFunctionType notVectorizedThisRound = [state](const Statement &stmt) {
    if (!matcher::isLoadOrStore(stmt)) {
      return false;
    }
    return state->vectorizedByThisPattern.count(cast<OperationStmt>(&stmt)) ==
           0;
  };
  auto loadAndStores = matcher::Op(notVectorizedThisRound);
  auto matches = loadAndStores.match(loop);
  for (auto ls : matches) {
    auto *opStmt = cast<OperationStmt>(ls.first);
    auto load = opStmt->dyn_cast<LoadOp>();
    auto store = opStmt->dyn_cast<StoreOp>();
    LLVM_DEBUG(dbgs() << "[early-vect] vectorize op: ");
    LLVM_DEBUG(opStmt->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    bool vectorizationFails = load ? vectorize(loop, load, vectorSize, state)
                                   : vectorize(loop, store, vectorSize, state);
    LLVM_DEBUG(dbgs() << "fail: " << vectorizationFails << "\n");
    if (vectorizationFails) {
      // Early exit and trigger RAII cleanups at the root.
      return true;
    }
    // Erase the original op.
    opStmt->erase();
  }
  return false;
}
/// Returns a FilterFunctionType that can be used in MLFunctionMatcher to
/// match a loop whose underlying load/store accesses are all varying along the
/// `fastestVaryingMemRefDimension`.
/// TODO(ntv): In the future, allow more interesting mixed layout permutations
/// once we understand the performance implications better and are confident
/// we can build a cost model and a search procedure.
static FilterFunctionType
isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension) {
  return [fastestVaryingMemRefDimension](const Statement &forStmt) {
    const auto &loop = cast<ForStmt>(forStmt);
    return isVectorizableLoopAlongFastestVaryingMemRefDim(
        loop, fastestVaryingMemRefDimension);
  };
}
/// Forward-declaration.
static bool vectorizeNonRoot(MLFunctionMatches matches,
                             VectorizationState *state);
/// Apply vectorization of `loop` according to `state`. This is only triggered
/// if all vectorizations in `childrenMatches` have already succeeded
/// recursively in DFS post-order.
static bool doVectorize(ForStmt *loop, MLFunctionMatches childrenMatches,
                        VectorizationState *state) {
  // 1. DFS postorder recursion, if any of my children fails, I fail too.
  auto fail = vectorizeNonRoot(childrenMatches, state);
  if (fail) {
    // Early exit and trigger RAII cleanups at the root.
    return true;
  }
  // 2. This loop may have been omitted from vectorization for various reasons
  // (e.g. due to the performance model or pattern depth > vector size).
  assert(state->strategy->loopToVectorDim.count(loop) && "Key not found");
  int vectorDim = state->strategy->loopToVectorDim.lookup(loop);
  if (vectorDim < 0) {
    return false;
  }
  // 3. Actual post-order transformation.
  assert(static_cast<size_t>(vectorDim) < clVirtualVectorSize.size() &&
         "vector dim overflow");
  // a. get actual vector size
  auto vectorSize = clVirtualVectorSize[vectorDim];
  // b. loop transformation for early vectorization is still subject to
  //    exploratory tradeoffs (see top of the file).
  auto ubMap = loop->getUpperBoundMap();
  assert(ubMap.getRangeSizes().empty());
  // c. apply coarsening, i.e.:
  //    | ub   -> ub + vectorSize - 1
  //    | step -> step * vectorSize
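  //    For example, with vectorSize = 128, an upper bound of 1000 becomes
  //    1127 and a unit step becomes 128.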
  std::function<AffineExpr(AffineExpr)> coarsenUb =
      [vectorSize](AffineExpr expr) { return expr + vectorSize - 1; };
  auto newUbs = functional::map(coarsenUb, ubMap.getResults());
  // d. recurse
  return vectorizeForStmt(
      loop,
      AffineMap::get(ubMap.getNumDims(), ubMap.getNumSymbols(), newUbs, {}),
      clVirtualVectorSize, loop->getStep() * vectorSize, state);
}
/// Non-root patterns iterate over the matches at this level, call doVectorize
/// and exit early if anything below fails.
static bool vectorizeNonRoot(MLFunctionMatches matches,
                             VectorizationState *state) {
  for (auto m : matches) {
    auto fail = doVectorize(cast<ForStmt>(m.first), m.second, state);
    if (fail) {
      // Early exit and trigger RAII cleanups at the root.
      return true;
    }
  }
  return false;
}
/// Sets up error handling for this root loop.
/// Vectorization is a recursive procedure where anything below can fail.
/// The root match thus needs to maintain a clone for handling failure.
/// Each root may succeed independently but will otherwise clean up after
/// itself if anything below it fails.
static bool vectorizeRoot(MLFunctionMatches matches,
                          VectorizationState *state) {
  for (auto m : matches) {
    auto *loop = cast<ForStmt>(m.first);
    // Since patterns are recursive, they can very well intersect.
    // Since we do not want a fully greedy strategy in general, we decouple
    // pattern matching from profitability analysis and from application.
    // As a consequence we must check that each root pattern is still
    // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
    // TODO(ntv): implement a non-greedy profitability analysis that keeps only
    // non-intersecting patterns.
    if (!isVectorizableLoop(*loop)) {
      continue;
    }
    DenseMap<const MLValue *, MLValue *> nomap;
    MLFuncBuilder builder(loop->getFunction());
    ForStmt *clonedLoop = cast<ForStmt>(builder.clone(*loop, nomap));
    auto fail = doVectorize(loop, m.second, state);
    if (!fail) {
      LLVM_DEBUG(dbgs() << "[early-vect] success vectorizing loop: ");
      LLVM_DEBUG(loop->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
    }
    // On failure, erase the partially transformed loop and keep the pristine
    // clone; on success, drop the clone.
    fail ? loop->erase() : clonedLoop->erase();
  }
  return false;
}
/// Applies vectorization to the current MLFunction by searching over a bunch of
/// predetermined patterns.
PassResult Vectorize::runOnMLFunction(MLFunction *f) {
  /// Build a zero at the entry of the function to avoid clutter in every
  /// single vectorized loop.
  insertZeroIndex(f);
  for (auto pat : makePatterns()) {
    LLVM_DEBUG(dbgs() << "\n[early-vect] Input function is now:\n");
    LLVM_DEBUG(f->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n[early-vect] match:\n");
    auto matches = pat.match(f);
    Strategy strategy;
    auto fail = analyzeProfitability(matches, 0, pat.getDepth(), &strategy);
    assert(!fail);
    VectorizationState state;
    state.strategy = &strategy;
    fail = vectorizeRoot(matches, &state);
    assert(!fail);
  }
  return PassResult::Success;
}
FunctionPass *mlir::createVectorizePass() { return new Vectorize(); }
static PassRegistration<Vectorize>
    pass("vectorize",
         "Vectorize to a target independent n-D vector abstraction");