//===- Vectorize.cpp - Vectorize Pass Impl ----------------------*- C++ -*-===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements vectorization of loops, operations and data types to
// a target-independent, n-D virtual vector abstraction.
//
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/MLFunctionMatcher.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLValue.h"
#include "mlir/IR/SSAValue.h"
#include "mlir/IR/Types.h"
#include "mlir/Pass.h"
#include "mlir/StandardOps/StandardOps.h"
#include "mlir/Support/Functional.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace mlir;
/// This pass implements a high-level vectorization strategy at the MLFunction
/// level. It proceeds by:
///   1. matching arbitrarily nested loop patterns that are vectorizable;
///   2. analyzing those patterns for profitability;
///   3. applying those patterns iteratively by coarsening the loops, inserting
///      a single explicit vector element AllocOp and an unaligned load/store
///      operation. The full semantics of this unaligned load/store is still
///      TBD.
///
/// Loop transformation:
/// ====================
/// The choice of loop transformation to apply for coarsening vectorized loops
/// is still subject to exploratory tradeoffs. In particular, say we want to
/// vectorize by a factor of 128; then we want to transform the following
/// input:
///   for %i = %M to %N {
///     %a = load(f(i))
///   }
///
/// Traditionally, one would vectorize late (after scheduling, tiling, memory
/// promotion, etc.), say after stripmining (and potentially unrolling in the
/// case of LLVM's SLP vectorizer):
///   for %i = floor(%M, 128) to ceil(%N, 128) {
///     for %ii = max(%M, 128 * %i) to min(%N, 128 * %i + 127) {
///       load/store(f(ii))
///     }
///   }
///
/// We seek to vectorize early and freeze vector types before scheduling, so
/// we want to generate a pattern that resembles:
///   for %i = ? to ? step ? {
///     unaligned_load/unaligned_store(g(i))
///   }
///
/// There are at least two ways to get there, each with tradeoffs:
///   i. simply dividing the lower / upper bounds by 128 creates issues
///      with representing expressions such as ii + 1 because now we only
///      have access to original values that have been divided. Additional
///      information is needed to specify accesses at below-128 granularity;
///   ii. another alternative is to coarsen the loop step, but this may have
///       consequences on dependence analysis and fusability of loops: fusable
///       loops probably need to have the same step (because we don't want to
///       stripmine/unroll to enable fusion).
/// As a consequence, we choose to represent the coarsening using the loop
/// step for now and reevaluate in the future. Note that we can renormalize
/// loop steps later if/when we have evidence that they are problematic.
///
/// For the simple strawman example above, vectorizing for a 1-D vector
/// abstraction of size 128 returns code similar to:
///   %c0 = constant 0 : index
///   for %i = %M to %N + 127 step 128 {
///     %a = alloc() : memref<1xvector<128xf32>>
///     %r = "n_d_unaligned_load"(%tensor, %i, %a, %c0)
///     %16 = load %a[%c0] : memref<1xvector<128xf32>>
///   }
///
/// Note that this is still a work in progress and not yet functional.
#define DEBUG_TYPE "early-vect"
static cl::list<int> clVirtualVectorSize(
    "virtual-vector-size",
    cl::desc("Specify n-D virtual vector size for vectorization"),
    cl::ZeroOrMore);
static cl::list<int> clFastestVaryingPattern(
    "test-fastest-varying",
    cl::desc("Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "
             "dimensions to match. See defaultPatterns in Vectorize.cpp for a "
             "description and examples. This is used for testing purposes"),
    cl::ZeroOrMore);
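// For example, assuming the standard mlir-opt pass driver, a 2-D virtual
// vector size of 32x256 can be requested with (illustrative invocation):
//   mlir-opt -vectorize -virtual-vector-size=32 -virtual-vector-size=256 f.mlir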
/// Forward declaration.
static FilterFunctionType
isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension);
/// Builds a bunch of predetermined patterns that will be traversed in order.
/// Due to the recursive nature of MLFunctionMatchers, this captures
/// arbitrarily nested pairs of loops at any position in the tree.
/// Note that this currently only matches fixed-depth nests (up to 3 loops)
/// and will be extended.
// TODO(ntv): support 3-D loop patterns with a common reduction loop that can
// be matched to GEMMs.
static std::vector<MLFunctionMatcher> defaultPatterns() {
  using matcher::For;
  return std::vector<MLFunctionMatcher>{
      // 3-D patterns
      For(isVectorizableLoopPtrFactory(2),
          For(isVectorizableLoopPtrFactory(1),
              For(isVectorizableLoopPtrFactory(0)))),
      // for i { for j { A[??f(not i, not j), f(i, not j), f(not i, j)];}}
      // test independently with:
      //   --test-fastest-varying=1 --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(1),
          For(isVectorizableLoopPtrFactory(0))),
      // for i { for j { A[??f(not i, not j), f(i, not j), ?, f(not i, j)];}}
      // test independently with:
      //   --test-fastest-varying=2 --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(2),
          For(isVectorizableLoopPtrFactory(0))),
      // for i { for j { A[??f(not i, not j), f(i, not j), ?, ?, f(not i, j)];}}
      // test independently with:
      //   --test-fastest-varying=3 --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(3),
          For(isVectorizableLoopPtrFactory(0))),
      // for i { for j { A[??f(not i, not j), f(not i, j), f(i, not j)];}}
      // test independently with:
      //   --test-fastest-varying=0 --test-fastest-varying=1
      For(isVectorizableLoopPtrFactory(0),
          For(isVectorizableLoopPtrFactory(1))),
      // for i { for j { A[??f(not i, not j), f(not i, j), ?, f(i, not j)];}}
      // test independently with:
      //   --test-fastest-varying=0 --test-fastest-varying=2
      For(isVectorizableLoopPtrFactory(0),
          For(isVectorizableLoopPtrFactory(2))),
      // for i { for j { A[??f(not i, not j), f(not i, j), ?, ?, f(i, not j)];}}
      // test independently with:
      //   --test-fastest-varying=0 --test-fastest-varying=3
      For(isVectorizableLoopPtrFactory(0),
          For(isVectorizableLoopPtrFactory(3))),
      // for i { A[??f(not i), f(i)];}
      // test independently with: --test-fastest-varying=0
      For(isVectorizableLoopPtrFactory(0)),
      // for i { A[??f(not i), f(i), ?];}
      // test independently with: --test-fastest-varying=1
      For(isVectorizableLoopPtrFactory(1)),
      // for i { A[??f(not i), f(i), ?, ?];}
      // test independently with: --test-fastest-varying=2
      For(isVectorizableLoopPtrFactory(2)),
      // for i { A[??f(not i), f(i), ?, ?, ?];}
      // test independently with: --test-fastest-varying=3
      For(isVectorizableLoopPtrFactory(3))};
}
static std::vector<MLFunctionMatcher> makePatterns() {
  using matcher::For;
  if (clFastestVaryingPattern.empty()) {
    return defaultPatterns();
  }
  switch (clFastestVaryingPattern.size()) {
  case 1:
    return {For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]))};
  case 2:
    return {For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]),
                For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[1])))};
  case 3:
    return {For(
        isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]),
        For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[1]),
            For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[2]))))};
  default:
    assert(false && "Only up to 3-D fastest varying pattern supported atm");
  }
  return std::vector<MLFunctionMatcher>();
}
namespace {
struct Vectorize : public FunctionPass {
  PassResult runOnMLFunction(MLFunction *f) override;
  // Thread-safe RAII contexts local to pass, BumpPtrAllocator freed on exit.
  MLFunctionMatcherContext MLContext;
  static char passID;
};
} // end anonymous namespace
char Vectorize::passID = 0;
/////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate. //////
namespace {
struct Strategy {
  // Vector dimension each matched loop maps to; -1 means the loop is not
  // vectorized. Stored as int (rather than unsigned) so the -1 sentinel is
  // explicit.
  DenseMap<ForStmt *, int> loopToVectorDim;
};
} // end anonymous namespace
/// Implements a simple strawman strategy for vectorization.
/// Given a matched pattern `matches` of depth `patternDepth`, this strategy
/// greedily assigns the fastest varying dimension **of the vector** to the
/// innermost loop in the pattern.
/// When coupled with a pattern that looks for the fastest varying dimension in
/// load/store MemRefs, this creates a generic vectorization strategy that works
/// for any loop in a hierarchy (outermost, innermost or intermediate).
///
/// TODO(ntv): In the future we should additionally increase the power of the
/// profitability analysis along 3 directions:
///   1. account for loop extents (both static and parametric + annotations);
///   2. account for data layout permutations;
///   3. account for impact of vectorization on maximal loop fusion.
/// Then we can quantify the above to build a cost model and search over
/// strategies.
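///
/// For example, with patternDepth = 3 and a 2-D virtual vector size, the
/// outermost matched loop is not vectorized (-1), the middle loop maps to
/// vector dimension 0 and the innermost loop to vector dimension 1.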
static bool analyzeProfitability(MLFunctionMatches matches,
                                 unsigned depthInPattern, unsigned patternDepth,
                                 Strategy *strategy) {
  for (auto m : matches) {
    auto *loop = cast<ForStmt>(m.first);
    LLVM_DEBUG(dbgs() << "[early-vect][profitability] patternDepth: "
                      << patternDepth << " depthInPattern: " << depthInPattern
                      << " loop ");
    LLVM_DEBUG(loop->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    bool fail = analyzeProfitability(m.second, depthInPattern + 1,
                                     patternDepth, strategy);
    if (fail) {
      return fail;
    }
    assert(patternDepth > depthInPattern && "pattern depth mismatch");
    if (patternDepth - depthInPattern <= clVirtualVectorSize.size()) {
      strategy->loopToVectorDim[loop] =
          clVirtualVectorSize.size() - (patternDepth - depthInPattern);
      LLVM_DEBUG(dbgs() << "[early-vect][profitability] vectorize @ "
                        << strategy->loopToVectorDim[loop] << " loop ");
      LLVM_DEBUG(loop->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
    } else {
      // Don't vectorize: the pattern is deeper than the virtual vector rank.
      strategy->loopToVectorDim[loop] = -1;
    }
  }
  return false;
}
///// end TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate /////
////// TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. ////
/// Gets a MemRefType of 1 vector with the same elemental type as `tmpl` and
/// sizes specified by `vectorSizes`. The MemRef lives in the same memory space
/// as `tmpl`. The MemRef should be promoted to a closer memory address space
/// in a later pass.
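///
/// For example (an illustrative sketch), with vectorSizes = {32, 256} this
/// maps:
///   memref<42x64xf32, 1>  ~~>  memref<1xvector<32x256xf32>, 1>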
static MemRefType getVectorizedMemRefType(MemRefType tmpl,
                                          ArrayRef<int> vectorSizes) {
  auto elementType = tmpl.getElementType();
  assert(!elementType.dyn_cast<VectorType>() &&
         "Can't vectorize an already vector type");
  assert(tmpl.getAffineMaps().empty() &&
         "Unsupported non-implicit identity map");
  return MemRefType::get({1}, VectorType::get(vectorSizes, elementType), {},
                         tmpl.getMemorySpace());
}
/// Creates an unaligned load with the following semantics:
/// 1. TODO(ntv): apply a `srcMap` to a `srcIndex` to represent a `srcMemRef`
///    slice + permutations for loading from non-fastest varying dimensions.
///    Note that generally, the fastest varying dimension should be part of the
///    map otherwise global layout changes are likely needed to obtain an
///    efficient load. This is an orthogonal cost model consideration;
/// 2. load from the `srcMemRef` resulting from 1.;
/// 3. store into a `dstMemRef` starting at offset `dstIndex`;
/// 4. copy sizeof(dstMemRef) bytes with adjustments for boundaries;
/// 5. TODO(ntv): broadcast along `broadcastMap` inside the `dstMemRef` to
///    support patterns like scalar-to-vector and N-k dim MemRef slice.
/// The copy may overflow on the src side but not on the dst side. If the copy
/// overflows on the src side, the `dstMemRef` will be padded with enough values
/// to fill it completely.
///
/// Usage:
/// This n_d_unaligned_load op will be implemented as a PseudoOp for different
/// backends. In its current form it is only used to load into a <1xvector>,
/// where the vector may have any shape that is some multiple of the
/// hardware-specific vector size used to implement the PseudoOp efficiently.
/// This is used to implement "non-effecting padding" for early vectorization
/// and allows higher-level passes in the codegen to not worry about
/// hardware-specific implementation details.
///
/// TODO(ntv):
/// 1. implement this end-to-end for some backend;
/// 2. support operation-specific padding values to properly implement
///    "non-effecting padding";
/// 3. support input map for on-the-fly transpositions (point 1 above);
/// 4. support broadcast map (point 5 above).
///
/// TODO(andydavis,bondhugula,ntv):
/// 1. generalize to support padding semantics and offsets within vector type.
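///
/// For instance, for a 2-D source access this emits an op resembling (types
/// and result elided):
///   "n_d_unaligned_load"(%src, %i, %j, %dst, %c0)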
static void createUnalignedLoad(MLFuncBuilder *b, Location *loc,
                                SSAValue *srcMemRef,
                                ArrayRef<SSAValue *> srcIndices,
                                SSAValue *dstMemRef,
                                ArrayRef<SSAValue *> dstIndices) {
  SmallVector<SSAValue *, 8> operands;
  operands.reserve(1 + srcIndices.size() + 1 + dstIndices.size());
  operands.insert(operands.end(), srcMemRef);
  operands.insert(operands.end(), srcIndices.begin(), srcIndices.end());
  operands.insert(operands.end(), dstMemRef);
  operands.insert(operands.end(), dstIndices.begin(), dstIndices.end());
  using functional::map;
  std::function<Type(SSAValue *)> getType = [](SSAValue *v) -> Type {
    return v->getType();
  };
  auto types = map(getType, operands);
  OperationState opState(b->getContext(), loc, "n_d_unaligned_load", operands,
                         types);
  b->createOperation(opState);
}
/// Creates an unaligned store with the following semantics:
/// 1. TODO(ntv): apply a `srcMap` to a `srcIndex` to represent a `srcMemRef`
///    slice to support patterns like vector-to-scalar and N-k dim MemRef
///    slice. This is used as the counterpart to the broadcast map in the
///    UnalignedLoad;
/// 2. load from the `srcMemRef` resulting from 1.;
/// 3. store into a `dstMemRef` starting at offset `dstIndex`;
/// 4. TODO(ntv): apply a `dstMap` to a `dstIndex` to represent a `dstMemRef`
///    slice + permutations for storing into non-fastest varying dimensions.
///    Note that generally, the fastest varying dimension should be part of the
///    map otherwise global layout changes are likely needed to obtain an
///    efficient store. This is an orthogonal cost model consideration;
/// 5. copy sizeof(srcMemRef) bytes with adjustments for boundaries.
/// The copy may overflow on the dst side but not on the src side. If the copy
/// overflows on the dst side, the underlying implementation needs to resolve
/// potential races.
///
/// Usage:
/// This n_d_unaligned_store op will be implemented as a PseudoOp for
/// different backends. In its current form it is only used to store from a
/// <1xvector>, where the vector may have any shape that is some multiple of
/// the hardware-specific vector size used to implement the PseudoOp
/// efficiently. This is used to implement "non-effecting padding" for early
/// vectorization and allows higher-level passes in the codegen to not worry
/// about hardware-specific implementation details.
///
/// TODO(ntv):
/// 1. implement this end-to-end for some backend;
/// 2. support write-back in the presence of races;
/// 3. support input map for counterpart of broadcast (point 1 above);
/// 4. support dstMap for writing back in non-contiguous memory regions
///    (point 4 above).
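///
/// For instance, storing a <1xvector> back into a 2-D destination emits an op
/// resembling (types elided):
///   "n_d_unaligned_store"(%src, %c0, %dst, %i, %j)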
static void createUnalignedStore(MLFuncBuilder *b, Location *loc,
                                 SSAValue *srcMemRef,
                                 ArrayRef<SSAValue *> srcIndices,
                                 SSAValue *dstMemRef,
                                 ArrayRef<SSAValue *> dstIndices) {
  SmallVector<SSAValue *, 8> operands;
  operands.reserve(1 + srcIndices.size() + 1 + dstIndices.size());
  operands.insert(operands.end(), srcMemRef);
  operands.insert(operands.end(), srcIndices.begin(), srcIndices.end());
  operands.insert(operands.end(), dstMemRef);
  operands.insert(operands.end(), dstIndices.begin(), dstIndices.end());
  using functional::map;
  std::function<Type(SSAValue *)> getType = [](SSAValue *v) -> Type {
    return v->getType();
  };
  auto types = map(getType, operands);
  OperationState opState(b->getContext(), loc, "n_d_unaligned_store", operands,
                         types);
  b->createOperation(opState);
}
/// The current implementation of vectorization materializes an AllocOp of
/// MemRef<1 x vector_type> + a custom unaligned load/store PseudoOp.
/// The vector load/store accessing this MemRef always accesses element 0, so
/// we just memoize a single 0 SSAValue, once upon function entry, to avoid
/// clutter. We create one such SSAValue per function.
static SSAValue *insertZeroIndex(MLFunction *f) {
  static thread_local DenseMap<MLFunction *, SSAValue *> zeros;
  if (zeros.find(f) == zeros.end()) {
    MLFuncBuilder b(f);
    auto zero = b.create<ConstantIndexOp>(b.getUnknownLoc(), 0);
    zeros.insert(std::make_pair(f, zero));
  }
  return zeros.lookup(f);
}
/// Unwraps a pointer type to another type (possibly the same).
/// Used in particular to allow easier compositions of
/// llvm::iterator_range<ForStmt::operand_iterator> types.
template <typename T, typename ToType = T>
static std::function<ToType *(T *)> unwrapPtr() {
  return [](T *val) { return dyn_cast<ToType>(val); };
}
/// Materializes the n-D vector into unpromoted temporary storage and
/// explicitly copies into it. Materialization occurs in a MemRef containing 1
/// vector that lives in the same memory space as the base MemRef. Later passes
/// should make the decision to promote this materialization to a faster
/// address space.
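///
/// As a sketch, for a scalar %v = load %A[%i, %j], this emits a companion
/// buffer and copy resembling:
///   %a = alloc() : memref<1xvector<...>>
///   "n_d_unaligned_load"(%A, %i, %j, %a, %c0)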
template <typename LoadOrStoreOpPointer>
static MLValue *materializeVector(MLValue *iv, LoadOrStoreOpPointer memoryOp,
                                  ArrayRef<int> vectorSize) {
  auto memRefType =
      memoryOp->getMemRef()->getType().template cast<MemRefType>();
  auto vectorMemRefType = getVectorizedMemRefType(memRefType, vectorSize);
  // Materialize a MemRef with 1 vector.
  auto *opStmt = cast<OperationStmt>(memoryOp->getOperation());
  MLFuncBuilder b(opStmt);
  // Create an AllocOp to apply the new shape.
  auto allocOp = b.create<AllocOp>(opStmt->getLoc(), vectorMemRefType,
                                   ArrayRef<SSAValue *>{});
  // The MemRef of the original scalar access.
  auto *originalMemRef = memoryOp->getMemRef();
  using namespace functional;
  if (opStmt->template isa<LoadOp>()) {
    createUnalignedLoad(&b, opStmt->getLoc(), originalMemRef,
                        map(unwrapPtr<SSAValue>(), memoryOp->getIndices()),
                        allocOp->getResult(),
                        {insertZeroIndex(iv->getFunction())});
  } else {
    createUnalignedStore(&b, opStmt->getLoc(), allocOp->getResult(),
                         {insertZeroIndex(iv->getFunction())}, originalMemRef,
                         map(unwrapPtr<SSAValue>(), memoryOp->getIndices()));
  }
  return cast<MLValue>(allocOp->getResult());
}
/// end TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. ///
namespace {
struct VectorizationState {
  // `vectorizedByThisPattern` keeps track of statements that have already been
  // vectorized by this pattern. This allows distinguishing between the
  // load/store ops newly created by this pattern and the remaining ops still
  // to be processed.
  DenseSet<OperationStmt *> vectorizedByThisPattern;
  DenseSet<ForStmt *> vectorized;
  const Strategy *strategy;
};
} // end anonymous namespace
/// Terminal function for creating a LoadOp.
static OpPointer<LoadOp> createLoad(MLFuncBuilder *b, Location *loc,
                                    MLValue *memRef) {
  return b->create<LoadOp>(
      loc, memRef,
      ArrayRef<SSAValue *>{insertZeroIndex(memRef->getFunction())});
}
/// Terminal function for creating a StoreOp.
static OpPointer<StoreOp> createStore(MLFuncBuilder *b, Location *loc,
                                      MLValue *memRef,
                                      OpPointer<StoreOp> store) {
  return b->create<StoreOp>(
      loc, store->getValueToStore(), memRef,
      ArrayRef<SSAValue *>{insertZeroIndex(memRef->getFunction())});
}
/// Vectorizes the `memoryOp` of type LoadOp or StoreOp along loop `iv` by
/// factor `vectorSize`.
/// In a first implementation, this triggers materialization of a vector Alloc.
// TODO(ntv): this could be a view that changes the underlying element type.
// Materialization of this view may or may not happen before legalization.
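/// After this rewrite, the scalar access is replaced by a whole-vector access
/// to element 0 of the materialized buffer, e.g.:
///   %v = load %a[%c0] : memref<1xvector<128xf32>>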
template <typename LoadOrStoreOpPointer>
static bool vectorize(MLValue *iv, LoadOrStoreOpPointer memoryOp,
                      ArrayRef<int> vectorSize, VectorizationState *state) {
  auto *materializedMemRef = materializeVector(iv, memoryOp, vectorSize);
  auto *opStmt = cast<OperationStmt>(memoryOp->getOperation());
  MLFuncBuilder b(opStmt);
  Operation *resultOperation;
  if (auto load = opStmt->template dyn_cast<LoadOp>()) {
    auto res = createLoad(&b, opStmt->getLoc(), materializedMemRef);
    resultOperation = res->getOperation();
  } else {
    auto store = opStmt->template dyn_cast<StoreOp>();
    auto res = createStore(&b, opStmt->getLoc(), materializedMemRef, store);
    resultOperation = res->getOperation();
  }
  state->vectorizedByThisPattern.insert(cast<OperationStmt>(resultOperation));
  return false;
}
// result == true => failure.
// TODO(ntv): Status enum.
static bool vectorizeForStmt(ForStmt *loop, AffineMap upperBound,
                             ArrayRef<int> vectorSize, int64_t step,
                             VectorizationState *state) {
  LLVM_DEBUG(dbgs() << "[early-vect] vectorize loop ");
  LLVM_DEBUG(loop->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");
  using namespace functional;
  loop->setUpperBound(map(unwrapPtr<MLValue>(), loop->getUpperBoundOperands()),
                      upperBound);
  loop->setStep(step);
  FilterFunctionType notVectorizedThisRound = [state](const Statement &stmt) {
    if (!matcher::isLoadOrStore(stmt)) {
      return false;
    }
    return state->vectorizedByThisPattern.count(cast<OperationStmt>(&stmt)) ==
           0;
  };
  auto loadAndStores = matcher::Op(notVectorizedThisRound);
  auto matches = loadAndStores.match(loop);
  for (auto ls : matches) {
    auto *opStmt = cast<OperationStmt>(ls.first);
    auto load = opStmt->dyn_cast<LoadOp>();
    auto store = opStmt->dyn_cast<StoreOp>();
    LLVM_DEBUG(dbgs() << "[early-vect] vectorize op: ");
    LLVM_DEBUG(opStmt->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    bool vectorizationFails = load ? vectorize(loop, load, vectorSize, state)
                                   : vectorize(loop, store, vectorSize, state);
    LLVM_DEBUG(dbgs() << "fail: " << vectorizationFails << "\n");
    if (vectorizationFails) {
      // Early exit and trigger RAII cleanups at the root.
      return true;
    }
    // Erase the original op.
    opStmt->erase();
  }
  return false;
}
/// Returns a FilterFunctionType that can be used in MLFunctionMatcher to
/// match a loop whose underlying load/store accesses are all varying along the
/// `fastestVaryingMemRefDimension`.
/// TODO(ntv): In the future, allow more interesting mixed layout permutations
/// once we understand the performance implications better and are confident
/// we can build a cost model and a search procedure.
static FilterFunctionType
isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension) {
  return [fastestVaryingMemRefDimension](const Statement &forStmt) {
    const auto &loop = cast<ForStmt>(forStmt);
    return isVectorizableLoopAlongFastestVaryingMemRefDim(
        loop, fastestVaryingMemRefDimension);
  };
}
/// Forward-declaration.
static bool vectorizeNonRoot(MLFunctionMatches matches,
                             VectorizationState *state);
/// Apply vectorization of `loop` according to `state`. This is only triggered
/// if all vectorizations in `childrenMatches` have already succeeded
/// recursively in DFS post-order.
static bool doVectorize(ForStmt *loop, MLFunctionMatches childrenMatches,
                        VectorizationState *state) {
  // 1. DFS postorder recursion, if any of my children fails, I fail too.
  auto fail = vectorizeNonRoot(childrenMatches, state);
  if (fail) {
    // Early exit and trigger RAII cleanups at the root.
    return true;
  }
  // 2. This loop may have been omitted from vectorization for various reasons
  // (e.g. due to the performance model or pattern depth > vector size).
  assert(state->strategy->loopToVectorDim.count(loop) && "Key not found");
  int vectorDim = state->strategy->loopToVectorDim.lookup(loop);
  if (vectorDim < 0) {
    return false;
  }
  // 3. Actual post-order transformation.
  assert(static_cast<size_t>(vectorDim) < clVirtualVectorSize.size() &&
         "vector dim overflow");
  // a. get actual vector size
  auto vectorSize = clVirtualVectorSize[vectorDim];
  // b. loop transformation for early vectorization is still subject to
  //    exploratory tradeoffs (see top of the file).
  auto ubMap = loop->getUpperBoundMap();
  assert(ubMap.getRangeSizes().empty());
  // c. apply coarsening, i.e.:
  //    | ub   -> ub + vectorSize - 1
  //    | step -> step * vectorSize
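  //    For example, with vectorSize = 128, an upper bound of 1000 becomes
  //    1127 and a unit step becomes 128.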
  std::function<AffineExpr(AffineExpr)> coarsenUb =
      [vectorSize](AffineExpr expr) { return expr + vectorSize - 1; };
  auto newUbs = functional::map(coarsenUb, ubMap.getResults());
  // d. recurse
  return vectorizeForStmt(
      loop,
      AffineMap::get(ubMap.getNumDims(), ubMap.getNumSymbols(), newUbs, {}),
      clVirtualVectorSize, loop->getStep() * vectorSize, state);
}
/// Non-root patterns iterate over the matches at this level, call doVectorize
/// and exit early if anything below fails.
static bool vectorizeNonRoot(MLFunctionMatches matches,
                             VectorizationState *state) {
  for (auto m : matches) {
    auto fail = doVectorize(cast<ForStmt>(m.first), m.second, state);
    if (fail) {
      // Early exit and trigger RAII cleanups at the root.
      return true;
    }
  }
  return false;
}
/// Sets up error handling for this root loop.
/// Vectorization is a recursive procedure where anything below can fail.
/// The root match thus needs to maintain a clone for handling failure.
/// Each root may succeed independently but will otherwise clean up after
/// itself if anything below it fails.
static bool vectorizeRoot(MLFunctionMatches matches,
                          VectorizationState *state) {
  for (auto m : matches) {
    auto *loop = cast<ForStmt>(m.first);
    // Since patterns are recursive, they can very well intersect.
    // Since we do not want a fully greedy strategy in general, we decouple
    // pattern matching from profitability analysis and from application.
    // As a consequence we must check that each root pattern is still
    // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
    // TODO(ntv): implement a non-greedy profitability analysis that keeps only
    // non-intersecting patterns.
    if (!isVectorizableLoop(*loop)) {
      continue;
    }
    DenseMap<const MLValue *, MLValue *> nomap;
    MLFuncBuilder builder(loop->getFunction());
    ForStmt *clonedLoop = cast<ForStmt>(builder.clone(*loop, nomap));
    auto fail = doVectorize(loop, m.second, state);
    if (!fail) {
      LLVM_DEBUG(dbgs() << "[early-vect] success vectorizing loop: ");
      LLVM_DEBUG(loop->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
    }
    // On failure, erase the partially transformed loop and keep the pristine
    // clone; on success, drop the clone.
    fail ? loop->erase() : clonedLoop->erase();
  }
  return false;
}
/// Applies vectorization to the current MLFunction by searching over a bunch of
/// predetermined patterns.
PassResult Vectorize::runOnMLFunction(MLFunction *f) {
  /// Build a zero at the entry of the function to avoid clutter in every
  /// single vectorized loop.
  insertZeroIndex(f);
  for (auto pat : makePatterns()) {
    LLVM_DEBUG(dbgs() << "\n[early-vect] Input function is now:\n");
    LLVM_DEBUG(f->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n[early-vect] match:\n");
    auto matches = pat.match(f);
    Strategy strategy;
    auto fail = analyzeProfitability(matches, 0, pat.getDepth(), &strategy);
    assert(!fail);
    VectorizationState state;
    state.strategy = &strategy;
    fail = vectorizeRoot(matches, &state);
    assert(!fail);
  }
  return PassResult::Success;
}
FunctionPass *mlir::createVectorizePass() { return new Vectorize(); }
static PassRegistration<Vectorize>
    pass("vectorize",
         "Vectorize to a target independent n-D vector abstraction");