| //===- MaterializeVectors.cpp - MaterializeVectors Pass Impl ----*- C++ -*-===// |
| // |
| // Copyright 2019 The MLIR Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // ============================================================================= |
| // |
| // This file implements target-dependent materialization of super-vectors to |
| // vectors of the proper size for the hardware. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/AffineOps/AffineOps.h" |
| #include "mlir/Analysis/AffineAnalysis.h" |
| #include "mlir/Analysis/Dominance.h" |
| #include "mlir/Analysis/LoopAnalysis.h" |
| #include "mlir/Analysis/NestedMatcher.h" |
| #include "mlir/Analysis/SliceAnalysis.h" |
| #include "mlir/Analysis/Utils.h" |
| #include "mlir/Analysis/VectorAnalysis.h" |
| #include "mlir/IR/AffineExpr.h" |
| #include "mlir/IR/AffineMap.h" |
| #include "mlir/IR/Attributes.h" |
| #include "mlir/IR/Builders.h" |
| #include "mlir/IR/BuiltinOps.h" |
| #include "mlir/IR/Location.h" |
| #include "mlir/IR/OperationSupport.h" |
| #include "mlir/IR/Types.h" |
| #include "mlir/Pass/Pass.h" |
| #include "mlir/StandardOps/StandardOps.h" |
| #include "mlir/SuperVectorOps/SuperVectorOps.h" |
| #include "mlir/Support/Functional.h" |
| #include "mlir/Support/LLVM.h" |
| #include "mlir/Transforms/Passes.h" |
| |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/raw_ostream.h" |
| |
| /// |
| /// Implements target-dependent materialization of virtual super-vectors to |
| /// vectors of the proper size for the hardware. |
| /// |
| /// While the physical vector size is target-dependent, the pass is written in |
| /// a target-independent way: the target vector size is specified as a parameter |
| /// to the pass. This pass is thus a partial lowering that opens the "greybox" |
| /// that is the super-vector abstraction. In particular, this pass can turn the |
| /// vector_transfer_read and vector_transfer_write ops in either: |
| /// 1. a loop nest with either scalar and vector load/store instructions; or |
| /// 2. a loop-nest with DmaStartOp / DmaWaitOp; or |
| /// 3. a pre-existing blackbox library call that can be written manually or |
| /// synthesized using search and superoptimization. |
| /// An important feature that either of these 3 target lowering abstractions |
| /// must handle is the handling of "non-effecting" padding with the proper |
| /// neutral element in order to guarantee that all "partial tiles" are actually |
| /// "full tiles" in practice. |
| /// |
| /// In particular this pass is a MLIR-MLIR rewriting and does not concern itself |
| /// with target-specific instruction-selection and register allocation. These |
| /// will happen downstream in LLVM. |
| /// |
| /// In this sense, despite performing lowering to a target-dependent size, this |
| /// pass is still target-agnostic. |
| /// |
| /// Implementation details |
| /// ====================== |
| /// The current decisions made by the super-vectorization pass guarantee that |
| /// use-def chains do not escape an enclosing vectorized AffineForOp. In other |
| /// words, this pass operates on a scoped program slice. Furthermore, since we |
| /// do not vectorize in the presence of conditionals for now, sliced chains are |
| /// guaranteed not to escape the innermost scope, which has to be either the top |
| /// Function scope or the innermost loop scope, by construction. As a |
| /// consequence, the implementation just starts from vector_transfer_write |
| /// operations and builds the slice scoped the innermost loop enclosing the |
| /// current vector_transfer_write. These assumptions and the implementation |
| /// details are subject to revision in the future. |
| /// |
| /// Example |
| /// ======== |
| /// In the following, the single vector_transfer_write op operates on a |
| /// vector<4x4x4xf32>. Let's assume the HW supports vector<4x4xf32>. |
| /// Materialization is achieved by instantiating each occurrence of the leading |
| /// dimension of vector<4x4x4xf32> into a vector<4x4xf32>. |
| /// The program transformation that implements this instantiation is a |
| /// multi-loop unroll-and-jam (it can be partial or full depending on the ratio |
| /// of super-vector shape to HW-vector shape). |
| /// |
| /// As a simple case, the following: |
| /// |
| /// ```mlir |
| /// mlfunc @materialize(%M : index, %N : index, %O : index, %P : index) { |
| /// %A = alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0> |
| /// %f1 = constant splat<vector<4x4x4xf32>, 1.000000e+00> : |
| /// vector<4x4x4xf32> for %i0 = 0 to %M step 4 { |
| /// for %i1 = 0 to %N step 4 { |
| /// for %i2 = 0 to %O { |
| /// for %i3 = 0 to %P step 4 { |
| /// vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 |
| /// {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : |
| /// vector<4x4x4xf32>, memref<?x?x?x?xf32, 0>, |
| /// index, index, index, index |
| /// }}}} |
| /// return |
| /// } |
| /// ``` |
| /// |
| /// is instantiated by unroll-and-jam (just unroll in this case) into: |
| /// |
| /// ```mlir |
| /// mlfunc @materialize(%M : index, %N : index, %O : index, %P : index) { |
| /// %A = alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0> |
| /// %f1 = constant splat<vector<4x4xf32>, 1.000000e+00> : vector<4x4x4xf32> |
| /// for %i0 = 0 to %arg0 step 4 { |
| /// for %i1 = 0 to %arg1 step 4 { |
| /// for %i2 = 0 to %arg2 { |
| /// for %i3 = 0 to %arg3 step 4 { |
| /// %1 = affine.apply (d0, d1, d2, d3) -> (d0, d1, d2, d3) |
| /// (%i0, %i1, %i2, %i3) |
| /// vector_transfer_write f1, %0, %1#0, %1#1, %1#2, %1#3 |
| /// {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} : |
| /// vector<4x4xf32>, memref<?x?x?x?xf32>, |
| /// index, index, index, index |
| /// %2 = affine.apply (d0, d1, d2, d3) -> (d0, d1, d2, d3 + 1) |
| /// (%i0, %i1, %i2, %i3) |
| /// vector_transfer_write {{.*}}, %0, %2#0, %2#1, %2#2, %2#3 |
| /// {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} : |
| /// vector<4x4xf32>, memref<?x?x?x?xf32>, |
| /// index, index, index, index |
| /// %3 = affine.apply (d0, d1, d2, d3) -> (d0, d1, d2, d3 + 2) |
| /// (%i0, %i1, %i2, %i3) |
| /// vector_transfer_write {{.*}}, %0, %3#0, %3#1, %3#2, %3#3 |
| /// {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} : |
| /// vector<4x4xf32>, memref<?x?x?x?xf32>, |
| /// index, index, index, index |
| /// %4 = affine.apply (d0, d1, d2, d3) -> (d0, d1, d2, d3 + 3) |
| /// (%i0, %i1, %i2, %i3) |
| /// vector_transfer_write {{.*}}, %0, %4#0, %4#1, %4#2, %4#3 |
| /// {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} : |
| /// vector<4x4xf32>, memref<?x?x?x?xf32>, |
| /// index, index, index, index |
| /// }}}} |
| /// return |
| /// } |
| /// ``` |
| |
| using llvm::dbgs; |
| using llvm::DenseSet; |
| using llvm::SetVector; |
| |
| using namespace mlir; |
| |
| using functional::makePtrDynCaster; |
| using functional::map; |
| |
| static llvm::cl::list<int> |
| clVectorSize("vector-size", |
| llvm::cl::desc("Specify the HW vector size for vectorization"), |
| llvm::cl::ZeroOrMore); |
| |
| #define DEBUG_TYPE "materialize-vect" |
| |
| namespace { |
| struct MaterializationState { |
| /// In practice, the determination of the HW-specific vector type to use when |
| /// lowering a super-vector type must be based on the elemental type. The |
| /// elemental type must be retrieved from the super-vector type. In the future |
| /// information about hardware vector type for a particular elemental type |
| /// will be part of the contract between MLIR and the backend. |
| /// |
| /// For example, 8xf32 has the same size as 16xf16 but the targeted HW itself |
| /// may exhibit the following property: |
| /// 1. have a special unit for a 128xf16 datapath; |
| /// 2. no F16 FPU support on the regular 8xf32/16xf16 vector datapath. |
| /// |
| /// For now, we just assume hwVectorSize has the proper information regardless |
| /// of the type and we assert everything is f32. |
| /// TODO(ntv): relax the assumptions on admissible element type once a |
| /// contract exists. |
| MaterializationState() : hwVectorSize(clVectorSize.size(), 0) { |
| std::copy(clVectorSize.begin(), clVectorSize.end(), hwVectorSize.begin()); |
| } |
| SmallVector<int64_t, 8> hwVectorSize; |
| VectorType superVectorType; |
| VectorType hwVectorType; |
| SmallVector<unsigned, 8> hwVectorInstance; |
| DenseMap<const Value *, Value *> *substitutionsMap; |
| }; |
| |
| struct MaterializeVectorsPass : public FunctionPass { |
| MaterializeVectorsPass() : FunctionPass(&MaterializeVectorsPass::passID) {} |
| |
| PassResult runOnFunction(Function *f) override; |
| |
| static char passID; |
| }; |
| |
| } // end anonymous namespace |
| |
| char MaterializeVectorsPass::passID = 0; |
| |
| /// Given a shape with sizes greater than 0 along all dimensions, |
| /// returns the distance, in number of elements, between a slice in a dimension |
| /// and the next slice in the same dimension. |
| /// e.g. shape[3, 4, 5] -> strides[20, 5, 1] |
| static SmallVector<unsigned, 8> makeStrides(ArrayRef<unsigned> shape) { |
| SmallVector<unsigned, 8> tmp; |
| tmp.reserve(shape.size()); |
| unsigned running = 1; |
| for (auto rit = shape.rbegin(), reit = shape.rend(); rit != reit; ++rit) { |
| assert(*rit > 0 && "size must be greater than 0 along all dimensions of " |
| "shape"); |
| tmp.push_back(running); |
| running *= *rit; |
| } |
| return SmallVector<unsigned, 8>(tmp.rbegin(), tmp.rend()); |
| } |
| |
| /// Given a shape with sizes greater than 0 along all dimensions, returns the |
| /// delinearized components of linearIndex along shape. |
| static SmallVector<unsigned, 8> delinearize(unsigned linearIndex, |
| ArrayRef<unsigned> shape) { |
| SmallVector<unsigned, 8> res; |
| res.reserve(shape.size()); |
| auto strides = makeStrides(shape); |
| for (unsigned idx = 0; idx < strides.size(); ++idx) { |
| assert(strides[idx] > 0); |
| auto val = linearIndex / strides[idx]; |
| res.push_back(val); |
| assert((val >= 0 && val < shape[idx]) && |
| "delinearization is out of bounds"); |
| linearIndex %= strides[idx]; |
| } |
| // Sanity check. |
| assert(linearIndex == 0 && "linear index constructed from shape must " |
| "have 0 remainder after delinearization"); |
| return res; |
| } |
| |
| static Instruction * |
| instantiate(FuncBuilder *b, Instruction *opInst, VectorType hwVectorType, |
| DenseMap<const Value *, Value *> *substitutionsMap); |
| |
| /// Not all Values belong to a program slice scoped within the immediately |
| /// enclosing loop. |
| /// One simple example is constants defined outside the innermost loop scope. |
| /// For such cases the substitutionsMap has no entry and we allow an additional |
| /// insertion. |
| /// For now, this is limited to ConstantOp because we do not vectorize loop |
| /// indices and will need to be extended in the future. |
| /// |
| /// If substitution fails, returns nullptr. |
| static Value *substitute(Value *v, VectorType hwVectorType, |
| DenseMap<const Value *, Value *> *substitutionsMap) { |
| auto it = substitutionsMap->find(v); |
| if (it == substitutionsMap->end()) { |
| auto *opInst = v->getDefiningInst(); |
| if (opInst->isa<ConstantOp>()) { |
| FuncBuilder b(opInst); |
| auto *inst = instantiate(&b, opInst, hwVectorType, substitutionsMap); |
| auto res = |
| substitutionsMap->insert(std::make_pair(v, inst->getResult(0))); |
| assert(res.second && "Insertion failed"); |
| return res.first->second; |
| } |
| v->getDefiningInst()->emitError("Missing substitution"); |
| return nullptr; |
| } |
| return it->second; |
| } |
| |
| /// Returns a list of single result AffineApplyOps that reindex the |
| /// `memRefIndices` by the multi-dimensional `hwVectorInstance`. This is used by |
| /// the function that materializes a vector_transfer operation to use hardware |
| /// vector types instead of super-vector types. |
| /// |
| /// The general problem this function solves is as follows: |
| /// Assume a vector_transfer operation at the super-vector granularity that has |
| /// `l` enclosing loops (AffineForOp). Assume the vector transfer operation |
| /// operates on a MemRef of rank `r`, a super-vector of rank `s` and a hardware |
| /// vector of rank `h`. For the purpose of illustration assume l==4, r==3, s==2, |
| /// h==1 and that the super-vector is vector<3x32xf32> and the hardware vector |
| /// is vector<8xf32>. Assume the following MLIR snippet after |
| /// super-vectorization has been applied: |
| /// |
| /// ```mlir |
| /// for %i0 = 0 to %M { |
| /// for %i1 = 0 to %N step 3 { |
| /// for %i2 = 0 to %O { |
| /// for %i3 = 0 to %P step 32 { |
| /// %r = vector_transfer_read(%A, map(%i..)#0, map(%i..)#1, map(%i..)#2) |
| /// -> vector<3x32xf32> |
| /// ... |
| /// }}}} |
| /// ``` |
| /// |
| /// where map denotes an AffineMap operating on enclosing loops with properties |
| /// compatible for vectorization (i.e. some contiguity left unspecified here). |
| /// Note that the vectorized loops are %i1 and %i3. |
| /// This function translates the vector_transfer_read operation to multiple |
| /// instances of vector_transfer_read that operate on vector<8x32>. |
| /// |
| /// Without loss of generality, we assume hwVectorInstance is: {2, 1}. |
| /// The only constraints on hwVectorInstance is they belong to: |
| /// [0, 2] x [0, 3], which is the span of ratio of super-vector shape to |
| /// hardware vector shape in our example. |
| /// |
| /// This function instantiates the iteration <2, 1> of vector_transfer_read |
| /// into the set of operations in pseudo-MLIR: |
| /// |
| /// ```mlir |
| /// map2 = (d0, d1, d2, d3) -> (d0, d1 + 2, d2, d3 + 1 * 8) |
| /// map3 = map o map2 // where o denotes composition |
| /// %r = vector_transfer_read(%A, map3(%i..)#0, map3(%i..)#1, map3(%i..)#2) |
| /// -> vector<3x32xf32> |
| /// ``` |
| /// |
| /// Practical considerations |
| /// ======================== |
| /// For now, `map` is assumed to be the identity map and the indices are |
| /// specified just as vector_transfer_read(%A, %i0, %i1, %i2, %i3). This will be |
| /// extended in the future once we have a proper Op for vector transfers. |
| /// Additionally, the example above is specified in pseudo-MLIR form; once we |
| /// have proper support for generic maps we can generate the code and show |
| /// actual MLIR. |
| /// |
| /// TODO(ntv): support a concrete AffineMap and compose with it. |
| /// TODO(ntv): these implementation details should be captured in a |
| /// vectorization trait at the op level directly. |
| static SmallVector<mlir::Value *, 8> |
| reindexAffineIndices(FuncBuilder *b, VectorType hwVectorType, |
| ArrayRef<unsigned> hwVectorInstance, |
| ArrayRef<Value *> memrefIndices) { |
| auto vectorShape = hwVectorType.getShape(); |
| assert(hwVectorInstance.size() >= vectorShape.size()); |
| |
| unsigned numIndices = memrefIndices.size(); |
| auto numMemRefIndices = numIndices - hwVectorInstance.size(); |
| auto numSuperVectorIndices = hwVectorInstance.size() - vectorShape.size(); |
| |
| SmallVector<AffineExpr, 8> affineExprs; |
| // TODO(ntv): support a concrete map and composition. |
| unsigned i = 0; |
| // The first numMemRefIndices correspond to AffineForOp that have not been |
| // vectorized, the transformation is the identity on those. |
| for (i = 0; i < numMemRefIndices; ++i) { |
| auto d_i = b->getAffineDimExpr(i); |
| affineExprs.push_back(d_i); |
| } |
| // The next numSuperVectorIndices correspond to super-vector dimensions that |
| // do not have a hardware vector dimension counterpart. For those we only |
| // need to increment the index by the corresponding hwVectorInstance. |
| for (i = numMemRefIndices; i < numMemRefIndices + numSuperVectorIndices; |
| ++i) { |
| auto d_i = b->getAffineDimExpr(i); |
| auto offset = hwVectorInstance[i - numMemRefIndices]; |
| affineExprs.push_back(d_i + offset); |
| } |
| // The remaining indices correspond to super-vector dimensions that |
| // have a hardware vector dimension counterpart. For those we to increment the |
| // index by "hwVectorInstance" multiples of the corresponding hardware |
| // vector size. |
| for (; i < numIndices; ++i) { |
| auto d_i = b->getAffineDimExpr(i); |
| auto offset = hwVectorInstance[i - numMemRefIndices]; |
| auto stride = vectorShape[i - numMemRefIndices - numSuperVectorIndices]; |
| affineExprs.push_back(d_i + offset * stride); |
| } |
| |
| // Create a bunch of single result AffineApplyOp. |
| SmallVector<mlir::Value *, 8> res; |
| res.reserve(affineExprs.size()); |
| for (auto expr : affineExprs) { |
| auto map = AffineMap::get(numIndices, 0, expr, {}); |
| res.push_back(makeComposedAffineApply(b, b->getInsertionPoint()->getLoc(), |
| map, memrefIndices)); |
| } |
| return res; |
| } |
| |
| /// Returns attributes with the following substitutions applied: |
| /// - constant splat is replaced by constant splat of `hwVectorType`. |
| /// TODO(ntv): add more substitutions on a per-need basis. |
| static SmallVector<NamedAttribute, 1> |
| materializeAttributes(Instruction *opInst, VectorType hwVectorType) { |
| SmallVector<NamedAttribute, 1> res; |
| for (auto a : opInst->getAttrs()) { |
| if (auto splat = a.second.dyn_cast<SplatElementsAttr>()) { |
| auto attr = SplatElementsAttr::get(hwVectorType, splat.getValue()); |
| res.push_back(NamedAttribute(a.first, attr)); |
| } else { |
| res.push_back(a); |
| } |
| } |
| return res; |
| } |
| |
| /// Creates an instantiated version of `opInst`. |
| /// Ops other than VectorTransferReadOp/VectorTransferWriteOp require no |
| /// affine reindexing. Just substitute their Value operands and be done. For |
| /// this case the actual instance is irrelevant. Just use the values in |
| /// substitutionsMap. |
| /// |
| /// If the underlying substitution fails, this fails too and returns nullptr. |
| static Instruction * |
| instantiate(FuncBuilder *b, Instruction *opInst, VectorType hwVectorType, |
| DenseMap<const Value *, Value *> *substitutionsMap) { |
| assert(!opInst->isa<VectorTransferReadOp>() && |
| "Should call the function specialized for VectorTransferReadOp"); |
| assert(!opInst->isa<VectorTransferWriteOp>() && |
| "Should call the function specialized for VectorTransferWriteOp"); |
| if (opInst->getNumBlockLists() != 0) |
| return nullptr; |
| |
| bool fail = false; |
| auto operands = map( |
| [hwVectorType, substitutionsMap, &fail](Value *v) -> Value * { |
| auto *res = |
| fail ? nullptr : substitute(v, hwVectorType, substitutionsMap); |
| fail |= !res; |
| return res; |
| }, |
| opInst->getOperands()); |
| if (fail) |
| return nullptr; |
| |
| auto attrs = materializeAttributes(opInst, hwVectorType); |
| |
| OperationState state(b->getContext(), opInst->getLoc(), |
| opInst->getName().getStringRef(), operands, |
| {hwVectorType}, attrs); |
| return b->createOperation(state); |
| } |
| |
| /// Computes the permutationMap required for a VectorTransferOp from the memref |
| /// to the `hwVectorType`. |
| /// This is achieved by returning the projection of the permutationMap along the |
| /// dimensions of the super-vector type that remain in the hwVectorType. |
| /// In particular, if a dimension is fully instantiated (i.e. unrolled) then it |
| /// is projected out in the final result. |
| template <typename VectorTransferOpTy> |
| static AffineMap projectedPermutationMap(VectorTransferOpTy *transfer, |
| VectorType hwVectorType) { |
| static_assert( |
| std::is_same<VectorTransferOpTy, VectorTransferReadOp>::value || |
| std::is_same<VectorTransferOpTy, VectorTransferWriteOp>::value, |
| "Must be called on a VectorTransferOp"); |
| auto superVectorType = transfer->getVectorType(); |
| auto optionalRatio = shapeRatio(superVectorType, hwVectorType); |
| assert(optionalRatio && |
| (optionalRatio->size() == superVectorType.getShape().size()) && |
| "Shape and ratio not of the same size"); |
| unsigned dim = 0; |
| SmallVector<AffineExpr, 4> keep; |
| MLIRContext *context = transfer->getInstruction()->getContext(); |
| functional::zipApply( |
| [&dim, &keep, context](int64_t shape, int64_t ratio) { |
| assert(shape >= ratio && "shape dim must be greater than ratio dim"); |
| if (shape != ratio) { |
| // HW vector is not full instantiated along this dim, keep it. |
| keep.push_back(getAffineDimExpr(dim, context)); |
| } |
| ++dim; |
| }, |
| superVectorType.getShape(), *optionalRatio); |
| auto permutationMap = transfer->getPermutationMap(); |
| LLVM_DEBUG(permutationMap.print(dbgs() << "\npermutationMap: ")); |
| if (keep.empty()) { |
| return permutationMap; |
| } |
| auto projectionMap = AffineMap::get(optionalRatio->size(), 0, keep, {}); |
| LLVM_DEBUG(projectionMap.print(dbgs() << "\nprojectionMap: ")); |
| return simplifyAffineMap(projectionMap.compose(permutationMap)); |
| } |
| |
| /// Creates an instantiated version of `read` for the instance of |
| /// `hwVectorInstance` when lowering from a super-vector type to |
| /// `hwVectorType`. `hwVectorInstance` represents one particular instance of |
| /// `hwVectorType` int the covering of the super-vector type. For a more |
| /// detailed description of the problem, see the description of |
| /// reindexAffineIndices. |
| static Instruction * |
| instantiate(FuncBuilder *b, VectorTransferReadOp *read, VectorType hwVectorType, |
| ArrayRef<unsigned> hwVectorInstance, |
| DenseMap<const Value *, Value *> *substitutionsMap) { |
| SmallVector<Value *, 8> indices = |
| map(makePtrDynCaster<Value>(), read->getIndices()); |
| auto affineIndices = |
| reindexAffineIndices(b, hwVectorType, hwVectorInstance, indices); |
| auto map = projectedPermutationMap(read, hwVectorType); |
| if (!map) { |
| return nullptr; |
| } |
| auto cloned = b->create<VectorTransferReadOp>( |
| read->getLoc(), hwVectorType, read->getMemRef(), affineIndices, map, |
| read->getPaddingValue()); |
| return cloned->getInstruction(); |
| } |
| |
| /// Creates an instantiated version of `write` for the instance of |
| /// `hwVectorInstance` when lowering from a super-vector type to |
| /// `hwVectorType`. `hwVectorInstance` represents one particular instance of |
| /// `hwVectorType` int the covering of th3e super-vector type. For a more |
| /// detailed description of the problem, see the description of |
| /// reindexAffineIndices. |
| static Instruction * |
| instantiate(FuncBuilder *b, VectorTransferWriteOp *write, |
| VectorType hwVectorType, ArrayRef<unsigned> hwVectorInstance, |
| DenseMap<const Value *, Value *> *substitutionsMap) { |
| SmallVector<Value *, 8> indices = |
| map(makePtrDynCaster<Value>(), write->getIndices()); |
| auto affineIndices = |
| reindexAffineIndices(b, hwVectorType, hwVectorInstance, indices); |
| auto cloned = b->create<VectorTransferWriteOp>( |
| write->getLoc(), |
| substitute(write->getVector(), hwVectorType, substitutionsMap), |
| write->getMemRef(), affineIndices, |
| projectedPermutationMap(write, hwVectorType)); |
| return cloned->getInstruction(); |
| } |
| |
| /// Returns `true` if inst instance is properly cloned and inserted, false |
| /// otherwise. |
| /// The multi-dimensional `hwVectorInstance` belongs to the shapeRatio of |
| /// super-vector type to hw vector type. |
| /// A cloned instance of `inst` is formed as follows: |
| /// 1. vector_transfer_read: the return `superVectorType` is replaced by |
| /// `hwVectorType`. Additionally, affine indices are reindexed with |
| /// `reindexAffineIndices` using `hwVectorInstance` and vector type |
| /// information; |
| /// 2. vector_transfer_write: the `valueToStore` type is simply substituted. |
| /// Since we operate on a topologically sorted slice, a substitution must |
| /// have been registered for non-constant ops. Additionally, affine indices |
| /// are reindexed in the same way as for vector_transfer_read; |
| /// 3. constant ops are splats of the super-vector type by construction. |
| /// They are cloned to a splat on the hw vector type with the same value; |
| /// 4. remaining ops are cloned to version of the op that returns a hw vector |
| /// type, all operands are substituted according to `substitutions`. Thanks |
| /// to the topological order of a slice, the substitution is always |
| /// possible. |
| /// |
| /// Returns true on failure. |
| static bool instantiateMaterialization(Instruction *inst, |
| MaterializationState *state) { |
| LLVM_DEBUG(dbgs() << "\ninstantiate: " << *inst); |
| |
| // Create a builder here for unroll-and-jam effects. |
| FuncBuilder b(inst); |
| // AffineApplyOp are ignored: instantiating the proper vector op will take |
| // care of AffineApplyOps by composing them properly. |
| if (inst->isa<AffineApplyOp>()) { |
| return false; |
| } |
| if (inst->getNumBlockLists() != 0) |
| return inst->emitError("NYI path Op with region"); |
| |
| if (auto write = inst->dyn_cast<VectorTransferWriteOp>()) { |
| auto *clone = instantiate(&b, write, state->hwVectorType, |
| state->hwVectorInstance, state->substitutionsMap); |
| return clone == nullptr; |
| } |
| if (auto read = inst->dyn_cast<VectorTransferReadOp>()) { |
| auto *clone = instantiate(&b, read, state->hwVectorType, |
| state->hwVectorInstance, state->substitutionsMap); |
| if (!clone) { |
| return true; |
| } |
| state->substitutionsMap->insert( |
| std::make_pair(read->getResult(), clone->getResult(0))); |
| return false; |
| } |
| // The only op with 0 results reaching this point must, by construction, be |
| // VectorTransferWriteOps and have been caught above. Ops with >= 2 results |
| // are not yet supported. So just support 1 result. |
| if (inst->getNumResults() != 1) { |
| return inst->emitError("NYI: ops with != 1 results"); |
| } |
| if (inst->getResult(0)->getType() != state->superVectorType) { |
| return inst->emitError("Op does not return a supervector."); |
| } |
| auto *clone = |
| instantiate(&b, inst, state->hwVectorType, state->substitutionsMap); |
| if (!clone) { |
| return true; |
| } |
| state->substitutionsMap->insert( |
| std::make_pair(inst->getResult(0), clone->getResult(0))); |
| return false; |
| } |
| |
| /// Takes a slice and rewrites the operations in it so that occurrences |
| /// of `superVectorType` are replaced by `hwVectorType`. |
| /// |
| /// Implementation |
| /// ============== |
| /// 1. computes the shape ratio of super-vector to HW vector shapes. This |
| /// gives for each op in the slice, how many instantiations are required |
| /// in each dimension; |
| /// 2. performs the concrete materialization. Note that in a first |
| /// implementation we use full unrolling because it pragmatically removes |
| /// the need to explicitly materialize an AllocOp. Thanks to the properties |
| /// of super-vectors, this unrolling is always possible and simple: |
| /// vectorizing to a super-vector abstraction already achieved the |
| /// equivalent of loop strip-mining + loop sinking and encoded this in the |
| /// vector type. |
| /// |
| /// Returns true on failure. |
| /// |
| /// TODO(ntv): materialized allocs. |
| /// TODO(ntv): full loops + materialized allocs. |
| /// TODO(ntv): partial unrolling + materialized allocs. |
| static bool emitSlice(MaterializationState *state, |
| SetVector<Instruction *> *slice) { |
| auto ratio = shapeRatio(state->superVectorType, state->hwVectorType); |
| assert(ratio.hasValue() && |
| "ratio of super-vector to HW-vector shape is not integral"); |
| // The number of integer points in a hyperrectangular region is: |
| // shape[0] * strides[0]. |
| auto numValueToUnroll = (*ratio)[0] * makeStrides(*ratio)[0]; |
| // Full unrolling to hardware vectors in a first approximation. |
| for (unsigned idx = 0; idx < numValueToUnroll; ++idx) { |
| // Fresh RAII instanceIndices and substitutionsMap. |
| MaterializationState scopedState = *state; |
| scopedState.hwVectorInstance = delinearize(idx, *ratio); |
| DenseMap<const Value *, Value *> substitutionMap; |
| scopedState.substitutionsMap = &substitutionMap; |
| // slice are topologically sorted, we can just clone them in order. |
| for (auto *inst : *slice) { |
| auto fail = instantiateMaterialization(inst, &scopedState); |
| if (fail) { |
| inst->emitError("Unhandled super-vector materialization failure"); |
| return true; |
| } |
| } |
| } |
| |
| LLVM_DEBUG(dbgs() << "\nMLFunction is now\n"); |
| LLVM_DEBUG((*slice)[0]->getFunction()->print(dbgs())); |
| |
| // slice are topologically sorted, we can just erase them in reverse |
| // order. Reverse iterator does not just work simply with an operator* |
| // dereference. |
| for (int idx = slice->size() - 1; idx >= 0; --idx) { |
| LLVM_DEBUG(dbgs() << "\nErase: "); |
| LLVM_DEBUG((*slice)[idx]->print(dbgs())); |
| (*slice)[idx]->erase(); |
| } |
| return false; |
| } |
| |
| /// Materializes super-vector types into concrete hw vector types as follows: |
| /// 1. start from super-vector terminators (current vector_transfer_write |
| /// ops); |
| /// 2. collect all the instructions that can be reached by transitive use-defs |
| /// chains; |
| /// 3. get the superVectorType for this particular terminator and the |
| /// corresponding hardware vector type (for now limited to F32) |
| /// TODO(ntv): be more general than F32. |
| /// 4. emit the transitive useDef set to operate on the finer-grain vector |
| /// types. |
| /// |
| /// Notes |
| /// ===== |
| /// The `slice` is sorted in topological order by construction. |
| /// Additionally, this set is limited to instructions in the same lexical scope |
| /// because we currently disallow vectorization of defs that come from another |
| /// scope. |
| /// TODO(ntv): please document return value. |
| static bool materialize(Function *f, |
| const SetVector<Instruction *> &terminators, |
| MaterializationState *state) { |
| DenseSet<Instruction *> seen; |
| DominanceInfo domInfo(f); |
| for (auto *term : terminators) { |
| // Short-circuit test, a given terminator may have been reached by some |
| // other previous transitive use-def chains. |
| if (seen.count(term) > 0) { |
| continue; |
| } |
| |
| auto terminator = term->cast<VectorTransferWriteOp>(); |
| LLVM_DEBUG(dbgs() << "\nFrom terminator:" << *term); |
| |
| // Get the transitive use-defs starting from terminator, limited to the |
| // current enclosing scope of the terminator. See the top of the function |
| // Note for the justification of this restriction. |
| // TODO(ntv): relax scoping constraints. |
| auto *enclosingScope = term->getParentInst(); |
| auto keepIfInSameScope = [enclosingScope, &domInfo](Instruction *inst) { |
| assert(inst && "NULL inst"); |
| if (!enclosingScope) { |
| // by construction, everyone is always under the top scope (null scope). |
| return true; |
| } |
| return domInfo.properlyDominates(enclosingScope, inst); |
| }; |
| SetVector<Instruction *> slice = |
| getSlice(term, keepIfInSameScope, keepIfInSameScope); |
| assert(!slice.empty()); |
| |
| // Sanity checks: transitive slice must be completely disjoint from |
| // what we have seen so far. |
| LLVM_DEBUG(dbgs() << "\nTransitive use-defs:"); |
| for (auto *ud : slice) { |
| LLVM_DEBUG(dbgs() << "\nud:" << *ud); |
| assert(seen.count(ud) == 0 && |
| "Transitive use-defs not disjoint from already seen"); |
| seen.insert(ud); |
| } |
| |
| // Emit the current slice. |
| // Set scoped super-vector and corresponding hw vector types. |
| state->superVectorType = terminator->getVectorType(); |
| assert((state->superVectorType.getElementType() == |
| FloatType::getF32(term->getContext())) && |
| "Only f32 supported for now"); |
| state->hwVectorType = VectorType::get( |
| state->hwVectorSize, state->superVectorType.getElementType()); |
| auto fail = emitSlice(state, &slice); |
| if (fail) { |
| return true; |
| } |
| LLVM_DEBUG(dbgs() << "\nMLFunction is now\n"); |
| LLVM_DEBUG(f->print(dbgs())); |
| } |
| return false; |
| } |
| |
| PassResult MaterializeVectorsPass::runOnFunction(Function *f) { |
| // Thread-safe RAII local context, BumpPtrAllocator freed on exit. |
| NestedPatternContext mlContext; |
| |
| // TODO(ntv): Check to see if this supports arbitrary top-level code. |
| if (f->getBlocks().size() != 1) |
| return success(); |
| |
| using matcher::Op; |
| LLVM_DEBUG(dbgs() << "\nMaterializeVectors on Function\n"); |
| LLVM_DEBUG(f->print(dbgs())); |
| |
| MaterializationState state; |
| // Get the hardware vector type. |
| // TODO(ntv): get elemental type from super-vector type rather than force f32. |
| auto subVectorType = |
| VectorType::get(state.hwVectorSize, FloatType::getF32(f->getContext())); |
| |
| // Capture terminators; i.e. vector_transfer_write ops involving a strict |
| // super-vector of subVectorType. |
| auto filter = [subVectorType](const Instruction &inst) { |
| if (!inst.isa<VectorTransferWriteOp>()) { |
| return false; |
| } |
| return matcher::operatesOnSuperVectors(inst, subVectorType); |
| }; |
| auto pat = Op(filter); |
| SmallVector<NestedMatch, 8> matches; |
| pat.match(f, &matches); |
| SetVector<Instruction *> terminators; |
| for (auto m : matches) { |
| terminators.insert(m.getMatchedInstruction()); |
| } |
| |
| auto fail = materialize(f, terminators, &state); |
| return fail ? PassResult::Failure : PassResult::Success; |
| } |
| |
| FunctionPass *mlir::createMaterializeVectorsPass() { |
| return new MaterializeVectorsPass(); |
| } |
| |
| static PassRegistration<MaterializeVectorsPass> |
| pass("materialize-vectors", "Materializes super-vectors to vectors of the " |
| "proper size for the hardware"); |
| |
| #undef DEBUG_TYPE |