| //===- Vectorize.cpp - Vectorize Pass Impl ----------------------*- C++ -*-===// |
| // |
| // Copyright 2019 The MLIR Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // ============================================================================= |
| // |
| // This file implements vectorization of loops, operations and data types to |
| // a target-independent, n-D super-vector abstraction. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/AffineOps/AffineOps.h" |
| #include "mlir/Analysis/LoopAnalysis.h" |
| #include "mlir/Analysis/NestedMatcher.h" |
| #include "mlir/Analysis/VectorAnalysis.h" |
| #include "mlir/IR/AffineExpr.h" |
| #include "mlir/IR/Builders.h" |
| #include "mlir/IR/BuiltinOps.h" |
| #include "mlir/IR/Location.h" |
| #include "mlir/IR/Types.h" |
| #include "mlir/Pass/Pass.h" |
| #include "mlir/StandardOps/StandardOps.h" |
| #include "mlir/SuperVectorOps/SuperVectorOps.h" |
| #include "mlir/Support/Functional.h" |
| #include "mlir/Support/LLVM.h" |
| #include "mlir/Transforms/Passes.h" |
| |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/DenseSet.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/ADT/SmallString.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/raw_ostream.h" |
| |
| using namespace mlir; |
| |
| /// |
| /// Implements a high-level vectorization strategy on a Function. |
/// The abstraction used is that of super-vectors, which encode, in a single,
/// compact representation (the vector type), information that is expected to
/// reduce the impact of the phase-ordering problem.
| /// |
| /// Vector granularity: |
| /// =================== |
| /// This pass is designed to perform vectorization at a super-vector |
| /// granularity. A super-vector is loosely defined as a vector type that is a |
| /// multiple of a "good" vector size so the HW can efficiently implement a set |
| /// of high-level primitives. Multiple is understood along any dimension; e.g. |
| /// both vector<16xf32> and vector<2x8xf32> are valid super-vectors for a |
| /// vector<8xf32> HW vector. Note that a "good vector size so the HW can |
| /// efficiently implement a set of high-level primitives" is not necessarily an |
| /// integer multiple of actual hardware registers. We leave details of this |
| /// distinction unspecified for now. |
| /// |
/// Some may prefer the terminology of a "tile of HW vectors". In this case, one
| /// should note that super-vectors implement an "always full tile" abstraction. |
| /// They guarantee no partial-tile separation is necessary by relying on a |
| /// high-level copy-reshape abstraction that we call vector_transfer. This |
/// copy-reshape operation is also responsible for performing layout
| /// transposition if necessary. In the general case this will require a scoped |
| /// allocation in some notional local memory. |
| /// |
| /// Whatever the mental model one prefers to use for this abstraction, the key |
/// point is that we burn into a single, compact representation (the vector
/// type) information that is expected to reduce the impact of the phase
| /// ordering problem. Indeed, a vector type conveys information that: |
| /// 1. the associated loops have dependency semantics that do not prevent |
| /// vectorization; |
///  2. the associated loops have been sliced in chunks of static sizes that are
| /// compatible with vector sizes (i.e. similar to unroll-and-jam); |
///  3. the inner loops, in the unroll-and-jam analogy of 2, are captured by
///     the vector type and no vectorization-hampering transformations can be
///     applied to them anymore;
| /// 4. the underlying memrefs are accessed in some notional contiguous way |
| /// that allows loading into vectors with some amount of spatial locality; |
| /// In other words, super-vectorization provides a level of separation of |
| /// concern by way of opacity to subsequent passes. This has the effect of |
| /// encapsulating and propagating vectorization constraints down the list of |
| /// passes until we are ready to lower further. |
| /// |
| /// For a particular target, a notion of minimal n-d vector size will be |
| /// specified and vectorization targets a multiple of those. In the following |
| /// paragraph, let "k ." represent "a multiple of", to be understood as a |
| /// multiple in the same dimension (e.g. vector<16 x k . 128> summarizes |
| /// vector<16 x 128>, vector<16 x 256>, vector<16 x 1024>, etc). |
| /// |
| /// Some non-exhaustive notable super-vector sizes of interest include: |
| /// - CPU: vector<k . HW_vector_size>, |
| /// vector<k' . core_count x k . HW_vector_size>, |
| /// vector<socket_count x k' . core_count x k . HW_vector_size>; |
| /// - GPU: vector<k . warp_size>, |
| /// vector<k . warp_size x float2>, |
| /// vector<k . warp_size x float4>, |
///        vector<k . warp_size x 4 x 4 x 4> (for tensor_core sizes).
| /// |
| /// Loops and operations are emitted that operate on those super-vector shapes. |
| /// Subsequent lowering passes will materialize to actual HW vector sizes. These |
| /// passes are expected to be (gradually) more target-specific. |
| /// |
| /// At a high level, a vectorized load in a loop will resemble: |
| /// ```mlir |
| /// for %i = ? to ? step ? { |
| /// %v_a = "vector_transfer_read" (A, %i) : (memref<?xf32>, index) -> |
| /// vector<128xf32> |
| /// } |
| /// ``` |
/// It is the responsibility of the implementation of the vector_transfer_read
| /// to materialize vector registers from the original scalar memrefs. |
| /// A later (more target-dependent) lowering pass will materialize to actual HW |
/// vector sizes. This lowering may occur at different times:
///   1. at the MLIR level into a combination of loops, unrolling, DmaStartOp +
///      DmaWaitOp + vectorized operations for data transformations and
///      shuffles; thus opening opportunities for unrolling and pipelining.
///      This is an instance of library call "whiteboxing"; or
///   2. later, in a target-specific lowering pass or a hand-written library
///      call, achieving full separation of concerns. This is an instance of
///      a library call; or
| /// 3. a mix of both, e.g. based on a model. |
| /// In the future, these operations will expose a contract to constrain the |
| /// search on vectorization patterns and sizes. |
| /// |
| /// Occurrence of super-vectorization in the compiler flow: |
| /// ======================================================= |
| /// This is an active area of investigation. We start with 2 remarks to position |
| /// super-vectorization in the context of existing ongoing work: LLVM VPLAN |
| /// and LLVM SLP Vectorizer. |
| /// |
| /// LLVM VPLAN: |
| /// ----------- |
| /// The astute reader may have noticed that in the limit, super-vectorization |
/// can be applied at a similar time and with similar objectives as VPLAN.
/// For instance, it could be applied late in a traditional polyhedral
/// compilation flow (the PPCG project, for example, uses ISL to provide
/// dependence analysis, multi-level scheduling + tiling, lifting of footprints
/// to fast memory, communication synthesis, mapping and register
/// optimizations), just before unrolling. When vectorization is applied at
/// this *late* level in a typical
| /// polyhedral flow, and is instantiated with actual hardware vector sizes, |
| /// super-vectorization is expected to match (or subsume) the type of patterns |
| /// that LLVM's VPLAN aims at targeting. The main difference here is that MLIR |
| /// is higher level and our implementation should be significantly simpler. Also |
| /// note that in this mode, recursive patterns are probably a bit of an overkill |
| /// although it is reasonable to expect that mixing a bit of outer loop and |
| /// inner loop vectorization + unrolling will provide interesting choices to |
| /// MLIR. |
| /// |
| /// LLVM SLP Vectorizer: |
| /// -------------------- |
| /// Super-vectorization however is not meant to be usable in a similar fashion |
| /// to the SLP vectorizer. The main difference lies in the information that |
| /// both vectorizers use: super-vectorization examines contiguity of memory |
| /// references along fastest varying dimensions and loops with recursive nested |
| /// patterns capturing imperfectly-nested loop nests; the SLP vectorizer, on |
| /// the other hand, performs flat pattern matching inside a single unrolled loop |
| /// body and stitches together pieces of load and store instructions into full |
| /// 1-D vectors. We envision that the SLP vectorizer is a good way to capture |
| /// innermost loop, control-flow dependent patterns that super-vectorization may |
| /// not be able to capture easily. In other words, super-vectorization does not |
| /// aim at replacing the SLP vectorizer and the two solutions are complementary. |
| /// |
| /// Ongoing investigations: |
| /// ----------------------- |
| /// We discuss the following *early* places where super-vectorization is |
/// applicable and touch on the expected benefits and risks. We list the
| /// opportunities in the context of the traditional polyhedral compiler flow |
| /// described in PPCG. There are essentially 6 places in the MLIR pass pipeline |
| /// we expect to experiment with super-vectorization: |
| /// 1. Right after language lowering to MLIR: this is the earliest time where |
| /// super-vectorization is expected to be applied. At this level, all the |
| /// language/user/library-level annotations are available and can be fully |
| /// exploited. Examples include loop-type annotations (such as parallel, |
| /// reduction, scan, dependence distance vector, vectorizable) as well as |
///    memory access annotations (such as guaranteed non-aliasing writes,
///    indirect accesses that are permutations by construction, or that a
///    particular operation is prescribed atomic by the user). At this
| /// level, anything that enriches what dependence analysis can do should be |
| /// aggressively exploited. At this level we are close to having explicit |
| /// vector types in the language, except we do not impose that burden on the |
| /// programmer/library: we derive information from scalar code + annotations. |
| /// 2. After dependence analysis and before polyhedral scheduling: the |
| /// information that supports vectorization does not need to be supplied by a |
///    higher level of abstraction. Traditional dependence analysis is available
| /// in MLIR and will be used to drive vectorization and cost models. |
| /// |
| /// Let's pause here and remark that applying super-vectorization as described |
| /// in 1. and 2. presents clear opportunities and risks: |
| /// - the opportunity is that vectorization is burned in the type system and |
| /// is protected from the adverse effect of loop scheduling, tiling, loop |
| /// interchange and all passes downstream. Provided that subsequent passes are |
///   able to operate on vector types, the vector shapes, associated loop
| /// iterator properties, alignment, and contiguity of fastest varying |
| /// dimensions are preserved until we lower the super-vector types. We expect |
///   this to significantly rein in the adverse effects of phase ordering.
| /// - the risks are that a. all passes after super-vectorization have to work |
///   on elemental vector types (note that this is always true, wherever
| /// vectorization is applied) and b. that imposing vectorization constraints |
| /// too early may be overall detrimental to loop fusion, tiling and other |
| /// transformations because the dependence distances are coarsened when |
| /// operating on elemental vector types. For this reason, the pattern |
| /// profitability analysis should include a component that also captures the |
| /// maximal amount of fusion available under a particular pattern. This is |
///   still at the stage of rough ideas but in this context, search is our
| /// friend as the Tensor Comprehensions and auto-TVM contributions |
| /// demonstrated previously. |
/// The bottom line is that we do not yet have good answers for the above, but
/// we aim at making it easy to answer such questions.
| /// |
| /// Back to our listing, the last places where early super-vectorization makes |
| /// sense are: |
| /// 3. right after polyhedral-style scheduling: PLUTO-style algorithms are known |
| /// to improve locality, parallelism and be configurable (e.g. max-fuse, |
| /// smart-fuse etc). They can also have adverse effects on contiguity |
| /// properties that are required for vectorization but the vector_transfer |
| /// copy-reshape-pad-transpose abstraction is expected to help recapture |
| /// these properties. |
| /// 4. right after polyhedral-style scheduling+tiling; |
| /// 5. right after scheduling+tiling+rescheduling: points 4 and 5 represent |
| /// probably the most promising places because applying tiling achieves a |
| /// separation of concerns that allows rescheduling to worry less about |
| /// locality and more about parallelism and distribution (e.g. min-fuse). |
| /// |
| /// At these levels the risk-reward looks different: on one hand we probably |
| /// lost a good deal of language/user/library-level annotation; on the other |
| /// hand we gained parallelism and locality through scheduling and tiling. |
| /// However we probably want to ensure tiling is compatible with the |
| /// full-tile-only abstraction used in super-vectorization or suffer the |
| /// consequences. It is too early to place bets on what will win but we expect |
| /// super-vectorization to be the right abstraction to allow exploring at all |
| /// these levels. And again, search is our friend. |
| /// |
| /// Lastly, we mention it again here: |
| /// 6. as a MLIR-based alternative to VPLAN. |
| /// |
| /// Lowering, unrolling, pipelining: |
| /// ================================ |
| /// TODO(ntv): point to the proper places. |
| /// |
| /// Algorithm: |
| /// ========== |
| /// The algorithm proceeds in a few steps: |
| /// 1. defining super-vectorization patterns and matching them on the tree of |
| /// AffineForOp. A super-vectorization pattern is defined as a recursive |
///     data structure that matches and captures nested, imperfectly-nested
///     loops that have a. conformable loop annotations attached (e.g. parallel,
///     reduction, vectorizable, ...) as well as b. all contiguous load/store
///     operations along a specified minor dimension (not necessarily the
///     fastest varying);
| /// 2. analyzing those patterns for profitability (TODO(ntv): and |
| /// interference); |
| /// 3. Then, for each pattern in order: |
| /// a. applying iterative rewriting of the loop and the load operations in |
| /// DFS postorder. Rewriting is implemented by coarsening the loops and |
| /// turning load operations into opaque vector_transfer_read ops; |
| /// b. keeping track of the load operations encountered as "roots" and the |
| /// store operations as "terminators"; |
| /// c. traversing the use-def chains starting from the roots and iteratively |
| /// propagating vectorized values. Scalar values that are encountered |
| /// during this process must come from outside the scope of the current |
| /// pattern (TODO(ntv): enforce this and generalize). Such a scalar value |
| /// is vectorized only if it is a constant (into a vector splat). The |
| /// non-constant case is not supported for now and results in the pattern |
| /// failing to vectorize; |
| /// d. performing a second traversal on the terminators (store ops) to |
///       rewrite the scalar value they write to memory into vector form.
| /// If the scalar value has been vectorized previously, we simply replace |
| /// it by its vector form. Otherwise, if the scalar value is a constant, |
| /// it is vectorized into a splat. In all other cases, vectorization for |
| /// the pattern currently fails. |
| /// e. if everything under the root AffineForOp in the current pattern |
| /// vectorizes properly, we commit that loop to the IR. Otherwise we |
| /// discard it and restore a previously cloned version of the loop. Thanks |
| /// to the recursive scoping nature of matchers and captured patterns, |
| /// this is transparently achieved by a simple RAII implementation. |
| /// f. vectorization is applied on the next pattern in the list. Because |
///       pattern interference avoidance is not yet implemented and we do not
///       support further vectorizing an already vectorized load, we need to
| /// re-verify that the pattern is still vectorizable. This is expected to |
| /// make cost models more difficult to write and is subject to improvement |
| /// in the future. |
| /// |
| /// Points c. and d. above are worth additional comment. In most passes that |
| /// do not change the type of operands, it is usually preferred to eagerly |
| /// `replaceAllUsesWith`. Unfortunately this does not work for vectorization |
| /// because during the use-def chain traversal, all the operands of an operation |
| /// must be available in vector form. Trying to propagate eagerly makes the IR |
| /// temporarily invalid and results in errors such as: |
| /// `vectorize.mlir:308:13: error: 'addf' op requires the same type for all |
| /// operands and results |
| /// %s5 = addf %a5, %b5 : f32` |
| /// |
| /// Lastly, we show a minimal example for which use-def chains rooted in load / |
| /// vector_transfer_read are not enough. This is what motivated splitting |
| /// terminator processing out of the use-def chains starting from loads. In the |
/// following snippet, there is simply no load:
| /// ```mlir |
| /// mlfunc @fill(%A : memref<128xf32>) -> () { |
| /// %f1 = constant 1.0 : f32 |
| /// for %i0 = 0 to 32 { |
| /// store %f1, %A[%i0] : memref<128xf32, 0> |
| /// } |
| /// return |
| /// } |
| /// ``` |
| /// |
| /// Choice of loop transformation to support the algorithm: |
| /// ======================================================= |
| /// The choice of loop transformation to apply for coarsening vectorized loops |
| /// is still subject to exploratory tradeoffs. In particular, say we want to |
/// vectorize by a factor of 128; then we want to transform the following input:
| /// ```mlir |
| /// for %i = %M to %N { |
| /// %a = load A[%i] : memref<?xf32> |
| /// } |
| /// ``` |
| /// |
| /// Traditionally, one would vectorize late (after scheduling, tiling, |
/// memory promotion, etc.), say after stripmining (and potentially unrolling
/// in the case of LLVM's SLP vectorizer):
| /// ```mlir |
| /// for %i = floor(%M, 128) to ceil(%N, 128) { |
| /// for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) { |
| /// %a = load A[%ii] : memref<?xf32> |
| /// } |
| /// } |
| /// ``` |
| /// |
| /// Instead, we seek to vectorize early and freeze vector types before |
| /// scheduling, so we want to generate a pattern that resembles: |
| /// ```mlir |
| /// for %i = ? to ? step ? { |
| /// %v_a = "vector_transfer_read" (A, %i) : (memref<?xf32>, index) -> |
| /// vector<128xf32> |
| /// } |
| /// ``` |
| /// |
/// Two ways of encoding this coarsening come to mind:
///  i. simply dividing the lower / upper bounds by 128 creates issues
| /// when representing expressions such as ii + 1 because now we only |
| /// have access to original values that have been divided. Additional |
| /// information is needed to specify accesses at below-128 granularity; |
| /// ii. another alternative is to coarsen the loop step but this may have |
| /// consequences on dependence analysis and fusability of loops: fusable |
| /// loops probably need to have the same step (because we don't want to |
| /// stripmine/unroll to enable fusion). |
| /// As a consequence, we choose to represent the coarsening using the loop |
| /// step for now and reevaluate in the future. Note that we can renormalize |
| /// loop steps later if/when we have evidence that they are problematic. |
| /// |
| /// For the simple strawman example above, vectorizing for a 1-D vector |
| /// abstraction of size 128 returns code similar to: |
| /// ```mlir |
| /// for %i = %M to %N step 128 { |
| /// %v_a = "vector_transfer_read" (A, %i) : (memref<?xf32>, index) -> |
| /// vector<128xf32> |
| /// } |
| /// ``` |
| /// |
| /// Unsupported cases, extensions, and work in progress (help welcome :-) ): |
| /// ======================================================================== |
| /// 1. lowering to concrete vector types for various HW; |
| /// 2. reduction support; |
/// 3. non-effecting padding during vector_transfer_read and filtering during
| /// vector_transfer_write; |
/// 4. misalignment support in vector_transfer_read / vector_transfer_write
| /// (hopefully without read-modify-writes); |
| /// 5. control-flow support; |
| /// 6. cost-models, heuristics and search; |
| /// 7. Op implementation, extensions and implication on memref views; |
| /// 8. many TODOs left around. |
| /// |
| /// Examples: |
| /// ========= |
| /// Consider the following Function: |
| /// ```mlir |
| /// mlfunc @vector_add_2d(%M : index, %N : index) -> f32 { |
| /// %A = alloc (%M, %N) : memref<?x?xf32, 0> |
| /// %B = alloc (%M, %N) : memref<?x?xf32, 0> |
| /// %C = alloc (%M, %N) : memref<?x?xf32, 0> |
| /// %f1 = constant 1.0 : f32 |
| /// %f2 = constant 2.0 : f32 |
| /// for %i0 = 0 to %M { |
| /// for %i1 = 0 to %N { |
| /// // non-scoped %f1 |
| /// store %f1, %A[%i0, %i1] : memref<?x?xf32, 0> |
| /// } |
| /// } |
| /// for %i2 = 0 to %M { |
| /// for %i3 = 0 to %N { |
| /// // non-scoped %f2 |
| /// store %f2, %B[%i2, %i3] : memref<?x?xf32, 0> |
| /// } |
| /// } |
| /// for %i4 = 0 to %M { |
| /// for %i5 = 0 to %N { |
| /// %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0> |
| /// %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0> |
| /// %s5 = addf %a5, %b5 : f32 |
| /// // non-scoped %f1 |
| /// %s6 = addf %s5, %f1 : f32 |
| /// // non-scoped %f2 |
| /// %s7 = addf %s5, %f2 : f32 |
| /// // diamond dependency. |
| /// %s8 = addf %s7, %s6 : f32 |
| /// store %s8, %C[%i4, %i5] : memref<?x?xf32, 0> |
| /// } |
| /// } |
| /// %c7 = constant 7 : index |
| /// %c42 = constant 42 : index |
| /// %res = load %C[%c7, %c42] : memref<?x?xf32, 0> |
| /// return %res : f32 |
| /// } |
| /// ``` |
| /// |
| /// TODO(ntv): update post b/119731251. |
| /// The -vectorize pass with the following arguments: |
| /// ``` |
| /// -vectorize -virtual-vector-size 256 --test-fastest-varying=0 |
| /// ``` |
| /// |
| /// produces this standard innermost-loop vectorized code: |
| /// ```mlir |
| /// mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { |
| /// %0 = alloc(%arg0, %arg1) : memref<?x?xf32> |
| /// %1 = alloc(%arg0, %arg1) : memref<?x?xf32> |
| /// %2 = alloc(%arg0, %arg1) : memref<?x?xf32> |
| /// %cst = constant 1.0 : f32 |
| /// %cst_0 = constant 2.0 : f32 |
| /// for %i0 = 0 to %arg0 { |
| /// for %i1 = 0 to %arg1 step 256 { |
| /// %cst_1 = constant splat<vector<256xf32>, 1.0> : |
| /// vector<256xf32> |
| /// "vector_transfer_write"(%cst_1, %0, %i0, %i1) : |
| /// (vector<256xf32>, memref<?x?xf32>, index, index) -> () |
| /// } |
| /// } |
| /// for %i2 = 0 to %arg0 { |
| /// for %i3 = 0 to %arg1 step 256 { |
| /// %cst_2 = constant splat<vector<256xf32>, 2.0> : |
| /// vector<256xf32> |
| /// "vector_transfer_write"(%cst_2, %1, %i2, %i3) : |
| /// (vector<256xf32>, memref<?x?xf32>, index, index) -> () |
| /// } |
| /// } |
| /// for %i4 = 0 to %arg0 { |
| /// for %i5 = 0 to %arg1 step 256 { |
| /// %3 = "vector_transfer_read"(%0, %i4, %i5) : |
| /// (memref<?x?xf32>, index, index) -> vector<256xf32> |
| /// %4 = "vector_transfer_read"(%1, %i4, %i5) : |
| /// (memref<?x?xf32>, index, index) -> vector<256xf32> |
| /// %5 = addf %3, %4 : vector<256xf32> |
| /// %cst_3 = constant splat<vector<256xf32>, 1.0> : |
| /// vector<256xf32> |
| /// %6 = addf %5, %cst_3 : vector<256xf32> |
| /// %cst_4 = constant splat<vector<256xf32>, 2.0> : |
| /// vector<256xf32> |
| /// %7 = addf %5, %cst_4 : vector<256xf32> |
| /// %8 = addf %7, %6 : vector<256xf32> |
| /// "vector_transfer_write"(%8, %2, %i4, %i5) : |
| /// (vector<256xf32>, memref<?x?xf32>, index, index) -> () |
| /// } |
| /// } |
| /// %c7 = constant 7 : index |
| /// %c42 = constant 42 : index |
| /// %9 = load %2[%c7, %c42] : memref<?x?xf32> |
| /// return %9 : f32 |
| /// } |
| /// ``` |
| /// |
| /// TODO(ntv): update post b/119731251. |
| /// The -vectorize pass with the following arguments: |
| /// ``` |
| /// -vectorize -virtual-vector-size 32 -virtual-vector-size 256 |
| /// --test-fastest-varying=1 --test-fastest-varying=0 |
| /// ``` |
| /// |
/// produces this more interesting mixed outer-innermost-loop vectorized code:
| /// ```mlir |
| /// mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { |
| /// %0 = alloc(%arg0, %arg1) : memref<?x?xf32> |
| /// %1 = alloc(%arg0, %arg1) : memref<?x?xf32> |
| /// %2 = alloc(%arg0, %arg1) : memref<?x?xf32> |
| /// %cst = constant 1.0 : f32 |
| /// %cst_0 = constant 2.0 : f32 |
| /// for %i0 = 0 to %arg0 step 32 { |
| /// for %i1 = 0 to %arg1 step 256 { |
| /// %cst_1 = constant splat<vector<32x256xf32>, 1.0> : |
| /// vector<32x256xf32> |
| /// "vector_transfer_write"(%cst_1, %0, %i0, %i1) : |
| /// (vector<32x256xf32>, memref<?x?xf32>, index, index) -> () |
| /// } |
| /// } |
| /// for %i2 = 0 to %arg0 step 32 { |
| /// for %i3 = 0 to %arg1 step 256 { |
| /// %cst_2 = constant splat<vector<32x256xf32>, 2.0> : |
| /// vector<32x256xf32> |
| /// "vector_transfer_write"(%cst_2, %1, %i2, %i3) : |
| /// (vector<32x256xf32>, memref<?x?xf32>, index, index) -> () |
| /// } |
| /// } |
| /// for %i4 = 0 to %arg0 step 32 { |
| /// for %i5 = 0 to %arg1 step 256 { |
| /// %3 = "vector_transfer_read"(%0, %i4, %i5) : |
| /// (memref<?x?xf32>, index, index) -> vector<32x256xf32> |
| /// %4 = "vector_transfer_read"(%1, %i4, %i5) : |
| /// (memref<?x?xf32>, index, index) -> vector<32x256xf32> |
| /// %5 = addf %3, %4 : vector<32x256xf32> |
| /// %cst_3 = constant splat<vector<32x256xf32>, 1.0> : |
| /// vector<32x256xf32> |
| /// %6 = addf %5, %cst_3 : vector<32x256xf32> |
| /// %cst_4 = constant splat<vector<32x256xf32>, 2.0> : |
| /// vector<32x256xf32> |
| /// %7 = addf %5, %cst_4 : vector<32x256xf32> |
| /// %8 = addf %7, %6 : vector<32x256xf32> |
| /// "vector_transfer_write"(%8, %2, %i4, %i5) : |
| /// (vector<32x256xf32>, memref<?x?xf32>, index, index) -> () |
| /// } |
| /// } |
| /// %c7 = constant 7 : index |
| /// %c42 = constant 42 : index |
| /// %9 = load %2[%c7, %c42] : memref<?x?xf32> |
| /// return %9 : f32 |
| /// } |
| /// ``` |
| /// |
| /// Of course, much more intricate n-D imperfectly-nested patterns can be |
| /// vectorized too and specified in a fully declarative fashion. |
| |
| #define DEBUG_TYPE "early-vect" |
| |
| using functional::apply; |
| using functional::makePtrDynCaster; |
| using functional::map; |
| using functional::ScopeGuard; |
| using llvm::dbgs; |
| using llvm::DenseSet; |
| using llvm::SetVector; |
| |
| static llvm::cl::OptionCategory clOptionsCategory("vectorize options"); |
| |
| static llvm::cl::list<int> clVirtualVectorSize( |
| "virtual-vector-size", |
| llvm::cl::desc("Specify n-D virtual vector size for early vectorization"), |
| llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory)); |
| |
| static llvm::cl::list<int> clFastestVaryingPattern( |
| "test-fastest-varying", |
| llvm::cl::desc( |
| "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory" |
| " dimensions to match. See defaultPatterns in Vectorize.cpp for a" |
| " description and examples. This is used for testing purposes"), |
| llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory)); |
| |
| /// Forward declaration. |
| static FilterFunctionType |
| isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension); |
| |
/// Builds a bunch of predetermined patterns that will be traversed in order.
/// Due to the recursive nature of NestedPatterns, this captures
/// arbitrarily nested pairs of loops at any position in the tree.
/// Note that this currently matches at most 3 nested loops and will be
/// extended.
| // TODO(ntv): support 3-D loop patterns with a common reduction loop that can |
| // be matched to GEMMs. |
| static std::vector<NestedPattern> defaultPatterns() { |
| using matcher::For; |
| return std::vector<NestedPattern>{ |
| // 3-D patterns |
| For(isVectorizableLoopPtrFactory(2), |
| For(isVectorizableLoopPtrFactory(1), |
| For(isVectorizableLoopPtrFactory(0)))), |
| // for i { for j { A[??f(not i, not j), f(i, not j), f(not i, j)];}} |
| // test independently with: |
| // --test-fastest-varying=1 --test-fastest-varying=0 |
| For(isVectorizableLoopPtrFactory(1), |
| For(isVectorizableLoopPtrFactory(0))), |
| // for i { for j { A[??f(not i, not j), f(i, not j), ?, f(not i, j)];}} |
| // test independently with: |
| // --test-fastest-varying=2 --test-fastest-varying=0 |
| For(isVectorizableLoopPtrFactory(2), |
| For(isVectorizableLoopPtrFactory(0))), |
| // for i { for j { A[??f(not i, not j), f(i, not j), ?, ?, f(not i, j)];}} |
| // test independently with: |
| // --test-fastest-varying=3 --test-fastest-varying=0 |
| For(isVectorizableLoopPtrFactory(3), |
| For(isVectorizableLoopPtrFactory(0))), |
| // for i { for j { A[??f(not i, not j), f(not i, j), f(i, not j)];}} |
| // test independently with: |
| // --test-fastest-varying=0 --test-fastest-varying=1 |
| For(isVectorizableLoopPtrFactory(0), |
| For(isVectorizableLoopPtrFactory(1))), |
| // for i { for j { A[??f(not i, not j), f(not i, j), ?, f(i, not j)];}} |
| // test independently with: |
| // --test-fastest-varying=0 --test-fastest-varying=2 |
| For(isVectorizableLoopPtrFactory(0), |
| For(isVectorizableLoopPtrFactory(2))), |
| // for i { for j { A[??f(not i, not j), f(not i, j), ?, ?, f(i, not j)];}} |
| // test independently with: |
| // --test-fastest-varying=0 --test-fastest-varying=3 |
| For(isVectorizableLoopPtrFactory(0), |
| For(isVectorizableLoopPtrFactory(3))), |
| // for i { A[??f(not i) , f(i)];} |
| // test independently with: --test-fastest-varying=0 |
| For(isVectorizableLoopPtrFactory(0)), |
| // for i { A[??f(not i) , f(i), ?];} |
| // test independently with: --test-fastest-varying=1 |
| For(isVectorizableLoopPtrFactory(1)), |
| // for i { A[??f(not i) , f(i), ?, ?];} |
| // test independently with: --test-fastest-varying=2 |
| For(isVectorizableLoopPtrFactory(2)), |
| // for i { A[??f(not i) , f(i), ?, ?, ?];} |
| // test independently with: --test-fastest-varying=3 |
| For(isVectorizableLoopPtrFactory(3))}; |
| } |
| |
| /// Creates a vectorization pattern from the command line arguments. |
| /// Up to 3-D patterns are supported. |
| /// If the command line argument requests a pattern of higher order, returns an |
| /// empty pattern list which will conservatively result in no vectorization. |
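/// For example, running with:
///   -vectorize --test-fastest-varying=1 --test-fastest-varying=0
/// yields the single pattern:
///   For(isVectorizableLoopPtrFactory(1), For(isVectorizableLoopPtrFactory(0)))
/// i.e. 2-D loop nests whose outer loop accesses vary along the
/// second-fastest-varying memref dimension and whose inner loop accesses vary
/// along the fastest-varying one.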
| static std::vector<NestedPattern> makePatterns() { |
| using matcher::For; |
| if (clFastestVaryingPattern.empty()) { |
| return defaultPatterns(); |
| } |
| switch (clFastestVaryingPattern.size()) { |
| case 1: |
| return {For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]))}; |
| case 2: |
| return {For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]), |
| For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[1])))}; |
| case 3: |
| return {For( |
| isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]), |
| For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[1]), |
| For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[2]))))}; |
| default: |
| return std::vector<NestedPattern>(); |
| } |
| } |
| |
| namespace { |
| |
| struct Vectorize : public FunctionPass { |
| Vectorize() : FunctionPass(&Vectorize::passID) {} |
| |
| PassResult runOnFunction(Function *f) override; |
| |
| constexpr static PassID passID = {}; |
| }; |
| |
| } // end anonymous namespace |
| |
| /////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate. ////// |
| namespace { |
| |
| struct VectorizationStrategy { |
| SmallVector<int64_t, 8> vectorSizes; |
| DenseMap<Instruction *, unsigned> loopToVectorDim; |
| }; |
| |
| } // end anonymous namespace |
| |
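/// Maps `loop` to the vector dimension it should be vectorized along, provided
/// the pattern is deep enough relative to the number of specified vector
/// sizes. For example, with patternDepth == 3 and 2 vector sizes, the
/// outermost matched loop (depthInPattern == 0) is left scalar, the middle
/// loop maps to vector dimension 0 and the innermost loop to vector
/// dimension 1.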
| static void vectorizeLoopIfProfitable(Instruction *loop, |
| unsigned depthInPattern, |
| unsigned patternDepth, |
| VectorizationStrategy *strategy) { |
| assert(patternDepth > depthInPattern && |
| "patternDepth is greater than depthInPattern"); |
| if (patternDepth - depthInPattern > strategy->vectorSizes.size()) { |
| // Don't vectorize this loop |
| return; |
| } |
| strategy->loopToVectorDim[loop] = |
| strategy->vectorSizes.size() - (patternDepth - depthInPattern); |
| } |
| |
| /// Implements a simple strawman strategy for vectorization. |
| /// Given a matched pattern `matches` of depth `patternDepth`, this strategy |
| /// greedily assigns the fastest varying dimension ** of the vector ** to the |
| /// innermost loop in the pattern. |
| /// When coupled with a pattern that looks for the fastest varying dimension in |
| /// load/store MemRefs, this creates a generic vectorization strategy that works |
| /// for any loop in a hierarchy (outermost, innermost or intermediate). |
| /// |
| /// TODO(ntv): In the future we should additionally increase the power of the |
| /// profitability analysis along 3 directions: |
| /// 1. account for loop extents (both static and parametric + annotations); |
| /// 2. account for data layout permutations; |
| /// 3. account for impact of vectorization on maximal loop fusion. |
| /// Then we can quantify the above to build a cost model and search over |
| /// strategies. |
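/// For example, with vectorSizes = {32, 256} and a matched 2-D pattern, the
/// innermost matched loop is assigned the fastest varying vector dimension
/// (256) and its parent loop the next one (32), consistent with the
/// vector_add_2d example in the file header.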
| static bool analyzeProfitability(ArrayRef<NestedMatch> matches, |
| unsigned depthInPattern, unsigned patternDepth, |
| VectorizationStrategy *strategy) { |
| for (auto m : matches) { |
| bool fail = analyzeProfitability(m.getMatchedChildren(), depthInPattern + 1, |
| patternDepth, strategy); |
| if (fail) { |
| return fail; |
| } |
| vectorizeLoopIfProfitable(m.getMatchedInstruction(), depthInPattern, |
| patternDepth, strategy); |
| } |
| return false; |
| } |
| |
| ///// end TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate ///// |
| |
| namespace { |
| |
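/// Bookkeeping gathered while vectorizing a single pattern: the mapping from
/// scalar instructions/values to their vectorized counterparts, the root
/// loads, the terminator stores and the list of scalar instructions to erase
/// once the pattern succeeds.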
| struct VectorizationState { |
| /// Adds an entry of pre/post vectorization instructions in the state. |
| void registerReplacement(Instruction *key, Instruction *value); |
| /// When the current vectorization pattern is successful, this erases the |
| /// instructions that were marked for erasure in the proper order and resets |
| /// the internal state for the next pattern. |
| void finishVectorizationPattern(); |
| |
| // In-order tracking of original Instruction that have been vectorized. |
| // Erase in reverse order. |
| SmallVector<Instruction *, 16> toErase; |
| // Set of Instruction that have been vectorized (the values in the |
| // vectorizationMap for hashed access). The vectorizedSet is used in |
| // particular to filter the instructions that have already been vectorized by |
| // this pattern, when iterating over nested loops in this pattern. |
| DenseSet<Instruction *> vectorizedSet; |
| // Map of old scalar Instruction to new vectorized Instruction. |
| DenseMap<Instruction *, Instruction *> vectorizationMap; |
| // Map of old scalar Value to new vectorized Value. |
| DenseMap<const Value *, Value *> replacementMap; |
| // The strategy drives which loop to vectorize by which amount. |
| const VectorizationStrategy *strategy; |
| // Use-def roots. These represent the starting points for the worklist in the |
| // vectorizeOperations function. They consist of the subset of load operations |
| // that have been vectorized. They can be retrieved from `vectorizationMap` |
| // but it is convenient to keep track of them in a separate data structure. |
| DenseSet<Instruction *> roots; |
| // Terminator instructions for the worklist in the vectorizeOperations |
| // function. They consist of the subset of store operations that have been |
| // vectorized. They can be retrieved from `vectorizationMap` but it is |
| // convenient to keep track of them in a separate data structure. Since they |
| // do not necessarily belong to use-def chains starting from loads (e.g |
| // storing a constant), we need to handle them in a post-pass. |
| DenseSet<Instruction *> terminators; |
| // Checks that the type of `inst` is StoreOp and adds it to the terminators |
| // set. |
| void registerTerminator(Instruction *inst); |
| |
| private: |
| void registerReplacement(const Value *key, Value *value); |
| }; |
| |
| } // end namespace |
| |
| void VectorizationState::registerReplacement(Instruction *key, |
| Instruction *value) { |
| LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op: "); |
| LLVM_DEBUG(key->print(dbgs())); |
| LLVM_DEBUG(dbgs() << " into "); |
| LLVM_DEBUG(value->print(dbgs())); |
  assert(key->getNumResults() == 1 && "expected single-result key");
  assert(value->getNumResults() == 1 && "expected single-result value");
| assert(vectorizedSet.count(value) == 0 && "already registered"); |
| assert(vectorizationMap.count(key) == 0 && "already registered"); |
| toErase.push_back(key); |
| vectorizedSet.insert(value); |
| vectorizationMap.insert(std::make_pair(key, value)); |
| registerReplacement(key->getResult(0), value->getResult(0)); |
| if (key->isa<LoadOp>()) { |
| assert(roots.count(key) == 0 && "root was already inserted previously"); |
| roots.insert(key); |
| } |
| } |
| |
| void VectorizationState::registerTerminator(Instruction *inst) { |
| assert(inst->isa<StoreOp>() && "terminator must be a StoreOp"); |
| assert(terminators.count(inst) == 0 && |
| "terminator was already inserted previously"); |
| terminators.insert(inst); |
| } |
| |
| void VectorizationState::finishVectorizationPattern() { |
| while (!toErase.empty()) { |
| auto *inst = toErase.pop_back_val(); |
| LLVM_DEBUG(dbgs() << "\n[early-vect] finishVectorizationPattern erase: "); |
| LLVM_DEBUG(inst->print(dbgs())); |
| inst->erase(); |
| } |
| } |
| |
| void VectorizationState::registerReplacement(const Value *key, Value *value) { |
| assert(replacementMap.count(key) == 0 && "replacement already registered"); |
| replacementMap.insert(std::make_pair(key, value)); |
| } |
| |
| ////// TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. //// |
| |
| /// Handles the vectorization of load and store MLIR operations. |
| /// |
| /// LoadOp operations are the roots of the vectorizeOperations call. They are |
| /// vectorized immediately. The resulting vector_transfer_read is immediately |
| /// registered to replace all uses of the LoadOp in this pattern's scope. |
| /// |
| /// StoreOp are the terminators of the vectorizeOperations call. They need |
| /// to be vectorized late once all the use-def chains have been traversed. |
| /// Additionally, they may have ssa-values operands which come from outside |
| /// the scope of the current pattern. |
| /// Such special cases force us to delay the vectorization of the stores |
| /// until the last step. Here we merely register the store operation. |
| template <typename LoadOrStoreOpPointer> |
| static bool vectorizeRootOrTerminal(Value *iv, LoadOrStoreOpPointer memoryOp, |
| VectorizationState *state) { |
| auto memRefType = |
| memoryOp->getMemRef()->getType().template cast<MemRefType>(); |
| |
| auto elementType = memRefType.getElementType(); |
| // TODO(ntv): ponder whether we want to further vectorize a vector value. |
| assert(VectorType::isValidElementType(elementType) && |
| "Not a valid vector element type"); |
| auto vectorType = VectorType::get(state->strategy->vectorSizes, elementType); |
| |
| // Materialize a MemRef with 1 vector. |
| auto *opInst = memoryOp->getInstruction(); |
| // For now, vector_transfers must be aligned, operate only on indices with an |
| // identity subset of AffineMap and do not change layout. |
  // TODO(ntv): increase the expressive power of vector_transfer operations
| // as needed by various targets. |
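  // For example (a sketch): a 2-D pattern vectorizing both dimensions of a
  // 2-D access such as %A[%i4, %i5] is expected to yield the identity
  // permutation map (d0, d1) -> (d0, d1) below.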
| if (opInst->template isa<LoadOp>()) { |
| auto permutationMap = |
| makePermutationMap(opInst, state->strategy->loopToVectorDim); |
| LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); |
| LLVM_DEBUG(permutationMap.print(dbgs())); |
| FuncBuilder b(opInst); |
| auto transfer = b.create<VectorTransferReadOp>( |
| opInst->getLoc(), vectorType, memoryOp->getMemRef(), |
| map(makePtrDynCaster<Value>(), memoryOp->getIndices()), permutationMap); |
| state->registerReplacement(opInst, transfer->getInstruction()); |
| } else { |
| state->registerTerminator(opInst); |
| } |
| return false; |
| } |
| /// end TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. /// |
| |
/// Coarsens the loop step and transforms all remaining load and store
/// operations into the appropriate vector_transfer.
| static bool vectorizeAffineForOp(AffineForOp *loop, int64_t step, |
| VectorizationState *state) { |
| using namespace functional; |
| loop->setStep(step); |
| |
| FilterFunctionType notVectorizedThisPattern = |
| [state](const Instruction &inst) { |
| if (!matcher::isLoadOrStore(inst)) { |
| return false; |
| } |
| return state->vectorizationMap.count(&inst) == 0 && |
| state->vectorizedSet.count(&inst) == 0 && |
| state->roots.count(&inst) == 0 && |
| state->terminators.count(&inst) == 0; |
| }; |
| auto loadAndStores = matcher::Op(notVectorizedThisPattern); |
| SmallVector<NestedMatch, 8> loadAndStoresMatches; |
| loadAndStores.match(loop->getInstruction(), &loadAndStoresMatches); |
| for (auto ls : loadAndStoresMatches) { |
| auto *opInst = ls.getMatchedInstruction(); |
| auto load = opInst->dyn_cast<LoadOp>(); |
| auto store = opInst->dyn_cast<StoreOp>(); |
| LLVM_DEBUG(opInst->print(dbgs())); |
| auto fail = |
| load ? vectorizeRootOrTerminal(loop->getInductionVar(), load, state) |
| : vectorizeRootOrTerminal(loop->getInductionVar(), store, state); |
| if (fail) { |
| return fail; |
| } |
| } |
| return false; |
| } |
| |
| /// Returns a FilterFunctionType that can be used in NestedPattern to |
| /// match a loop whose underlying load/store accesses are all varying along the |
| /// `fastestVaryingMemRefDimension`. |
| /// TODO(ntv): In the future, allow more interesting mixed layout permutation |
| /// once we understand better the performance implications and we are confident |
| /// we can build a cost model and a search procedure. |
| static FilterFunctionType |
| isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension) { |
| return [fastestVaryingMemRefDimension](const Instruction &forInst) { |
| auto loop = forInst.cast<AffineForOp>(); |
| return isVectorizableLoopAlongFastestVaryingMemRefDim( |
| loop, fastestVaryingMemRefDimension); |
| }; |
| } |
| |
| /// Forward-declaration. |
| static bool vectorizeNonRoot(ArrayRef<NestedMatch> matches, |
| VectorizationState *state); |
| |
| /// Apply vectorization of `loop` according to `state`. This is only triggered |
| /// if all vectorizations in `childrenMatches` have already succeeded |
| /// recursively in DFS post-order. |
| static bool doVectorize(NestedMatch oneMatch, VectorizationState *state) { |
| auto *loopInst = oneMatch.getMatchedInstruction(); |
| auto loop = loopInst->cast<AffineForOp>(); |
| auto childrenMatches = oneMatch.getMatchedChildren(); |
| |
| // 1. DFS postorder recursion, if any of my children fails, I fail too. |
| auto fail = vectorizeNonRoot(childrenMatches, state); |
| if (fail) { |
| // Early exit and trigger RAII cleanups at the root. |
| return fail; |
| } |
| |
  // 2. This loop may have been omitted from vectorization for various reasons
  // (e.g. due to the performance model or because the pattern depth exceeds
  // the number of specified vector sizes).
| auto it = state->strategy->loopToVectorDim.find(loopInst); |
| if (it == state->strategy->loopToVectorDim.end()) { |
| return false; |
| } |
| |
| // 3. Actual post-order transformation. |
| auto vectorDim = it->second; |
| assert(vectorDim < state->strategy->vectorSizes.size() && |
| "vector dim overflow"); |
| // a. get actual vector size |
| auto vectorSize = state->strategy->vectorSizes[vectorDim]; |
| // b. loop transformation for early vectorization is still subject to |
| // exploratory tradeoffs (see top of the file). Apply coarsening, i.e.: |
| // | ub -> ub |
| // | step -> step * vectorSize |
| LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize |
| << " : "); |
| LLVM_DEBUG(loopInst->print(dbgs())); |
| return vectorizeAffineForOp(loop, loop->getStep() * vectorSize, state); |
| } |
| |
| /// Non-root pattern iterates over the matches at this level, calls doVectorize |
| /// and exits early if anything below fails. |
| static bool vectorizeNonRoot(ArrayRef<NestedMatch> matches, |
| VectorizationState *state) { |
| for (auto m : matches) { |
| auto fail = doVectorize(m, state); |
| if (fail) { |
| // Early exit and trigger RAII cleanups at the root. |
| return fail; |
| } |
| } |
| return false; |
| } |
| |
| /// Tries to transform a scalar constant into a vector splat of that constant. |
| /// Returns the vectorized splat operation if the constant is a valid vector |
| /// element type. |
| /// If `type` is not a valid vector type or if the scalar constant is not a |
| /// valid vector element type, returns nullptr. |
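/// For example (a sketch), vectorizing:
///   %f1 = constant 1.0 : f32
/// with `type` == vector<128xf32> is expected to produce:
///   %vf1 = constant splat<vector<128xf32>, 1.0> : vector<128xf32>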
| static Value *vectorizeConstant(Instruction *inst, const ConstantOp &constant, |
| Type type) { |
| if (!type || !type.isa<VectorType>() || |
| !VectorType::isValidElementType(constant.getType())) { |
| return nullptr; |
| } |
| FuncBuilder b(inst); |
| Location loc = inst->getLoc(); |
| auto vectorType = type.cast<VectorType>(); |
| auto attr = SplatElementsAttr::get(vectorType, constant.getValue()); |
| auto *constantOpInst = constant.getInstruction(); |
| |
| OperationState state( |
| b.getContext(), loc, constantOpInst->getName().getStringRef(), {}, |
| {vectorType}, |
| {std::make_pair(Identifier::get("value", b.getContext()), attr)}); |
| |
| return b.createOperation(state)->getResult(0); |
| } |
| |
| /// Returns a uniqu'ed VectorType. |
/// In the case where `v`'s defining instruction is already part of the `state`'s
| /// vectorizedSet, just returns the type of `v`. |
| /// Otherwise, constructs a new VectorType of shape defined by `state.strategy` |
| /// and of elemental type the type of `v`. |
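/// For example, assuming strategy vectorSizes = {32, 256}, an f32 scalar value
/// yields vector<32x256xf32>, while a value already produced by a vectorized
/// instruction keeps its existing vector type.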
| static Type getVectorType(Value *v, const VectorizationState &state) { |
| if (!VectorType::isValidElementType(v->getType())) { |
| return Type(); |
| } |
| if (state.vectorizedSet.count(v->getDefiningInst()) > 0) { |
| return v->getType().cast<VectorType>(); |
| } |
| return VectorType::get(state.strategy->vectorSizes, v->getType()); |
| }; |
| |
| /// Tries to vectorize a given operand `op` of Instruction `inst` during |
| /// def-chain propagation or during terminator vectorization, by applying the |
| /// following logic: |
| /// 1. if the defining instruction is part of the vectorizedSet (i.e. vectorized |
///    by use-def propagation), `op` is already in the proper vector form;
| /// 2. otherwise, the `op` may be in some other vector form that fails to |
| /// vectorize atm (i.e. broadcasting required), returns nullptr to indicate |
| /// failure; |
| /// 3. if the `op` is a constant, returns the vectorized form of the constant; |
| /// 4. non-constant scalars are currently non-vectorizable, in particular to |
| /// guard against vectorizing an index which may be loop-variant and needs |
| /// special handling. |
| /// |
| /// In particular this logic captures some of the use cases where definitions |
| /// that are not scoped under the current pattern are needed to vectorize. |
| /// One such example is top level function constants that need to be splatted. |
| /// |
| /// Returns an operand that has been vectorized to match `state`'s strategy if |
| /// vectorization is possible with the above logic. Returns nullptr otherwise. |
| /// |
| /// TODO(ntv): handle more complex cases. |
| static Value *vectorizeOperand(Value *operand, Instruction *inst, |
| VectorizationState *state) { |
| LLVM_DEBUG(dbgs() << "\n[early-vect]vectorize operand: "); |
| LLVM_DEBUG(operand->print(dbgs())); |
| // 1. If this value has already been vectorized this round, we are done. |
| if (state->vectorizedSet.count(operand->getDefiningInst()) > 0) { |
| LLVM_DEBUG(dbgs() << " -> already vector operand"); |
| return operand; |
| } |
| // 1.b. Delayed on-demand replacement of a use. |
| // Note that we cannot just call replaceAllUsesWith because it may result |
| // in ops with mixed types, for ops whose operands have not all yet |
| // been vectorized. This would be invalid IR. |
| auto it = state->replacementMap.find(operand); |
| if (it != state->replacementMap.end()) { |
| auto *res = it->second; |
| LLVM_DEBUG(dbgs() << "-> delayed replacement by: "); |
| LLVM_DEBUG(res->print(dbgs())); |
| return res; |
| } |
| // 2. TODO(ntv): broadcast needed. |
| if (operand->getType().isa<VectorType>()) { |
| LLVM_DEBUG(dbgs() << "-> non-vectorizable"); |
| return nullptr; |
| } |
  // 3. vectorize constant. Note that `operand` may have no defining
  // instruction at all (e.g. an induction variable), in which case it falls
  // through to the non-vectorizable case below.
  if (auto *defInst = operand->getDefiningInst()) {
    if (auto constant = defInst->dyn_cast<ConstantOp>()) {
      return vectorizeConstant(inst, *constant,
                               getVectorType(operand, *state));
    }
  }
| // 4. currently non-vectorizable. |
| LLVM_DEBUG(dbgs() << "-> non-vectorizable"); |
| LLVM_DEBUG(operand->print(dbgs())); |
| return nullptr; |
| }; |
| |
| /// Encodes Instruction-specific behavior for vectorization. In general we |
| /// assume that all operands of an op must be vectorized but this is not always |
| /// true. In the future, it would be nice to have a trait that describes how a |
| /// particular operation vectorizes. For now we implement the case distinction |
| /// here. |
| /// Returns a vectorized form of an operation or nullptr if vectorization fails. |
| /// TODO(ntv): consider adding a trait to Op to describe how it gets vectorized. |
| /// Maybe some Ops are not vectorizable or require some tricky logic, we cannot |
| /// do one-off logic here; ideally it would be TableGen'd. |
| static Instruction *vectorizeOneInstruction(FuncBuilder *b, Instruction *opInst, |
| VectorizationState *state) { |
| // Sanity checks. |
| assert(!opInst->isa<LoadOp>() && |
| "all loads must have already been fully vectorized independently"); |
| assert(!opInst->isa<VectorTransferReadOp>() && |
| "vector_transfer_read cannot be further vectorized"); |
| assert(!opInst->isa<VectorTransferWriteOp>() && |
| "vector_transfer_write cannot be further vectorized"); |
| |
| if (auto store = opInst->dyn_cast<StoreOp>()) { |
| auto *memRef = store->getMemRef(); |
| auto *value = store->getValueToStore(); |
| auto *vectorValue = vectorizeOperand(value, opInst, state); |
| auto indices = map(makePtrDynCaster<Value>(), store->getIndices()); |
| FuncBuilder b(opInst); |
| auto permutationMap = |
| makePermutationMap(opInst, state->strategy->loopToVectorDim); |
| LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); |
| LLVM_DEBUG(permutationMap.print(dbgs())); |
| auto transfer = b.create<VectorTransferWriteOp>( |
| opInst->getLoc(), vectorValue, memRef, indices, permutationMap); |
| auto *res = transfer->getInstruction(); |
| LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << *res); |
| // "Terminators" (i.e. StoreOps) are erased on the spot. |
| opInst->erase(); |
| return res; |
| } |
| if (opInst->getNumBlockLists() != 0) |
| return nullptr; |
| |
| auto types = map([state](Value *v) { return getVectorType(v, *state); }, |
| opInst->getResults()); |
| auto vectorizeOneOperand = [opInst, state](Value *op) -> Value * { |
| return vectorizeOperand(op, opInst, state); |
| }; |
| auto operands = map(vectorizeOneOperand, opInst->getOperands()); |
| // Check whether a single operand is null. If so, vectorization failed. |
| bool success = llvm::all_of(operands, [](Value *op) { return op; }); |
| if (!success) { |
| LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize"); |
| return nullptr; |
| } |
| |
| // Create a clone of the op with the proper operands and return types. |
| // TODO(ntv): The following assumes there is always an op with a fixed |
| // name that works both in scalar mode and vector mode. |
| // TODO(ntv): Is it worth considering an Instruction.clone operation |
| // which changes the type so we can promote an Instruction with less |
| // boilerplate? |
| OperationState newOp(b->getContext(), opInst->getLoc(), |
| opInst->getName().getStringRef(), operands, types, |
| opInst->getAttrs(), /*successors=*/{}, |
| /*numBlockLists=*/0, opInst->hasResizableOperandsList()); |
| return b->createOperation(newOp); |
| } |
| |
/// Iterates over the Instructions in the loop and rewrites them using their
/// vectorized counterparts by:
///   1. iteratively building a worklist of uses of the Instructions vectorized
///      so far by this pattern;
| /// 2. for each Instruction in the worklist, create the vector form of this |
| /// operation and replace all its uses by the vectorized form. For this step, |
| /// the worklist must be traversed in order; |
| /// 3. verify that all operands of the newly vectorized operation have been |
| /// vectorized by this pattern. |
| static bool vectorizeOperations(VectorizationState *state) { |
| // 1. create initial worklist with the uses of the roots. |
| SetVector<Instruction *> worklist; |
| auto insertUsesOf = [&worklist, state](Instruction *vectorized) { |
| for (auto *r : vectorized->getResults()) |
| for (auto &u : r->getUses()) { |
| auto *inst = u.getOwner(); |
        // Don't propagate to terminators; a separate pass is needed for those.
| // TODO(ntv)[b/119759136]: use isa<> once Op is implemented. |
| if (state->terminators.count(inst) > 0) { |
| continue; |
| } |
| worklist.insert(inst); |
| } |
| }; |
| apply(insertUsesOf, state->roots); |
| |
| // Note: Worklist size increases iteratively. At each round we evaluate the |
| // size again. By construction, the order of elements in the worklist is |
| // consistent across iterations. |
| for (unsigned i = 0; i < worklist.size(); ++i) { |
| auto *inst = worklist[i]; |
| LLVM_DEBUG(dbgs() << "\n[early-vect] vectorize use: "); |
| LLVM_DEBUG(inst->print(dbgs())); |
| |
| // 2. Create vectorized form of the instruction. |
| // Insert it just before inst, on success register inst as replaced. |
| FuncBuilder b(inst); |
| auto *vectorizedInst = vectorizeOneInstruction(&b, inst, state); |
| if (!vectorizedInst) { |
| return true; |
| } |
| |
    // 3. Register replacement for future uses in the scope.
| // Note that we cannot just call replaceAllUsesWith because it may |
| // result in ops with mixed types, for ops whose operands have not all |
| // yet been vectorized. This would be invalid IR. |
| state->registerReplacement(inst, vectorizedInst); |
| |
| // 4. Augment the worklist with uses of the instruction we just vectorized. |
| // This preserves the proper order in the worklist. |
| apply(insertUsesOf, ArrayRef<Instruction *>{inst}); |
| } |
| return false; |
| } |
| |
| /// Vectorization is a recursive procedure where anything below can fail. |
| /// The root match thus needs to maintain a clone for handling failure. |
/// Each root may succeed independently but will otherwise clean up after
/// itself if anything below it fails.
| static bool vectorizeRootMatch(NestedMatch m, VectorizationStrategy *strategy) { |
| auto loop = m.getMatchedInstruction()->cast<AffineForOp>(); |
| VectorizationState state; |
| state.strategy = strategy; |
| |
| // Since patterns are recursive, they can very well intersect. |
| // Since we do not want a fully greedy strategy in general, we decouple |
| // pattern matching, from profitability analysis, from application. |
| // As a consequence we must check that each root pattern is still |
| // vectorizable. If a pattern is not vectorizable anymore, we just skip it. |
| // TODO(ntv): implement a non-greedy profitability analysis that keeps only |
| // non-intersecting patterns. |
| if (!isVectorizableLoop(loop)) { |
| LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable"); |
| return true; |
| } |
| auto *loopInst = loop->getInstruction(); |
| FuncBuilder builder(loopInst); |
| auto clonedLoop = builder.clone(*loopInst)->cast<AffineForOp>(); |
| |
| auto fail = doVectorize(m, &state); |
  // Sets up error handling for this root loop. This is how the root match
  // maintains a clone for handling failure and restores the proper state via
  // RAII.
| ScopeGuard sg2([&fail, &loop, &clonedLoop]() { |
| if (fail) { |
| loop->getInductionVar()->replaceAllUsesWith( |
| clonedLoop->getInductionVar()); |
| loop->erase(); |
| } else { |
| clonedLoop->erase(); |
| } |
| }); |
| if (fail) { |
| LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root doVectorize"); |
| return true; |
| } |
| |
  // Form the root operations that have been set in the replacementMap.
| // For now, these roots are the loads for which vector_transfer_read |
| // operations have been inserted. |
| auto getDefiningInst = [](const Value *val) { |
| return const_cast<Value *>(val)->getDefiningInst(); |
| }; |
| using ReferenceTy = decltype(*(state.replacementMap.begin())); |
| auto getKey = [](ReferenceTy it) { return it.first; }; |
| auto roots = map(getDefiningInst, map(getKey, state.replacementMap)); |
| |
| // Vectorize the root operations and everything reached by use-def chains |
| // except the terminators (store instructions) that need to be |
| // post-processed separately. |
| fail = vectorizeOperations(&state); |
| if (fail) { |
| LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed vectorizeOperations"); |
| return true; |
| } |
| |
| // Finally, vectorize the terminators. If anything fails to vectorize, skip. |
| auto vectorizeOrFail = [&fail, &state](Instruction *inst) { |
| if (fail) { |
| return; |
| } |
| FuncBuilder b(inst); |
| auto *res = vectorizeOneInstruction(&b, inst, &state); |
| if (res == nullptr) { |
| fail = true; |
| } |
| }; |
| apply(vectorizeOrFail, state.terminators); |
| if (fail) { |
| LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed to vectorize terminators"); |
| return true; |
| } else { |
| LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern"); |
| } |
| |
| // Finish this vectorization pattern. |
| state.finishVectorizationPattern(); |
| return false; |
| } |
| |
| /// Applies vectorization to the current Function by searching over a bunch of |
| /// predetermined patterns. |
| PassResult Vectorize::runOnFunction(Function *f) { |
| // Thread-safe RAII local context, BumpPtrAllocator freed on exit. |
| NestedPatternContext mlContext; |
| |
| for (auto &pat : makePatterns()) { |
| LLVM_DEBUG(dbgs() << "\n******************************************"); |
| LLVM_DEBUG(dbgs() << "\n******************************************"); |
| LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on Function\n"); |
| LLVM_DEBUG(f->print(dbgs())); |
| unsigned patternDepth = pat.getDepth(); |
| |
| SmallVector<NestedMatch, 8> matches; |
| pat.match(f, &matches); |
| // Iterate over all the top-level matches and vectorize eagerly. |
| // This automatically prunes intersecting matches. |
| for (auto m : matches) { |
| VectorizationStrategy strategy; |
| // TODO(ntv): depending on profitability, elect to reduce the vector size. |
| strategy.vectorSizes.assign(clVirtualVectorSize.begin(), |
| clVirtualVectorSize.end()); |
| auto fail = analyzeProfitability(m.getMatchedChildren(), 1, patternDepth, |
| &strategy); |
| if (fail) { |
| continue; |
| } |
| vectorizeLoopIfProfitable(m.getMatchedInstruction(), 0, patternDepth, |
| &strategy); |
| // TODO(ntv): if pattern does not apply, report it; alter the |
| // cost/benefit. |
| fail = vectorizeRootMatch(m, &strategy); |
| // TODO(ntv): some diagnostics. |
| } |
| } |
| LLVM_DEBUG(dbgs() << "\n"); |
| return PassResult::Success; |
| } |
| |
| FunctionPass *mlir::createVectorizePass() { return new Vectorize(); } |
| |
| static PassRegistration<Vectorize> |
| pass("vectorize", |
| "Vectorize to a target independent n-D vector abstraction"); |