Fix Linalg tiling for the partial tile case.

    This CL prepares for mixing lowering of tiled linalg operations to loops with load and store operations. In particular it is necessary to capture partial tile information in views. This CL makes slice ops during Linalg tiling properly stop at partial tile boundaries by implementing `min` with a `cmpi` and `select` over values of index type.

    To be consistent with lowering to loops, the implementation of tiling also drops specifics of accessing values via ranges and instead uses ranges of the form
    `[0, dim(view), 1]` for creating view slices. This simplifies the code for the implementation of tiling and utils.

    This also allows removing restrictions around needing a View or SliceOp defined in the current function context (as well as all its RangeOps). The restriction removal is tested by making the dot test operate directly on views.

    The above is still subject to folding of the linalg.dim operation, which is left for a future CL.

    At this time, mixing tiling and lowering to loops all the way to execution is not yet functional because affine.for does not allow arbitrarily defined values of index type as its operands.

    The previously introduced linalg.range_intersection was not sufficient to capture the necessary information and still required dealing with max quantities.
    A followup CL will remove linalg.range_intersection.

--

PiperOrigin-RevId: 249698823
diff --git a/include/mlir/Linalg/Utils/Utils.h b/include/mlir/Linalg/Utils/Utils.h
index eea92be..26fcbc7 100644
--- a/include/mlir/Linalg/Utils/Utils.h
+++ b/include/mlir/Linalg/Utils/Utils.h
@@ -42,6 +42,9 @@
 
 } // namespace edsc
 
+namespace linalg {
+class LinalgOp;
+
 /// Helper class to memoize the creation of redundant constants within a given
 /// function.
 class FunctionConstants {
@@ -54,40 +57,10 @@
   llvm::SmallDenseMap<int64_t, Value *> map;
 };
 
-/// Abstracts away the extraction of values of RangeType from the actual op
-/// implementation. For each operand of `op`:
-///   1. If it is of RangeType, appends it to the result.
-///   2. If it is of ViewType, further differentiates between:
-///      a. Views that have a defining op, in which cases it appends the ranges
-///         of the defining op.
-///      b. Views that do not have a defining op, in which case it materializes
-///         new range extraction ops to retrieve the range. This is not yet
-///         implemented and depends on future operations (e.g. extract_range).
-/// Precedence is given to a. over b. because it allows propagating existing
-/// values instead of creating new, duplicate, values.
-// TODO(ntv): Implement range extraction ops.
-SmallVector<Value *, 8> getRanges(Operation *op);
-
-/// Returns a value of ViewType at `b`, `loc` by applying the `ranges` to
-/// `viewDefiningOp`. This creates a new op unless `viewDefiningOp` already has
-/// the same exact `ranges`, in which case its (unique) result is returned.
-Value *createOrReturnView(FuncBuilder *b, Location loc,
-                          Operation *viewDefiningOp,
-                          llvm::ArrayRef<Value *> ranges);
-
-/// Returns the min/max/step from a RangeType value, depending on `part`:
-///   1. If `range` comes from a range defining op, this just returns the proper
-///      operand.
-///   2. Otherwise (e.g. if range is a function parameter), it materializes new
-///      part extraction ops to retrieve the min/max/step. This is not yet
-///      implemented and depends on future operations (e.g. extract_min, ...).
-/// Precedence is given to 1. over 2. because it allows propagating existing
-/// values instead of creating new, duplicate, values.
-/// This is used to abstract away the extraction of the min/max/step from a
-/// RangeType value.
-// TODO(ntv): Implement range extraction ops.
-enum class RangePart { Min = 0, Max, Step };
-Value *extractRangePart(Value *range, RangePart part);
+// Returns the linearized list of all view dimensions in a linalgOp. Applying
+// the inverse, concatenated loopToOperandRangeMaps to this list allows the
+// derivation of loop ranges for any linalgOp.
+SmallVector<Value *, 8> getViewSizes(LinalgOp &linalgOp);
 
 /// Returns the values obtained by applying `map` to the list of values.
 /// Performs simplifications and foldings where possible.
@@ -96,14 +69,7 @@
                                          ArrayRef<Value *> values,
                                          FunctionConstants &state);
 
-/// Returns the values obtained by applying `map` to the list of range parts
-/// extracted from `ranges`. Performs simplifications and foldings where
-/// possible.
-SmallVector<Value *, 4> applyMapToRangePart(FuncBuilder *b, Location loc,
-                                            AffineMap map,
-                                            ArrayRef<Value *> ranges,
-                                            RangePart part,
-                                            FunctionConstants &state);
+} // namespace linalg
 } // namespace mlir
 
 #endif // MLIR_LINALG_UTILS_H_
diff --git a/lib/Linalg/IR/LinalgOps.cpp b/lib/Linalg/IR/LinalgOps.cpp
index 4d7b829..24a47da 100644
--- a/lib/Linalg/IR/LinalgOps.cpp
+++ b/lib/Linalg/IR/LinalgOps.cpp
@@ -239,8 +239,6 @@
   if (llvm::empty(getOperands()))
     return emitOpError(
         "requires at least a view operand followed by 'rank' indices");
-  if (!dyn_cast_or_null<ViewOp>(getOperand(0)->getDefiningOp()))
-    return emitOpError("first operand must come from a ViewOp");
   unsigned rank = getBaseViewRank();
   if (llvm::size(getIndexings()) != rank) {
     return emitOpError("requires at least a view operand followed by ")
diff --git a/lib/Linalg/Transforms/LowerToLoops.cpp b/lib/Linalg/Transforms/LowerToLoops.cpp
index b248578..b2f59c4 100644
--- a/lib/Linalg/Transforms/LowerToLoops.cpp
+++ b/lib/Linalg/Transforms/LowerToLoops.cpp
@@ -50,20 +50,6 @@
   return res;
 }
 
-// Returns the linearized list of all view dimensions in a linalgOp. Appliying
-// the inverse, concatenated loopToOperandRangeMaps to this list allows the
-// derivation of loop ranges for any linalgOp.
-static SmallVector<Value *, 8> getViewSizes(LinalgOp &linalgOp) {
-  SmallVector<Value *, 8> res;
-  using dim = ValueBuilder<linalg::DimOp>;
-  for (auto v : linalgOp.getInputsAndOutputs()) {
-    ViewType t = v->getType().cast<ViewType>();
-    for (unsigned i = 0; i < t.getRank(); ++i)
-      res.push_back(dim(v, i));
-  }
-  return res;
-}
-
 static void emitLinalgOpAsLoops(LinalgOp &linalgOp, FunctionConstants &state) {
   FuncBuilder b(linalgOp.getOperation());
   ScopedContext scope(b, linalgOp.getOperation()->getLoc());
diff --git a/lib/Linalg/Transforms/Tiling.cpp b/lib/Linalg/Transforms/Tiling.cpp
index 6b56795..6e72ecf 100644
--- a/lib/Linalg/Transforms/Tiling.cpp
+++ b/lib/Linalg/Transforms/Tiling.cpp
@@ -28,17 +28,15 @@
 #include "mlir/Linalg/Passes.h"
 #include "mlir/Linalg/Utils/Utils.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
 #include "mlir/Support/STLExtras.h"
 
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
 
 using namespace mlir;
 using namespace mlir::edsc;
 using namespace mlir::edsc::intrinsics;
 using namespace mlir::linalg;
-using namespace llvm;
 
 static llvm::cl::OptionCategory clOptionsCategory("linalg options");
 static llvm::cl::list<unsigned>
@@ -77,48 +75,76 @@
 // are tiled and for which new loops will be created.
 static SmallVector<Value *, 4>
 makeTiledLoopRanges(FuncBuilder *b, Location loc, AffineMap map,
-                    ArrayRef<Value *> allOpRanges, ArrayRef<Value *> tileSizes,
-                    FunctionConstants &state) {
-  assert(tileSizes.size() == map.getNumResults());
-  // Tile sizes are in loop order by construction, apply `map` to
-  // get mins/maxes/steps in loop order.
-  auto mins =
-      applyMapToRangePart(b, loc, map, allOpRanges, RangePart::Min, state);
-  auto maxes =
-      applyMapToRangePart(b, loc, map, allOpRanges, RangePart::Max, state);
-  auto steps =
-      applyMapToRangePart(b, loc, map, allOpRanges, RangePart::Step, state);
-  SmallVector<Value *, 4> sizes(tileSizes.begin(), tileSizes.end());
+                    ArrayRef<Value *> allViewSizes,
+                    ArrayRef<Value *> allTileSizes, FunctionConstants &state) {
+  assert(allTileSizes.size() == map.getNumResults());
+  // Apply `map` to get view sizes in loop order.
+  auto viewSizes = applyMapToValues(b, loc, map, allViewSizes, state);
+  SmallVector<Value *, 4> tileSizes(allTileSizes.begin(), allTileSizes.end());
 
   // Traverse the tile sizes, which are in loop order, erase zeros everywhere.
-  for (int idx = mins.size() - 1; idx >= 0; --idx) {
+  for (int idx = tileSizes.size() - 1; idx >= 0; --idx) {
     if (isZero(tileSizes[idx])) {
-      mins.erase(mins.begin() + idx);
-      maxes.erase(maxes.begin() + idx);
-      steps.erase(steps.begin() + idx);
-      sizes.erase(sizes.begin() + idx);
+      viewSizes.erase(viewSizes.begin() + idx);
+      tileSizes.erase(tileSizes.begin() + idx);
     }
   }
 
   // Create a new range with the applied tile sizes.
   SmallVector<Value *, 4> res;
-  for (unsigned idx = 0, e = steps.size(); idx < e; ++idx) {
-    auto *step = steps[idx];
-    auto *tileSize = sizes[idx];
-    // clang-format off
-    // Steps must be constant for now to abide by affine.for semantics.
-    auto *newStep =
-        state.getOrCreateIndex(
-            cast<ConstantIndexOp>(step->getDefiningOp()).getValue() *
-            cast<ConstantIndexOp>(tileSize->getDefiningOp()).getValue());
-    res.push_back(b->create<RangeOp>(loc, mins[idx], maxes[idx], newStep));
-    // clang-format on
+  for (unsigned idx = 0, e = tileSizes.size(); idx < e; ++idx) {
+    res.push_back(b->create<RangeOp>(loc, state.getOrCreateIndex(0),
+                                     viewSizes[idx], tileSizes[idx]));
   }
   return res;
 }
 
+// E.g. for `A` in the expression:
+//     `A(i, k) * B(k, j) -> C(i, j)`
+// and for map:
+//     `(i, j, k) -> (i, k)`
+// and for `dim` such that:
+//     `dim == 1` (i.e. result `k`)
+// returns 2 (i.e. `k` on the map domain).
+static unsigned getPosInDomain(LinalgOp &op, unsigned viewIndex, unsigned dim) {
+  auto map = loopToOperandRangesMaps(op)[viewIndex];
+  return map.getResult(dim).cast<AffineDimExpr>().getPosition();
+}
+
+static bool isTiledView(LinalgOp &linalgOp, unsigned viewIndex,
+                        ArrayRef<Value *> tileSizes) {
+  auto viewIteratorBegin = linalgOp.getInputsAndOutputs().begin();
+  Value *view = *(viewIteratorBegin + viewIndex);
+  unsigned viewRank = view->getType().cast<ViewType>().getRank();
+  for (unsigned r = 0; r < viewRank; ++r) {
+    // Loop position for the range dimension.
+    auto pos = getPosInDomain(linalgOp, viewIndex, r);
+    auto tileSize = tileSizes[pos];
+    if (!isZero(tileSize))
+      return true;
+  }
+  return false;
+}
+
+static Value *foldRange(Value *view, unsigned dim) {
+  assert(view->getType().isa<ViewType>() && "View expected");
+  if (auto *op = view->getDefiningOp()) {
+    if (auto viewOp = dyn_cast<ViewOp>(op)) {
+      return *(viewOp.getIndexings().begin() + dim);
+    }
+    auto sliceOp = cast<SliceOp>(op);
+    for (auto *i : sliceOp.getIndexings())
+      if (i->getType().isa<RangeType>()) {
+        if (dim == 0)
+          return i;
+        --dim;
+      }
+  }
+  return nullptr;
+}
+
 static SmallVector<Value *, 4> makeTiledViews(FuncBuilder *b, Location loc,
-                                              Operation *op,
+                                              LinalgOp &linalgOp,
                                               ArrayRef<Value *> ivs,
                                               ArrayRef<Value *> tileSizes,
                                               FunctionConstants &state) {
@@ -126,57 +152,66 @@
                            llvm::make_range(tileSizes.begin(), tileSizes.end()),
                            [](Value *v) { return !isZero(v); })) &&
          "expected as many ivs as non-zero sizes");
-  auto *context = op->getContext();
+
+  using edsc::intrinsics::select;
+  using edsc::op::operator+;
+  using edsc::op::operator<;
+  using dim = ValueBuilder<linalg::DimOp>;
+  using range = ValueBuilder<RangeOp>;
+
+  auto *op = linalgOp.getOperation();
 
   SmallVector<Value *, 4> res;
   res.reserve(op->getNumOperands());
-  for (unsigned i = 0, ei = op->getNumOperands(); i < ei; ++i) {
-    auto *viewDefiningOp = op->getOperand(i)->getDefiningOp();
-    assert(viewDefiningOp && "Need operations to extract ranges from views");
-    auto ranges = getRanges(viewDefiningOp);
-    // E.g. for A in A(i, k) * B(k, j) -> C(i, j) returns the map:
-    //   (i, j, k) -> (i, k)
-    auto map = loopToOperandRangesMaps(op)[i];
-    if (!map) {
-      assert(ranges.empty() && "scalar should have empty ranges");
-      res.push_back(op->getOperand(i));
+  auto viewIteratorBegin = linalgOp.getInputsAndOutputs().begin();
+  for (unsigned viewIndex = 0; viewIndex < linalgOp.getNumInputsAndOutputs();
+       ++viewIndex) {
+    Value *view = *(viewIteratorBegin + viewIndex);
+    unsigned viewRank = view->getType().cast<ViewType>().getRank();
+    // Early exit in the untiled case.
+    if (!isTiledView(linalgOp, viewIndex, tileSizes)) {
+      res.push_back(view);
       continue;
     }
-    assert(ranges.size() == map.getNumResults());
-    // E.g. for {0, 0, v2} returns the map:
-    //   (i, j, k) -> (k)
-    auto nzMap = nonZeroMap(tileSizes);
 
+    // The view is tiled along at least one dimension: construct a new slice.
     SmallVector<Value *, 4> newRanges;
-    newRanges.reserve(ranges.size());
-    for (unsigned j = 0, ej = ranges.size(); j < ej; ++j) {
+    newRanges.reserve(viewRank);
+    for (unsigned r = 0; r < viewRank; ++r) {
       // Loop position for the range dimension.
-      // E.g. for A in A(i, k) * B(k, j) -> C(i, j) and map: (i, j, k) -> (i, k)
-      //   and for j == 1 (i.e. result `k`)
-      //   returns loopPos = 2 (i.e. `k` on the map domain).
-      auto pos = map.getResult(j).template cast<AffineDimExpr>().getPosition();
-      if (isZero(tileSizes[pos])) {
-        newRanges.push_back(ranges[j]);
+      auto pos = getPosInDomain(linalgOp, viewIndex, r);
+      auto tileSize = tileSizes[pos];
+      if (isZero(tileSize)) {
+        auto *foldedRange = foldRange(view, r);
+        foldedRange
+            ? newRanges.push_back(foldedRange)
+            : newRanges.push_back(range(state.getOrCreateIndex(0), dim(view, r),
+                                        state.getOrCreateIndex(1)));
         continue;
       }
-      auto it = llvm::find_if(nzMap.getResults(), [pos, context](AffineExpr e) {
-        return e == getAffineDimExpr(pos, context);
-      });
-      assert(it != nzMap.getResults().end() &&
-             "position does not correspond to a valid induction variable");
-      unsigned pos2 = it - nzMap.getResults().begin();
-      using edsc::op::operator+;
-      using range = ValueBuilder<RangeOp>;
-      using range_intersect = ValueBuilder<RangeIntersectOp>;
+
+      // `tileSizes` of `0` don't have an induction variable counterpart. So
+      // we count the number of zeros to align the index in `ivs` to pos.
+      auto count = llvm::count_if(
+          llvm::make_range(tileSizes.begin(), tileSizes.begin() + pos),
+          [](Value *v) { return isZero(v); });
+      auto iv = ivs[pos - count];
+
       ScopedContext scope(*b, loc);
-      ValueHandle iv(ivs[pos2]), step(tileSizes[pos]);
-      auto min = ValueHandle(extractRangePart(ranges[j], RangePart::Min));
-      // zero case is important enough to fold away by special-casing.
-      auto newMin = isZero(min) ? iv : min + iv;
-      Value *r = range_intersect(ranges[j], range(newMin, newMin + step, step));
-      newRanges.push_back(r);
+      // TODO(ntv): lb = iv is a poor man's folding of max(0, i) == i which is
+      // generally wrong but correct in the specific case of tiling linalg ops.
+      // Tie this loose end in the future.
+      ValueHandle lb(iv);
+      ValueHandle step(tileSize);
+      ValueHandle steppedlb = lb + step;
+      ValueHandle viewSize = dim(view, r);
+      ValueHandle ub = select(viewSize < steppedlb, viewSize, steppedlb);
+      // Tiling creates a new slice at the proper index, the slice step is 1
+      // (i.e. the slice view does not subsample, stepping occurs in the loop).
+      newRanges.push_back(range(lb, ub, state.getOrCreateIndex(1)));
     }
-    res.push_back(createOrReturnView(b, loc, viewDefiningOp, newRanges));
+    // res.push_back(createOrReturnView(b, loc, viewDefiningOp, newRanges));
+    res.push_back(b->create<SliceOp>(loc, view, newRanges));
   }
   return res;
 }
@@ -198,7 +233,7 @@
       // The flattened loopToOperandRangesMaps is expected to be an invertible
       // permutation map (which is asserted in the inverse calculation).
       inversePermutation(concatAffineMaps(loopToOperandRangesMaps(op))),
-      getRanges(op.getOperation()), tileSizes, state);
+      getViewSizes(op), tileSizes, state);
 
   SmallVector<IndexHandle, 4> ivs(loopRanges.size());
   auto pivs = IndexHandle::makeIndexHandlePointers(ivs);
@@ -209,11 +244,8 @@
     // If/when the assertion below becomes false, we will have to templatize
     // `makeTiledViews`.
     assert(op.getNumInputsAndOutputs() == op.getOperation()->getNumOperands());
-    auto views =
-        makeTiledViews(b, loc, op.getOperation(), ivValues, tileSizes, state);
+    auto views = makeTiledViews(b, loc, op, ivValues, tileSizes, state);
     op.create(*b, loc, views);
-    /// NestedBuilders expect handles, we thus return an IndexHandle.
-    return IndexHandle();
   });
 
   return success();
diff --git a/lib/Linalg/Utils/Utils.cpp b/lib/Linalg/Utils/Utils.cpp
index 9d4e2c9..4928c19 100644
--- a/lib/Linalg/Utils/Utils.cpp
+++ b/lib/Linalg/Utils/Utils.cpp
@@ -34,7 +34,6 @@
 using namespace mlir::edsc;
 using namespace mlir::edsc::intrinsics;
 using namespace mlir::linalg;
-using namespace llvm;
 
 mlir::edsc::LoopNestRangeBuilder::LoopNestRangeBuilder(
     ArrayRef<ValueHandle *> ivs, ArrayRef<ValueHandle> ranges) {
@@ -68,78 +67,17 @@
   return ValueHandle::null();
 }
 
-SmallVector<Value *, 8> mlir::getRanges(Operation *op) {
+SmallVector<Value *, 8> mlir::linalg::getViewSizes(LinalgOp &linalgOp) {
   SmallVector<Value *, 8> res;
-  if (auto view = dyn_cast<ViewOp>(op)) {
-    res.append(view.getIndexings().begin(), view.getIndexings().end());
-  } else if (auto slice = dyn_cast<SliceOp>(op)) {
-    for (auto *i : slice.getIndexings())
-      if (i->getType().isa<RangeType>())
-        res.push_back(i);
-  } else {
-    for (auto *v : op->getOperands()) {
-      if (v->getType().isa<ViewType>()) {
-        if (auto *vOp = v->getDefiningOp()) {
-          auto tmp = getRanges(vOp);
-          res.append(tmp.begin(), tmp.end());
-        } else {
-          llvm_unreachable("Needs an operation to extract ranges from a view");
-        }
-      }
-    }
+  using dim = ValueBuilder<linalg::DimOp>;
+  for (auto v : linalgOp.getInputsAndOutputs()) {
+    ViewType t = v->getType().cast<ViewType>();
+    for (unsigned i = 0; i < t.getRank(); ++i)
+      res.push_back(dim(v, i));
   }
   return res;
 }
 
-// Implementation details:
-//   1. Checks whether `ranges` define a new View by performing an equality
-//      check between the range ssa-values and the operands of
-//      `viewDefiningOp`.
-//   2. If all ranges happen to be equal, op creation is elided and the
-//      original result is returned instead.
-//   3. Otherwise, creates a SliceOp with the new `ranges`.
-// This is used to abstract away the creation of a SliceOp.
-Value *mlir::createOrReturnView(FuncBuilder *b, Location loc,
-                                Operation *viewDefiningOp,
-                                ArrayRef<Value *> ranges) {
-  if (auto view = dyn_cast<ViewOp>(viewDefiningOp)) {
-    auto indexings = view.getIndexings();
-    if (std::equal(indexings.begin(), indexings.end(), ranges.begin()))
-      return view.getResult();
-    return b->create<SliceOp>(loc, view.getResult(), ranges);
-  }
-  auto slice = cast<SliceOp>(viewDefiningOp);
-  unsigned idxRange = 0;
-  SmallVector<Value *, 4> newIndexings;
-  bool elide = true;
-  for (auto indexing : slice.getIndexings()) {
-    if (indexing->getType().isa<RangeType>()) {
-      elide &= (indexing != ranges[idxRange]);
-      newIndexings.push_back(ranges[idxRange++]);
-    } else
-      newIndexings.push_back(indexing);
-  }
-  if (elide)
-    return slice.getResult();
-  return b->create<SliceOp>(loc, slice.getBaseView(), newIndexings);
-}
-
-Value *mlir::extractRangePart(Value *range, RangePart part) {
-  assert(range->getType().isa<RangeType>() && "expected range type");
-  if (range->getDefiningOp()) {
-    if (auto r = dyn_cast_or_null<RangeOp>(range->getDefiningOp())) {
-      switch (part) {
-      case RangePart::Min:
-        return r.min();
-      case RangePart::Max:
-        return r.max();
-      case RangePart::Step:
-        return r.step();
-      }
-    }
-  }
-  llvm_unreachable("need operations to extract range parts");
-}
 // Folding eagerly is necessary to abide by affine.for static step requirement.
 // We must propagate constants on the steps as aggressively as possible.
 // Returns nullptr if folding is not trivially feasible.
@@ -167,10 +105,10 @@
   return b->create<AffineApplyOp>(loc, map, operands);
 }
 
-SmallVector<Value *, 4> mlir::applyMapToValues(FuncBuilder *b, Location loc,
-                                               AffineMap map,
-                                               ArrayRef<Value *> values,
-                                               FunctionConstants &state) {
+SmallVector<Value *, 4>
+mlir::linalg::applyMapToValues(FuncBuilder *b, Location loc, AffineMap map,
+                               ArrayRef<Value *> values,
+                               FunctionConstants &state) {
   SmallVector<Value *, 4> res;
   res.reserve(map.getNumResults());
   unsigned numDims = map.getNumDims();
@@ -184,17 +122,6 @@
   return res;
 }
 
-SmallVector<Value *, 4> mlir::applyMapToRangePart(FuncBuilder *b, Location loc,
-                                                  AffineMap map,
-                                                  ArrayRef<Value *> ranges,
-                                                  RangePart part,
-                                                  FunctionConstants &state) {
-  SmallVector<Value *, 4> rangeParts(ranges.size());
-  llvm::transform(ranges, rangeParts.begin(),
-                  [&](Value *range) { return extractRangePart(range, part); });
-  return applyMapToValues(b, loc, map, rangeParts, state);
-}
-
 Value *FunctionConstants::getOrCreateIndex(int64_t v) {
   auto it = map.find(v);
   if (it != map.end())
diff --git a/test/Linalg/tile.mlir b/test/Linalg/tile.mlir
index cc6f0e9..bf4c656 100644
--- a/test/Linalg/tile.mlir
+++ b/test/Linalg/tile.mlir
@@ -30,75 +30,117 @@
 //       TILE-2: %[[A:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-2-NEXT: %[[B:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-2-NEXT: %[[C:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
-//       TILE-2: affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg1) step 2 {
+//       TILE-2: %[[M:.*]] = linalg.dim %[[A]], 0 : !linalg.view<?x?xf32>
+//       TILE-2: affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[M]]) step 2 {
 //  TILE-2-NEXT:   %[[a:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-2-NEXT:   %[[ra0:.*]] = linalg.range %i0:%[[a]]:%c2 : !linalg.range
-//  TILE-2-NEXT:   %[[ra:.*]] = linalg.range_intersect %{{.}}, %[[ra0]] : !linalg.range
-//  TILE-2-NEXT:   %[[sAi:.*]] = linalg.slice %[[A]][%[[ra]], %2] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
-//  TILE-2-NEXT:   %[[c:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-2-NEXT:   %[[rc0:.*]] = linalg.range %i0:%[[c]]:%c2 : !linalg.range
-//  TILE-2-NEXT:   %[[rc:.*]] = linalg.range_intersect %{{.}}, %[[rc0]] : !linalg.range
-//  TILE-2-NEXT:   %[[sCi:.*]] = linalg.slice %[[C]][%[[rc]], %1] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+//  TILE-2-NEXT:   %[[M:.*]] = linalg.dim %[[A]], 0 : !linalg.view<?x?xf32>
+//  TILE-2-NEXT:   %[[cmpuba:.*]] = cmpi "slt", %[[M]], %[[a]] : index
+//  TILE-2-NEXT:   %[[uba:.*]] = select %[[cmpuba]], %[[M]], %[[a]] : index
+//  TILE-2-NEXT:   %[[ra:.*]] = linalg.range %i0:%[[uba]]:%c1 : !linalg.range
+//       TILE-2:   %[[sAi:.*]] = linalg.slice %[[A]][%[[ra]], {{.*}}] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+//
+//       TILE-2:   %[[c:.*]] = affine.apply #[[UB0]](%i0)
+//  TILE-2-NEXT:   %[[M:.*]] = linalg.dim %[[C]], 0 : !linalg.view<?x?xf32>
+//  TILE-2-NEXT:   %[[cmpubc:.*]] = cmpi "slt", %[[M]], %[[c]] : index
+//  TILE-2-NEXT:   %[[ubc:.*]] = select %[[cmpubc]], %[[M]], %[[c]] : index
+//  TILE-2-NEXT:   %[[rc:.*]] = linalg.range %i0:%[[ubc]]:%c1 : !linalg.range
+//       TILE-2:   %[[sCi:.*]] = linalg.slice %[[C]][%[[rc]], {{.*}}] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+//
 //  TILE-2-NEXT:   linalg.matmul(%[[sAi]], %[[B]], %[[sCi]]) : !linalg.view<?x?xf32>, !linalg.view<?x?xf32>, !linalg.view<?x?xf32>
 
 // TILE-02-LABEL: func @matmul(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
 //       TILE-02: %[[A:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-02-NEXT: %[[B:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-02-NEXT: %[[C:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
-//       TILE-02: affine.for %i0 = #[[ID]](%c0_0) to #[[ID]](%arg2) step 2 {
-//  TILE-02-NEXT:   %[[b:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-02-NEXT:   %[[rb0:.*]] = linalg.range %i0:%[[b]]:%c2 : !linalg.range
-//  TILE-02-NEXT:   %[[rb:.*]] = linalg.range_intersect %{{.}}, %[[rb0]] : !linalg.range
-//  TILE-02-NEXT:   %[[sBj:.*]] = linalg.slice %[[B]][%{{.*}}, %[[rb]]] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
-//  TILE-02-NEXT:   %[[c:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-02-NEXT:   %[[rc0:.*]] = linalg.range %i0:%[[c]]:%c2 : !linalg.range
-//  TILE-02-NEXT:   %[[rc:.*]] = linalg.range_intersect %{{.}}, %[[rc0]] : !linalg.range
+//       TILE-02: %[[N:.*]] = linalg.dim %[[B]], 1 : !linalg.view<?x?xf32>
+//       TILE-02: affine.for %i0 = #[[ID]](%c0) to #[[ID]](%[[N]]) step 2 {
+//       TILE-02:   %[[b:.*]] = affine.apply #[[UB0]](%i0)
+//  TILE-02-NEXT:   %[[N:.*]] = linalg.dim %[[B]], 1 : !linalg.view<?x?xf32>
+//  TILE-02-NEXT:   %[[cmpubb:.*]] = cmpi "slt", %[[N]], %[[b]] : index
+//  TILE-02-NEXT:   %[[ubb:.*]] = select %[[cmpubb]], %[[N]], %[[b]] : index
+//  TILE-02-NEXT:   %[[rb:.*]] = linalg.range %i0:%[[ubb]]:%c1 : !linalg.range
+//       TILE-02:   %[[sBj:.*]] = linalg.slice %[[B]][%{{.*}}, %[[rb]]] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+//
+//       TILE-02:   %[[c:.*]] = affine.apply #[[UB0]](%i0)
+//       TILE-02:   %[[N:.*]] = linalg.dim %[[C]], 1 : !linalg.view<?x?xf32>
+//  TILE-02-NEXT:   %[[cmpubc:.*]] = cmpi "slt", %[[N]], %[[c]] : index
+//  TILE-02-NEXT:   %[[ubc:.*]] = select %[[cmpubc]], %[[N]], %[[c]] : index
+//  TILE-02-NEXT:   %[[rc:.*]] = linalg.range %i0:%[[ubc]]:%c1 : !linalg.range
 //  TILE-02-NEXT:   %[[sCj:.*]] = linalg.slice %[[C]][%{{.*}}, %[[rc]]] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
-//  TILE-02-NEXT:   linalg.matmul(%[[A]], %[[sBj]], %[[sCj]]) : !linalg.view<?x?xf32>, !linalg.view<?x?xf32>, !linalg.view<?x?xf32>
+//
+//       TILE-02:   linalg.matmul(%[[A]], %[[sBj]], %[[sCj]]) : !linalg.view<?x?xf32>, !linalg.view<?x?xf32>, !linalg.view<?x?xf32>
 
 // TILE-002-LABEL: func @matmul(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
 //       TILE-002: %[[A:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-002-NEXT: %[[B:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-002-NEXT: %[[C:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
-//       TILE-002: affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg3) step 2 {
-//  TILE-002-NEXT:   %[[a:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-002-NEXT:   %[[ra0:.*]] = linalg.range %i0:%[[a]]:%c2 : !linalg.range
-//  TILE-002-NEXT:   %[[ra:.*]] = linalg.range_intersect %{{.}}, %[[ra0]] : !linalg.range
+//       TILE-002: %[[K:.*]] = linalg.dim %[[A]], 1 : !linalg.view<?x?xf32>
+//       TILE-002: affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[K]]) step 2 {
+//       TILE-002:   %[[a:.*]] = affine.apply #[[UB0]](%i0)
+//  TILE-002-NEXT:   %[[K:.*]] = linalg.dim %[[A]], 1 : !linalg.view<?x?xf32>
+//  TILE-002-NEXT:   %[[cmpuba:.*]] = cmpi "slt", %[[K]], %[[a]] : index
+//  TILE-002-NEXT:   %[[uba:.*]] = select %[[cmpuba]], %[[K]], %[[a]] : index
+//  TILE-002-NEXT:   %[[ra:.*]] = linalg.range %i0:%[[uba]]:%c1 : !linalg.range
 //  TILE-002-NEXT:   %[[sAj:.*]] = linalg.slice %[[A]][%{{.*}}, %[[ra]]] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
-//  TILE-002-NEXT:   %[[b:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-002-NEXT:   %[[rb0:.*]] = linalg.range %i0:%[[b]]:%c2 : !linalg.range
-//  TILE-002-NEXT:   %[[rb:.*]] = linalg.range_intersect %{{.}}, %[[rb0]] : !linalg.range
-//  TILE-002-NEXT:   %[[sBj:.*]] = linalg.slice %[[B]][%[[rb]], %{{.*}}] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
-//  TILE-002-NEXT:   linalg.matmul(%[[sAj]], %[[sBj]], %[[C]]) : !linalg.view<?x?xf32>, !linalg.view<?x?xf32>, !linalg.view<?x?xf32>
+//
+//       TILE-002:   %[[b:.*]] = affine.apply #[[UB0]](%i0)
+//  TILE-002-NEXT:   %[[K:.*]] = linalg.dim %[[B]], 0 : !linalg.view<?x?xf32>
+//  TILE-002-NEXT:   %[[cmpubb:.*]] = cmpi "slt", %[[K]], %[[b]] : index
+//  TILE-002-NEXT:   %[[ubb:.*]] = select %[[cmpubb]], %[[K]], %[[b]] : index
+//  TILE-002-NEXT:   %[[rb:.*]] = linalg.range %i0:%[[ubb]]:%c1 : !linalg.range
+//       TILE-002:   %[[sBj:.*]] = linalg.slice %[[B]][%[[rb]], %{{.*}}] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+//
+//       TILE-002:   linalg.matmul(%[[sAj]], %[[sBj]], %[[C]]) : !linalg.view<?x?xf32>, !linalg.view<?x?xf32>, !linalg.view<?x?xf32>
 
 // TILE-234-LABEL: func @matmul(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
 //       TILE-234: %[[A:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-234-NEXT: %[[B:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-234-NEXT: %[[C:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
-//       TILE-234:  affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg1) step 2 {
-//  TILE-234-NEXT:    affine.for %i1 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg2) step 3 {
-//  TILE-234-NEXT:      affine.for %i2 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg3) step 4 {
+//       TILE-234: %[[M:.*]] = linalg.dim %[[A]], 0 : !linalg.view<?x?xf32>
+//       TILE-234: %[[K:.*]] = linalg.dim %[[A]], 1 : !linalg.view<?x?xf32>
+//       TILE-234: %[[N:.*]] = linalg.dim %[[B]], 1 : !linalg.view<?x?xf32>
+//       TILE-234:  affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[M]]) step 2 {
+//  TILE-234-NEXT:    affine.for %i1 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[N]]) step 3 {
+//  TILE-234-NEXT:      affine.for %i2 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[K]]) step 4 {
 //  TILE-234-NEXT:        %[[ai:.*]]  = affine.apply #[[UB0]](%i0)
-//  TILE-234-NEXT:        %[[rai0:.*]] = linalg.range %i0:%[[ai]]:%c2 : !linalg.range
-//  TILE-234-NEXT:        %[[rai:.*]] = linalg.range_intersect %{{.}}, %[[rai0]] : !linalg.range
+//  TILE-234-NEXT:        %[[M:.*]] = linalg.dim %[[A]], 0 : !linalg.view<?x?xf32>
+//  TILE-234-NEXT:        %[[cmpubai:.*]] = cmpi "slt", %[[M]], %[[ai]] : index
+//  TILE-234-NEXT:        %[[ubai:.*]] = select %[[cmpubai]], %[[M]], %[[ai]] : index
+//  TILE-234-NEXT:        %[[rai:.*]] = linalg.range %i0:%[[ubai]]:%c1 : !linalg.range
+//
 //  TILE-234-NEXT:        %[[ak:.*]] = affine.apply #[[UB2]](%i2)
-//  TILE-234-NEXT:        %[[rak0:.*]] = linalg.range %i2:%[[ak]]:%c4{{.*}} : !linalg.range
-//  TILE-234-NEXT:        %[[rak:.*]] = linalg.range_intersect %{{.}}, %[[rak0]] : !linalg.range
+//  TILE-234-NEXT:        %[[K:.*]] = linalg.dim %[[A]], 1 : !linalg.view<?x?xf32>
+//  TILE-234-NEXT:        %[[cmpubak:.*]] = cmpi "slt", %[[K]], %[[ak]] : index
+//  TILE-234-NEXT:        %[[ubak:.*]] = select %[[cmpubak]], %[[K]], %[[ak]] : index
+//  TILE-234-NEXT:        %[[rak:.*]] = linalg.range %i2:%[[ubak]]:%c1 : !linalg.range
 //  TILE-234-NEXT:        %[[sAik:.*]] = linalg.slice %[[A]][%[[rai]], %[[rak]]] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+//
 //  TILE-234-NEXT:        %[[bk:.*]] = affine.apply #[[UB2]](%i2)
-//  TILE-234-NEXT:        %[[rbk0:.*]] = linalg.range %i2:%[[bk]]:%c4{{.*}} : !linalg.range
-//  TILE-234-NEXT:        %[[rbk:.*]] = linalg.range_intersect %{{.}}, %[[rbk0]] : !linalg.range
+//  TILE-234-NEXT:        %[[K:.*]] = linalg.dim %[[B]], 0 : !linalg.view<?x?xf32>
+//  TILE-234-NEXT:        %[[cmpubbk:.*]] = cmpi "slt", %[[K]], %[[bk]] : index
+//  TILE-234-NEXT:        %[[ubbk:.*]] = select %[[cmpubbk]], %[[K]], %[[bk]] : index
+//  TILE-234-NEXT:        %[[rbk:.*]] = linalg.range %i2:%[[ubbk]]:%c1 : !linalg.range
+//
 //  TILE-234-NEXT:        %[[bj:.*]] = affine.apply #[[UB1]](%i1)
-//  TILE-234-NEXT:        %[[rbj0:.*]] = linalg.range %i1:%[[bj]]:%c3{{.*}} : !linalg.range
-//  TILE-234-NEXT:        %[[rbj:.*]] = linalg.range_intersect %{{.}}, %[[rbj0]] : !linalg.range
+//  TILE-234-NEXT:        %[[N:.*]] = linalg.dim %[[B]], 1 : !linalg.view<?x?xf32>
+//  TILE-234-NEXT:        %[[cmpubbj:.*]] = cmpi "slt", %[[N]], %[[bj]] : index
+//  TILE-234-NEXT:        %[[ubbj:.*]] = select %[[cmpubbj]], %[[N]], %[[bj]] : index
+//  TILE-234-NEXT:        %[[rbj:.*]] = linalg.range %i1:%[[ubbj]]:%c1 : !linalg.range
 //  TILE-234-NEXT:        %[[sBkj:.*]] = linalg.slice %[[B]][%[[rbk]], %[[rbj]]] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+//
 //  TILE-234-NEXT:        %[[ci:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-234-NEXT:        %[[rci0:.*]] = linalg.range %i0:%[[ci]]:%c2{{.*}} : !linalg.range
-//  TILE-234-NEXT:        %[[rci:.*]] = linalg.range_intersect %{{.}}, %[[rci0]] : !linalg.range
+//  TILE-234-NEXT:        %[[M:.*]] = linalg.dim %[[C]], 0 : !linalg.view<?x?xf32>
+//  TILE-234-NEXT:        %[[cmpubci:.*]] = cmpi "slt", %[[M]], %[[ci]] : index
+//  TILE-234-NEXT:        %[[ubci:.*]] = select %[[cmpubci]], %[[M]], %[[ci]] : index
+//  TILE-234-NEXT:        %[[rci:.*]] = linalg.range %i0:%[[ubci]]:%c1 : !linalg.range
+//
 //  TILE-234-NEXT:        %[[cj:.*]] = affine.apply #[[UB1]](%i1)
-//  TILE-234-NEXT:        %[[rcj0:.*]] = linalg.range %i1:%[[cj]]:%c3{{.*}} : !linalg.range
-//  TILE-234-NEXT:        %[[rcj:.*]] = linalg.range_intersect %{{.}}, %[[rcj0]] : !linalg.range
+//  TILE-234-NEXT:        %[[N:.*]] = linalg.dim %[[C]], 1 : !linalg.view<?x?xf32>
+//  TILE-234-NEXT:        %[[cmpubcj:.*]] = cmpi "slt", %[[N]], %[[cj]] : index
+//  TILE-234-NEXT:        %[[ubcj:.*]] = select %[[cmpubcj]], %[[N]], %[[cj]] : index
+//  TILE-234-NEXT:        %[[rcj:.*]] = linalg.range %i1:%[[ubcj]]:%c1 : !linalg.range
 //  TILE-234-NEXT:        %[[sCij:.*]] = linalg.slice %[[C]][%[[rci]], %[[rcj]]] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+//
 //  TILE-234-NEXT:        linalg.matmul(%[[sAik]], %[[sBkj]], %[[sCij]]) : !linalg.view<?x?xf32>, !linalg.view<?x?xf32>, !linalg.view<?x?xf32>
 
 func @matvec(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
@@ -116,31 +158,45 @@
 //       TILE-2: %[[A:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-2-NEXT: %[[B:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?xf32>
 //  TILE-2-NEXT: %[[C:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?xf32>
-//       TILE-2: affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg1) step 2 {
+//       TILE-2: %[[M:.*]] = linalg.dim %[[A]], 0 : !linalg.view<?x?xf32>
+//       TILE-2: affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[M]]) step 2 {
 //  TILE-2-NEXT:   %[[a:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-2-NEXT:   %[[ra0:.*]] = linalg.range %i0:%[[a]]:%c2 : !linalg.range
-//  TILE-2-NEXT:   %[[ra:.*]] = linalg.range_intersect %{{.}}, %[[ra0]] : !linalg.range
-//  TILE-2-NEXT:   %[[sAi:.*]] = linalg.slice %[[A]][%[[ra]], %{{.*}}] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
-//  TILE-2-NEXT:   %[[c:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-2-NEXT:   %[[rc0:.*]] = linalg.range %i0:%[[c]]:%c2 : !linalg.range
-//  TILE-2-NEXT:   %[[rc:.*]] = linalg.range_intersect %{{.}}, %[[rc0]] : !linalg.range
-//  TILE-2-NEXT:   %[[sCi:.*]] = linalg.slice %[[C]][%[[rc]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
+//  TILE-2-NEXT:   %[[M:.*]] = linalg.dim %[[A]], 0 : !linalg.view<?x?xf32>
+//  TILE-2-NEXT:   %[[cmpuba:.*]] = cmpi "slt", %[[M]], %[[a]] : index
+//  TILE-2-NEXT:   %[[uba:.*]] = select %[[cmpuba]], %[[M]], %[[a]] : index
+//  TILE-2-NEXT:   %[[ra:.*]] = linalg.range %i0:%[[uba]]:%c1 : !linalg.range
+//       TILE-2:   %[[sAi:.*]] = linalg.slice %[[A]][%[[ra]], %{{.*}}] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+//
+//       TILE-2:   %[[c:.*]] = affine.apply #[[UB0]](%i0)
+//  TILE-2-NEXT:   %[[M:.*]] = linalg.dim %[[C]], 0 : !linalg.view<?xf32>
+//  TILE-2-NEXT:   %[[cmpubc:.*]] = cmpi "slt", %[[M]], %[[c]] : index
+//  TILE-2-NEXT:   %[[ubc:.*]] = select %[[cmpubc]], %[[M]], %[[c]] : index
+//  TILE-2-NEXT:   %[[rc:.*]] = linalg.range %i0:%[[ubc]]:%c1 : !linalg.range
+//       TILE-2:   %[[sCi:.*]] = linalg.slice %[[C]][%[[rc]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
+//
 //  TILE-2-NEXT:   linalg.matvec(%[[sAi]], %[[B]], %[[sCi]]) : !linalg.view<?x?xf32>, !linalg.view<?xf32>, !linalg.view<?xf32>
 
 // TILE-02-LABEL: func @matvec(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
 //       TILE-02: %[[A:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-02-NEXT: %[[B:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?xf32>
 //  TILE-02-NEXT: %[[C:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?xf32>
-//       TILE-02: affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg2) step 2 {
-//  TILE-02-NEXT:   %[[a:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-02-NEXT:   %[[ra0:.*]] = linalg.range %i0:%[[a]]:%c2 : !linalg.range
-//  TILE-02-NEXT:   %[[ra:.*]] = linalg.range_intersect %{{.}}, %[[ra0]] : !linalg.range
+//       TILE-02: %[[K:.*]] = linalg.dim %[[A]], 1 : !linalg.view<?x?xf32>
+//       TILE-02: affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[K]]) step 2 {
+//       TILE-02:   %[[a:.*]] = affine.apply #[[UB0]](%i0)
+//  TILE-02-NEXT:   %[[K:.*]] = linalg.dim %[[A]], 1 : !linalg.view<?x?xf32>
+//  TILE-02-NEXT:   %[[cmpuba:.*]] = cmpi "slt", %[[K]], %[[a]] : index
+//  TILE-02-NEXT:   %[[uba:.*]] = select %[[cmpuba]], %[[K]], %[[a]] : index
+//  TILE-02-NEXT:   %[[ra:.*]] = linalg.range %i0:%[[uba]]:%c1 : !linalg.range
 //  TILE-02-NEXT:   %[[sAj:.*]] = linalg.slice %[[A]][%{{.*}}, %[[ra]]] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+// B is 1-D: its single dimension gets the same %i0 tile, with the upper bound clamped to dim(B, 0) via cmpi + select.
 //  TILE-02-NEXT:   %[[b:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-02-NEXT:   %[[rb0:.*]] = linalg.range %i0:%[[b]]:%c2 : !linalg.range
-//  TILE-02-NEXT:   %[[rb:.*]] = linalg.range_intersect %{{.}}, %[[rb0]] : !linalg.range
+//  TILE-02-NEXT:   %[[K:.*]] = linalg.dim %[[B]], 0 : !linalg.view<?xf32>
+//  TILE-02-NEXT:   %[[cmpubb:.*]] = cmpi "slt", %[[K]], %[[b]] : index
+//  TILE-02-NEXT:   %[[ubb:.*]] = select %[[cmpubb]], %[[K]], %[[b]] : index
+//  TILE-02-NEXT:   %[[rb:.*]] = linalg.range %i0:%[[ubb]]:%c1 : !linalg.range
 //  TILE-02-NEXT:   %[[sBj:.*]] = linalg.slice %[[B]][%[[rb]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
-//  TILE-02-NEXT:   linalg.matvec(%[[sAj]], %[[sBj]], %[[C]]) : !linalg.view<?x?xf32>, !linalg.view<?xf32>, !linalg.view<?xf32>
+// C reaches the tiled matvec as the original, unsliced view; only A and B are replaced by slices.
+//       TILE-02:   linalg.matvec(%[[sAj]], %[[sBj]], %[[C]]) : !linalg.view<?x?xf32>, !linalg.view<?xf32>, !linalg.view<?xf32>
 
 // TILE-002-LABEL: func @matvec(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
 //   TILE-002-NOT: affine.for
@@ -149,67 +205,83 @@
 //       TILE-234: %[[A:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?x?xf32>
 //  TILE-234-NEXT: %[[B:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?xf32>
 //  TILE-234-NEXT: %[[C:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?xf32>
-//       TILE-234:  affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg1) step 2 {
-//  TILE-234-NEXT:    affine.for %i1 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg2) step 3 {
+//       TILE-234: %[[M:.*]] = linalg.dim %[[A]], 0 : !linalg.view<?x?xf32>
+//       TILE-234: %[[K:.*]] = linalg.dim %[[A]], 1 : !linalg.view<?x?xf32>
+//       TILE-234:  affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[M]]) step 2 {
+//  TILE-234-NEXT:    affine.for %i1 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[K]]) step 3 {
 //  TILE-234-NEXT:      %[[ai:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-234-NEXT:      %[[rai0:.*]] = linalg.range %i0:%[[ai]]:%c2 : !linalg.range
-//  TILE-234-NEXT:      %[[rai:.*]] = linalg.range_intersect %{{.}}, %[[rai0]] : !linalg.range
+//  TILE-234-NEXT:      %[[M:.*]] = linalg.dim %[[A]], 0 : !linalg.view<?x?xf32>
+//  TILE-234-NEXT:      %[[cmpubai:.*]] = cmpi "slt", %[[M]], %[[ai]] : index
+//  TILE-234-NEXT:      %[[ubai:.*]] = select %[[cmpubai]], %[[M]], %[[ai]] : index
+//  TILE-234-NEXT:      %[[rai:.*]] = linalg.range %i0:%[[ubai]]:%c1 : !linalg.range
+//
 //  TILE-234-NEXT:      %[[aj:.*]] = affine.apply #[[UB1]](%i1)
-//  TILE-234-NEXT:      %[[raj0:.*]] = linalg.range %i1:%[[aj]]:%c3 : !linalg.range
-//  TILE-234-NEXT:      %[[raj:.*]] = linalg.range_intersect %{{.}}, %[[raj0]] : !linalg.range
+//  TILE-234-NEXT:      %[[K:.*]] = linalg.dim %[[A]], 1 : !linalg.view<?x?xf32>
+//  TILE-234-NEXT:      %[[cmpubaj:.*]] = cmpi "slt", %[[K]], %[[aj]] : index
+//  TILE-234-NEXT:      %[[ubaj:.*]] = select %[[cmpubaj]], %[[K]], %[[aj]] : index
+//  TILE-234-NEXT:      %[[raj:.*]] = linalg.range %i1:%[[ubaj]]:%c1 : !linalg.range
 //  TILE-234-NEXT:      %[[sAij:.*]] = linalg.slice %[[A]][%[[rai]], %[[raj]]] : !linalg.view<?x?xf32>, !linalg.range, !linalg.range, !linalg.view<?x?xf32>
+//
 //  TILE-234-NEXT:      %[[b:.*]] = affine.apply #[[UB1]](%i1)
-//  TILE-234-NEXT:      %[[rb0:.*]] = linalg.range %i1:%[[b]]:%c3 : !linalg.range
-//  TILE-234-NEXT:      %[[rb:.*]] = linalg.range_intersect %{{.}}, %[[rb0]] : !linalg.range
+//  TILE-234-NEXT:      %[[K:.*]] = linalg.dim %[[B]], 0 : !linalg.view<?xf32>
+//  TILE-234-NEXT:      %[[cmpubb:.*]] = cmpi "slt", %[[K]], %[[b]] : index
+//  TILE-234-NEXT:      %[[ubb:.*]] = select %[[cmpubb]], %[[K]], %[[b]] : index
+//  TILE-234-NEXT:      %[[rb:.*]] = linalg.range %i1:%[[ubb]]:%c1 : !linalg.range
 //  TILE-234-NEXT:      %[[sB:.*]] = linalg.slice %[[B]][%[[rb]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
+//
 //  TILE-234-NEXT:      %[[c:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-234-NEXT:      %[[rc0:.*]] = linalg.range %i0:%[[c]]:%c2 : !linalg.range
-//  TILE-234-NEXT:      %[[rc:.*]] = linalg.range_intersect %{{.}}, %[[rc0]] : !linalg.range
+//  TILE-234-NEXT:      %[[M:.*]] = linalg.dim %[[C]], 0 : !linalg.view<?xf32>
+//  TILE-234-NEXT:      %[[cmpubc:.*]] = cmpi "slt", %[[M]], %[[c]] : index
+//  TILE-234-NEXT:      %[[ubc:.*]] = select %[[cmpubc]], %[[M]], %[[c]] : index
+//  TILE-234-NEXT:      %[[rc:.*]] = linalg.range %i0:%[[ubc]]:%c1 : !linalg.range
 //  TILE-234-NEXT:      %[[sC:.*]] = linalg.slice %[[C]][%[[rc]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
+//
 //  TILE-234-NEXT:      linalg.matvec(%[[sAij]], %[[sB]], %[[sC]]) : !linalg.view<?x?xf32>, !linalg.view<?xf32>, !linalg.view<?xf32>
 
-func @dot(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
-  %c0 = constant 0 : index
-  %c1 = constant 1 : index
-  %I = linalg.range %c0:%arg1:%c1 : !linalg.range
-  %1 = linalg.view %arg0[%I] : !linalg.view<?xf32>
-  %2 = linalg.view %arg0[%I] : !linalg.view<?xf32>
-  %3 = linalg.view %arg0[] : !linalg.view<f32>
-  linalg.dot(%1, %2, %3) : !linalg.view<?xf32>, !linalg.view<?xf32>, !linalg.view<f32>
+func @dot(%arg0: !linalg.view<?xf32>, %arg1: !linalg.view<?xf32>, %arg2: !linalg.view<f32>) { // all three views arrive as function arguments; the body builds no range or view ops
+  linalg.dot(%arg0, %arg1, %arg2) : !linalg.view<?xf32>, !linalg.view<?xf32>, !linalg.view<f32>
   return
 }
-// TILE-2-LABEL: func @dot(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
-//       TILE-2: %[[A:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?xf32>
-//  TILE-2-NEXT: %[[B:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?xf32>
-//  TILE-2-NEXT: %[[C:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<f32>
-//       TILE-2: affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg1) step 2 {
+// TILE-2-LABEL: func @dot(%arg0: !linalg.view<?xf32>, %arg1: !linalg.view<?xf32>, %arg2: !linalg.view<f32>) {
+//       TILE-2: %[[M:.*]] = linalg.dim %arg0, 0 : !linalg.view<?xf32>
+//       TILE-2: affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[M]]) step 2 {
 //  TILE-2-NEXT:   %[[a:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-2-NEXT:   %[[ra0:.*]] = linalg.range %i0:%[[a]]:%c2 : !linalg.range
-//  TILE-2-NEXT:   %[[ra:.*]] = linalg.range_intersect %{{.}}, %[[ra0]] : !linalg.range
-//  TILE-2-NEXT:   %[[sAi:.*]] = linalg.slice %[[A]][%[[ra]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
+//  TILE-2-NEXT:   %[[M:.*]] = linalg.dim %arg0, 0 : !linalg.view<?xf32>
+//  TILE-2-NEXT:   %[[cmpuba:.*]] = cmpi "slt", %[[M]], %[[a]] : index
+//  TILE-2-NEXT:   %[[uba:.*]] = select %[[cmpuba]], %[[M]], %[[a]] : index
+//  TILE-2-NEXT:   %[[ra:.*]] = linalg.range %i0:%[[uba]]:%c1 : !linalg.range
+//       TILE-2:   %[[sAi:.*]] = linalg.slice %arg0[%[[ra]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
+// Second operand: the same %i0 tile, with the upper bound clamped to dim(%arg1, 0) via cmpi + select.
 //  TILE-2-NEXT:   %[[b:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-2-NEXT:   %[[rb0:.*]] = linalg.range %i0:%[[b]]:%c2 : !linalg.range
-//  TILE-2-NEXT:   %[[rb:.*]] = linalg.range_intersect %{{.}}, %[[rb0]] : !linalg.range
-//  TILE-2-NEXT:   %[[sBi:.*]] = linalg.slice %[[B]][%[[rb]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
-//  TILE-2-NEXT:   linalg.dot(%[[sAi]], %[[sBi]], %[[C]]) : !linalg.view<?xf32>, !linalg.view<?xf32>, !linalg.view<f32>
+//  TILE-2-NEXT:   %[[K:.*]] = linalg.dim %arg1, 0 : !linalg.view<?xf32>
+//  TILE-2-NEXT:   %[[cmpubb:.*]] = cmpi "slt", %[[K]], %[[b]] : index
+//  TILE-2-NEXT:   %[[ubb:.*]] = select %[[cmpubb]], %[[K]], %[[b]] : index
+//  TILE-2-NEXT:   %[[rb:.*]] = linalg.range %i0:%[[ubb]]:%c1 : !linalg.range
+//  TILE-2-NEXT:   %[[sBi:.*]] = linalg.slice %arg1[%[[rb]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
+// Only the two sliced inputs are pinned below; the third operand is matched with a wildcard.
+//  TILE-2-NEXT:   linalg.dot(%[[sAi]], %[[sBi]], {{.*}}) : !linalg.view<?xf32>, !linalg.view<?xf32>, !linalg.view<f32>
 
-// TILE-02-LABEL: func @dot(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
+// TILE-02-LABEL: func @dot(%arg0: !linalg.view<?xf32>, %arg1: !linalg.view<?xf32>, %arg2: !linalg.view<f32>) {
 //   TILE-02-NOT: affine.for
 
-// TILE-002-LABEL: func @dot(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
+// TILE-002-LABEL: func @dot(%arg0: !linalg.view<?xf32>, %arg1: !linalg.view<?xf32>, %arg2: !linalg.view<f32>) {
 //   TILE-002-NOT: affine.for
 
-// TILE-234-LABEL: func @dot(%arg0: !linalg.buffer<f32>, %arg1: index, %arg2: index, %arg3: index) {
-//       TILE-234: %[[A:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?xf32>
-//  TILE-234-NEXT: %[[B:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<?xf32>
-//  TILE-234-NEXT: %[[C:.*]] = linalg.view %arg0[{{.*}}] : !linalg.view<f32>
-//       TILE-234:  affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%arg1) step 2 {
+// TILE-234-LABEL: func @dot(%arg0: !linalg.view<?xf32>, %arg1: !linalg.view<?xf32>, %arg2: !linalg.view<f32>) {
+//       TILE-234: %[[K:.*]] = linalg.dim %arg0, 0 : !linalg.view<?xf32>
+//       TILE-234:  affine.for %i0 = #[[ID]](%c0{{.*}}) to #[[ID]](%[[K]]) step 2 {
 //  TILE-234-NEXT:    %[[a:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-234-NEXT:    %[[ra0:.*]] = linalg.range %i0:%[[a]]:%c2 : !linalg.range
-//  TILE-234-NEXT:    %[[ra:.*]] = linalg.range_intersect %{{.}}, %[[ra0]] : !linalg.range
-//  TILE-234-NEXT:    %[[sA:.*]] = linalg.slice %[[A]][%[[ra]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
+//  TILE-234-NEXT:    %[[K:.*]] = linalg.dim %arg0, 0 : !linalg.view<?xf32>
+//  TILE-234-NEXT:    %[[cmpuba:.*]] = cmpi "slt", %[[K]], %[[a]] : index
+//  TILE-234-NEXT:    %[[uba:.*]] = select %[[cmpuba]], %[[K]], %[[a]] : index
+//  TILE-234-NEXT:    %[[ra:.*]] = linalg.range %i0:%[[uba]]:%c1 : !linalg.range
+//  TILE-234-NEXT:    %[[sA:.*]] = linalg.slice %arg0[%[[ra]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
+// Second operand: the same %i0 tile, with the upper bound clamped to dim(%arg1, 0) via cmpi + select.
 //  TILE-234-NEXT:    %[[b:.*]] = affine.apply #[[UB0]](%i0)
-//  TILE-234-NEXT:    %[[rb0:.*]] = linalg.range %i0:%[[b]]:%c2 : !linalg.range
-//  TILE-234-NEXT:    %[[rb:.*]] = linalg.range_intersect %{{.}}, %[[rb0]] : !linalg.range
-//  TILE-234-NEXT:    %[[sB:.*]] = linalg.slice %[[B]][%[[rb]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
-//  TILE-234-NEXT:    linalg.dot(%[[sA]], %[[sB]], %[[C]]) : !linalg.view<?xf32>, !linalg.view<?xf32>, !linalg.view<f32>
+//  TILE-234-NEXT:    %[[K:.*]] = linalg.dim %arg1, 0 : !linalg.view<?xf32>
+//  TILE-234-NEXT:    %[[cmpubb:.*]] = cmpi "slt", %[[K]], %[[b]] : index
+//  TILE-234-NEXT:    %[[ubb:.*]] = select %[[cmpubb]], %[[K]], %[[b]] : index
+//  TILE-234-NEXT:    %[[rb:.*]] = linalg.range %i0:%[[ubb]]:%c1 : !linalg.range
+//  TILE-234-NEXT:    %[[sB:.*]] = linalg.slice %arg1[%[[rb]]] : !linalg.view<?xf32>, !linalg.range, !linalg.view<?xf32>
+// The output view %arg2 is expected to reach the tiled dot unsliced.
+//  TILE-234-NEXT:    linalg.dot(%[[sA]], %[[sB]], %arg2) : !linalg.view<?xf32>, !linalg.view<?xf32>, !linalg.view<f32>