Extend loop unrolling and unroll-jamming to non-matching bound operands and
multi-result upper bounds, complete TODOs, fix/improve test cases.

- complete TODOs for loop unroll/unroll-and-jam. Something as simple as
  "for %i = 0 to %N" wasn't being unrolled earlier (unless it had been written
  as "for %i = ()[s0] -> (0)()[%N] to %N"); addressed now.

- update/replace getTripCountExpr with buildTripCountMapAndOperands; makes it
  more powerful as it composes inputs into it

- getCleanupLoopLowerBound and getUnrolledLoopUpperBound actually needed the
  same code; refactor and remove one.

- reorganize test cases, write previous ones better; most of these changes are
  "label replacements".

- fix wrongly labeled test cases in unroll-jam.mlir

PiperOrigin-RevId: 238014653
diff --git a/include/mlir/Analysis/LoopAnalysis.h b/include/mlir/Analysis/LoopAnalysis.h
index e15fffc..7b4142b 100644
--- a/include/mlir/Analysis/LoopAnalysis.h
+++ b/include/mlir/Analysis/LoopAnalysis.h
@@ -36,10 +36,17 @@
 class MemRefType;
 class Value;
 
-/// Returns the trip count of the loop as an affine expression if the latter is
-/// expressible as an affine expression, and nullptr otherwise. The trip count
-/// expression is simplified before returning.
-AffineExpr getTripCountExpr(ConstOpPointer<AffineForOp> forOp);
+/// Computes the trip count of the loop as an affine map with its corresponding
+/// operands when it is expressible as an affine expression; sets 'map' to a
+/// null map otherwise. This method always succeeds as long as the lower bound
+/// is not a multi-result map. The trip count expression is simplified before
+/// returning. This method only utilizes map composition to construct lower and
+/// upper bounds before computing the trip count expressions.
+// TODO(mlir-team): this should be moved into 'Transforms/' and be replaced by a
+// pure analysis method relying on FlatAffineConstraints
+void buildTripCountMapAndOperands(ConstOpPointer<AffineForOp> forOp,
+                                  AffineMap *map,
+                                  SmallVectorImpl<Value *> *operands);
 
 /// Returns the trip count of the loop if it's a constant, None otherwise. This
 /// uses affine expression analysis and is able to determine constant trip count
diff --git a/include/mlir/Transforms/LoopUtils.h b/include/mlir/Transforms/LoopUtils.h
index ffab79d..8a8c488 100644
--- a/include/mlir/Transforms/LoopUtils.h
+++ b/include/mlir/Transforms/LoopUtils.h
@@ -34,6 +34,7 @@
 class Function;
 class FuncBuilder;
 template <typename T> class OpPointer;
+class Value;
 
 /// Unrolls this for instruction completely if the trip count is known to be
 /// constant. Returns failure otherwise.
@@ -66,16 +67,15 @@
 /// their body into the containing Block.
 void promoteSingleIterationLoops(Function *f);
 
-/// Returns the lower bound of the cleanup loop when unrolling a loop
-/// with the specified unroll factor.
-AffineMap getCleanupLoopLowerBound(ConstOpPointer<AffineForOp> forOp,
-                                   unsigned unrollFactor, FuncBuilder *builder);
-
-/// Returns the upper bound of an unrolled loop when unrolling with
-/// the specified trip count, stride, and unroll factor.
-AffineMap getUnrolledLoopUpperBound(ConstOpPointer<AffineForOp> forOp,
-                                    unsigned unrollFactor,
-                                    FuncBuilder *builder);
+/// Computes the cleanup loop lower bound of the loop being unrolled with
+/// the specified unroll factor; this bound will also be upper bound of the main
+/// part of the unrolled loop. Computes the bound as an AffineMap with its
+/// operands or a null map when the trip count can't be expressed as an affine
+/// expression.
+void getCleanupLoopLowerBound(ConstOpPointer<AffineForOp> forOp,
+                              unsigned unrollFactor, AffineMap *map,
+                              SmallVectorImpl<Value *> *operands,
+                              FuncBuilder *builder);
 
 /// Skew the instructions in the body of a 'for' instruction with the specified
 /// instruction-wise shifts. The shifts are with respect to the original
diff --git a/lib/Analysis/LoopAnalysis.cpp b/lib/Analysis/LoopAnalysis.cpp
index 96ba195..bc48f96 100644
--- a/lib/Analysis/LoopAnalysis.cpp
+++ b/lib/Analysis/LoopAnalysis.cpp
@@ -26,6 +26,7 @@
 #include "mlir/Analysis/AffineStructures.h"
 #include "mlir/Analysis/NestedMatcher.h"
 #include "mlir/Analysis/VectorAnalysis.h"
+#include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Instruction.h"
 #include "mlir/StandardOps/Ops.h"
@@ -41,88 +42,141 @@
 
 /// Returns the trip count of the loop as an affine expression if the latter is
 /// expressible as an affine expression, and nullptr otherwise. The trip count
-/// expression is simplified before returning.
-AffineExpr mlir::getTripCountExpr(ConstOpPointer<AffineForOp> forOp) {
-  // upper_bound - lower_bound
+/// expression is simplified before returning. This method only utilizes map
+/// composition to construct lower and upper bounds before computing the trip
+/// count expressions.
+// TODO(mlir-team): this should be moved into 'Transforms/' and be replaced by a
+// pure analysis method relying on FlatAffineConstraints; the latter will also
+// be more powerful (since both inequalities and equalities will be considered).
+void mlir::buildTripCountMapAndOperands(
+    ConstOpPointer<AffineForOp> forOp, AffineMap *map,
+    SmallVectorImpl<Value *> *tripCountOperands) {
   int64_t loopSpan;
 
   int64_t step = forOp->getStep();
-  auto *context = forOp->getInstruction()->getContext();
+
+  // We need to get operands; we aren't changing them here.
+  auto ncForOp = *reinterpret_cast<OpPointer<AffineForOp> *>(&forOp);
+
+  FuncBuilder b(ncForOp->getInstruction());
 
   if (forOp->hasConstantBounds()) {
     int64_t lb = forOp->getConstantLowerBound();
     int64_t ub = forOp->getConstantUpperBound();
     loopSpan = ub - lb;
-  } else {
-    auto lbMap = forOp->getLowerBoundMap();
-    auto ubMap = forOp->getUpperBoundMap();
-    // TODO(bondhugula): handle max/min of multiple expressions.
-    if (lbMap.getNumResults() != 1 || ubMap.getNumResults() != 1)
-      return nullptr;
-
-    // TODO(bondhugula): handle bounds with different operands.
-    // Bounds have different operands, unhandled for now.
-    if (!forOp->matchingBoundOperandList())
-      return nullptr;
-
-    // ub_expr - lb_expr
-    AffineExpr lbExpr(lbMap.getResult(0));
-    AffineExpr ubExpr(ubMap.getResult(0));
-    auto loopSpanExpr = simplifyAffineExpr(
-        ubExpr - lbExpr, std::max(lbMap.getNumDims(), ubMap.getNumDims()),
-        std::max(lbMap.getNumSymbols(), ubMap.getNumSymbols()));
-    auto cExpr = loopSpanExpr.dyn_cast<AffineConstantExpr>();
-    if (!cExpr)
-      return loopSpanExpr.ceilDiv(step);
-    loopSpan = cExpr.getValue();
+    if (loopSpan < 0)
+      loopSpan = 0;
+    *map = b.getConstantAffineMap(ceilDiv(loopSpan, step));
+    tripCountOperands->clear();
+    return;
   }
+  auto lbMap = forOp->getLowerBoundMap();
+  auto ubMap = forOp->getUpperBoundMap();
+  if (lbMap.getNumResults() != 1) {
+    *map = AffineMap();
+    return;
+  }
+  SmallVector<Value *, 4> lbOperands(ncForOp->getLowerBoundOperands());
+  SmallVector<Value *, 4> ubOperands(ncForOp->getUpperBoundOperands());
+  auto lb = b.create<AffineApplyOp>(forOp->getLoc(), lbMap, lbOperands);
+  SmallVector<Value *, 4> ubs;
+  ubs.reserve(ubMap.getNumResults());
+  for (auto ubExpr : ubMap.getResults())
+    ubs.push_back(b.create<AffineApplyOp>(
+        forOp->getLoc(),
+        b.getAffineMap(ubMap.getNumDims(), ubMap.getNumSymbols(), {ubExpr}, {}),
+        ubOperands));
 
-  // 0 iteration loops.
-  if (loopSpan < 0)
-    return 0;
+  tripCountOperands->clear();
+  tripCountOperands->reserve(1 + ubs.size());
+  tripCountOperands->push_back(lb);
+  tripCountOperands->append(ubs.begin(), ubs.end());
 
-  return getAffineConstantExpr(static_cast<uint64_t>(ceilDiv(loopSpan, step)),
-                               context);
+  SmallVector<AffineExpr, 4> tripCountExprs(ubs.size());
+  for (unsigned i = 0, e = ubs.size(); i < e; i++)
+    tripCountExprs[i] =
+        (b.getAffineDimExpr(1 + i) - b.getAffineDimExpr(0)).ceilDiv(step);
+  *map = b.getAffineMap(1 + ubs.size(), 0, tripCountExprs, {});
+  // Compose the bound operands into the map, then simplify and canonicalize.
+  fullyComposeAffineMapAndOperands(map, tripCountOperands);
+  *map = simplifyAffineMap(*map);
+  canonicalizeMapAndOperands(map, tripCountOperands);
+  // Remove any affine.apply's that became dead as a result of composition,
+  // simplification, and canonicalization above.
+  for (auto *v : ubs)
+    if (v->use_empty())
+      v->getDefiningInst()->erase();
+  if (lb->use_empty())
+    lb->erase();
 }
 
 /// Returns the trip count of the loop if it's a constant, None otherwise. This
 /// method uses affine expression analysis (in turn using getTripCount) and is
 /// able to determine constant trip count in non-trivial cases.
+// FIXME(mlir-team): this is really relying on buildTripCountMapAndOperands;
+// being an analysis utility, it shouldn't. Replace with a version that just
+// works with analysis structures (FlatAffineConstraints) and thus doesn't
+// update the IR.
 llvm::Optional<uint64_t>
 mlir::getConstantTripCount(ConstOpPointer<AffineForOp> forOp) {
-  auto tripCountExpr = getTripCountExpr(forOp);
+  SmallVector<Value *, 4> operands;
+  AffineMap map;
+  buildTripCountMapAndOperands(forOp, &map, &operands);
 
-  if (!tripCountExpr)
+  if (!map)
     return None;
 
-  if (auto constExpr = tripCountExpr.dyn_cast<AffineConstantExpr>())
-    return constExpr.getValue();
-
-  return None;
+  // Take the min if all trip counts are constant.
+  Optional<uint64_t> tripCount;
+  for (auto resultExpr : map.getResults()) {
+    if (auto constExpr = resultExpr.dyn_cast<AffineConstantExpr>()) {
+      if (tripCount.hasValue())
+        tripCount = std::min(tripCount.getValue(),
+                             static_cast<uint64_t>(constExpr.getValue()));
+      else
+        tripCount = constExpr.getValue();
+    } else
+      return None;
+  }
+  return tripCount;
 }
 
 /// Returns the greatest known integral divisor of the trip count. Affine
 /// expression analysis is used (indirectly through getTripCount), and
 /// this method is thus able to determine non-trivial divisors.
 uint64_t mlir::getLargestDivisorOfTripCount(ConstOpPointer<AffineForOp> forOp) {
-  auto tripCountExpr = getTripCountExpr(forOp);
+  SmallVector<Value *, 4> operands;
+  AffineMap map;
+  buildTripCountMapAndOperands(forOp, &map, &operands);
 
-  if (!tripCountExpr)
+  if (!map)
     return 1;
 
-  if (auto constExpr = tripCountExpr.dyn_cast<AffineConstantExpr>()) {
-    uint64_t tripCount = constExpr.getValue();
-
-    // 0 iteration loops (greatest divisor is 2^64 - 1).
-    if (tripCount == 0)
-      return ULONG_MAX;
-
-    // The greatest divisor is the trip count.
-    return tripCount;
+  // The largest divisor of the trip count is the GCD of the individual largest
+  // divisors.
+  assert(map.getNumResults() >= 1 && "expected one or more results");
+  Optional<uint64_t> gcd;
+  for (auto resultExpr : map.getResults()) {
+    uint64_t thisGcd;
+    if (auto constExpr = resultExpr.dyn_cast<AffineConstantExpr>()) {
+      uint64_t tripCount = constExpr.getValue();
+      // 0 iteration loops (greatest divisor is 2^64 - 1).
+      if (tripCount == 0)
+        thisGcd = std::numeric_limits<uint64_t>::max();
+      else
+        // The greatest divisor is the trip count.
+        thisGcd = tripCount;
+    } else {
+      // Trip count is not a known constant; return its largest known divisor.
+      thisGcd = resultExpr.getLargestKnownDivisor();
+    }
+    if (gcd.hasValue())
+      gcd = llvm::GreatestCommonDivisor64(gcd.getValue(), thisGcd);
+    else
+      gcd = thisGcd;
   }
-
-  // Trip count is not a known constant; return its largest known divisor.
-  return tripCountExpr.getLargestKnownDivisor();
+  assert(gcd.hasValue() && "value expected per above logic");
+  return gcd.getValue();
 }
 
 bool mlir::isAccessInvariant(const Value &iv, const Value &index) {
diff --git a/lib/Transforms/LoopUnrollAndJam.cpp b/lib/Transforms/LoopUnrollAndJam.cpp
index 6e4b932..f1cc7c6 100644
--- a/lib/Transforms/LoopUnrollAndJam.cpp
+++ b/lib/Transforms/LoopUnrollAndJam.cpp
@@ -152,32 +152,23 @@
 
   assert(unrollJamFactor >= 1 && "unroll jam factor should be >= 1");
 
-  if (unrollJamFactor == 1 || forOp->getBody()->empty())
+  if (unrollJamFactor == 1)
+    return promoteIfSingleIteration(forOp);
+
+  if (forOp->getBody()->empty())
+    return failure();
+
+  // Loops where both lower and upper bounds are multi-result maps won't be
+  // unrolled (since the trip count can't be expressed as an affine function
+  // in general).
+  // TODO(mlir-team): this may not be common, but we could support the case
+  // where the lower bound is a multi-result map and the ub is a single result
+  // one.
+  if (forOp->getLowerBoundMap().getNumResults() != 1)
     return failure();
 
   Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
-
-  if (!mayBeConstantTripCount.hasValue() &&
-      getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0)
-    return failure();
-
-  auto lbMap = forOp->getLowerBoundMap();
-  auto ubMap = forOp->getUpperBoundMap();
-
-  // Loops with max/min expressions won't be unrolled here (the output can't be
-  // expressed as a Function in the general case). However, the right way to
-  // do such unrolling for a Function would be to specialize the loop for the
-  // 'hotspot' case and unroll that hotspot.
-  if (lbMap.getNumResults() != 1 || ubMap.getNumResults() != 1)
-    return failure();
-
-  // Same operand list for lower and upper bound for now.
-  // TODO(bondhugula): handle bounds with different sets of operands.
-  if (!forOp->matchingBoundOperandList())
-    return failure();
-
   // If the trip count is lower than the unroll jam factor, no unroll jam.
-  // TODO(bondhugula): option to specify cleanup loop unrolling.
   if (mayBeConstantTripCount.hasValue() &&
       mayBeConstantTripCount.getValue() < unrollJamFactor)
     return failure();
@@ -191,21 +182,25 @@
 
   // Generate the cleanup loop if trip count isn't a multiple of
   // unrollJamFactor.
-  if (mayBeConstantTripCount.hasValue() &&
-      mayBeConstantTripCount.getValue() % unrollJamFactor != 0) {
+  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
     // Insert the cleanup loop right after 'forOp'.
     FuncBuilder builder(forInst->getBlock(),
                         std::next(Block::iterator(forInst)));
     auto cleanupAffineForOp = builder.clone(*forInst)->cast<AffineForOp>();
-    cleanupAffineForOp->setLowerBoundMap(
-        getCleanupLoopLowerBound(forOp, unrollJamFactor, &builder));
+    // Adjust the lower bound of the cleanup loop; its upper bound is the same
+    // as the original loop's upper bound.
+    AffineMap cleanupMap;
+    SmallVector<Value *, 4> cleanupOperands;
+    getCleanupLoopLowerBound(forOp, unrollJamFactor, &cleanupMap,
+                             &cleanupOperands, &builder);
+    cleanupAffineForOp->setLowerBound(cleanupOperands, cleanupMap);
 
-    // The upper bound needs to be adjusted.
-    forOp->setUpperBoundMap(
-        getUnrolledLoopUpperBound(forOp, unrollJamFactor, &builder));
-
-    // Promote the loop body up if this has turned into a single iteration loop.
+    // Promote the cleanup loop if it has turned into a single iteration loop.
     promoteIfSingleIteration(cleanupAffineForOp);
+
+    // Adjust the upper bound of the original loop - it will be the same as the
+    // cleanup loop's lower bound. Its lower bound remains unchanged.
+    forOp->setUpperBound(cleanupOperands, cleanupMap);
   }
 
   // Scale the step of loop being unroll-jammed by the unroll-jam factor.
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 99ea0b8..9588d0c 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -38,54 +38,78 @@
 
 using namespace mlir;
 
-/// Returns the upper bound of an unrolled loop with lower bound 'lb' and with
-/// the specified trip count, stride, and unroll factor. Returns nullptr when
-/// the trip count can't be expressed as an affine expression.
-AffineMap mlir::getUnrolledLoopUpperBound(ConstOpPointer<AffineForOp> forOp,
-                                          unsigned unrollFactor,
-                                          FuncBuilder *builder) {
+/// Computes the cleanup loop lower bound of the loop being unrolled with
+/// the specified unroll factor; this bound will also be upper bound of the main
+/// part of the unrolled loop. Computes the bound as an AffineMap with its
+/// operands or a null map when the trip count can't be expressed as an affine
+/// expression.
+void mlir::getCleanupLoopLowerBound(ConstOpPointer<AffineForOp> forOp,
+                                    unsigned unrollFactor, AffineMap *map,
+                                    SmallVectorImpl<Value *> *operands,
+                                    FuncBuilder *b) {
   auto lbMap = forOp->getLowerBoundMap();
 
   // Single result lower bound map only.
-  if (lbMap.getNumResults() != 1)
-    return AffineMap();
+  if (lbMap.getNumResults() != 1) {
+    *map = AffineMap();
+    return;
+  }
 
-  // Sometimes, the trip count cannot be expressed as an affine expression.
-  auto tripCount = getTripCountExpr(forOp);
-  if (!tripCount)
-    return AffineMap();
-
-  AffineExpr lb(lbMap.getResult(0));
-  unsigned step = forOp->getStep();
-  auto newUb = lb + (tripCount - tripCount % unrollFactor - 1) * step;
-
-  return builder->getAffineMap(lbMap.getNumDims(), lbMap.getNumSymbols(),
-                               {newUb}, {});
-}
-
-/// Returns the lower bound of the cleanup loop when unrolling a loop with lower
-/// bound 'lb' and with the specified trip count, stride, and unroll factor.
-/// Returns an AffinMap with nullptr storage (that evaluates to false)
-/// when the trip count can't be expressed as an affine expression.
-AffineMap mlir::getCleanupLoopLowerBound(ConstOpPointer<AffineForOp> forOp,
-                                         unsigned unrollFactor,
-                                         FuncBuilder *builder) {
-  auto lbMap = forOp->getLowerBoundMap();
-
-  // Single result lower bound map only.
-  if (lbMap.getNumResults() != 1)
-    return AffineMap();
+  AffineMap tripCountMap;
+  SmallVector<Value *, 4> tripCountOperands;
+  buildTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
 
   // Sometimes the trip count cannot be expressed as an affine expression.
-  AffineExpr tripCount(getTripCountExpr(forOp));
-  if (!tripCount)
-    return AffineMap();
+  if (!tripCountMap) {
+    *map = AffineMap();
+    return;
+  }
 
-  AffineExpr lb(lbMap.getResult(0));
   unsigned step = forOp->getStep();
-  auto newLb = lb + (tripCount - tripCount % unrollFactor) * step;
-  return builder->getAffineMap(lbMap.getNumDims(), lbMap.getNumSymbols(),
-                               {newLb}, {});
+
+  // We need to get non-const operands; we aren't changing them here.
+  auto ncForOp = *reinterpret_cast<OpPointer<AffineForOp> *>(&forOp);
+
+  SmallVector<Value *, 4> lbOperands(ncForOp->getLowerBoundOperands());
+  auto lb = b->create<AffineApplyOp>(ncForOp->getLoc(), lbMap, lbOperands);
+
+  // For each upper bound expr, get the range.
+  // Eg: for %i = lb to min (ub1, ub2),
+  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
+  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
+  // these affine.apply's make up the cleanup loop lower bound.
+  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
+  SmallVector<Value *, 4> bumpValues(tripCountMap.getNumResults());
+  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
+    auto tripCountExpr = tripCountMap.getResult(i);
+    bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
+    auto bumpMap =
+        b->getAffineMap(tripCountMap.getNumDims(), tripCountMap.getNumSymbols(),
+                        bumpExprs[i], {});
+    bumpValues[i] =
+        b->create<AffineApplyOp>(forOp->getLoc(), bumpMap, tripCountOperands);
+  }
+
+  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
+  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
+    newUbExprs[i] = b->getAffineDimExpr(0) + b->getAffineDimExpr(i + 1);
+
+  operands->clear();
+  operands->push_back(lb);
+  operands->append(bumpValues.begin(), bumpValues.end());
+  *map = b->getAffineMap(1 + tripCountMap.getNumResults(), 0, newUbExprs, {});
+  // Simplify the map + operands.
+  fullyComposeAffineMapAndOperands(map, operands);
+  *map = simplifyAffineMap(*map);
+  canonicalizeMapAndOperands(map, operands);
+  // Remove any affine.apply's that became dead from the simplification above.
+  for (auto *v : bumpValues) {
+    if (v->use_empty()) {
+      v->getDefiningInst()->erase();
+    }
+  }
+  if (lb->use_empty())
+    lb->erase();
 }
 
 /// Promotes the loop body of a forOp to its containing block if the forOp
@@ -369,25 +393,17 @@
   if (forOp->getBody()->empty())
     return failure();
 
-  auto lbMap = forOp->getLowerBoundMap();
-  auto ubMap = forOp->getUpperBoundMap();
-
-  // Loops with max/min expressions won't be unrolled here (the output can't be
-  // expressed as a Function in the general case). However, the right way to
-  // do such unrolling for a Function would be to specialize the loop for the
-  // 'hotspot' case and unroll that hotspot.
-  if (lbMap.getNumResults() != 1 || ubMap.getNumResults() != 1)
+  // Loops where the lower bound is a max expression aren't supported for
+  // unrolling since the trip count can't be expressed as an affine function
+  // when both the lower bound and the upper bound are multi-result maps.
+  // However, one meaningful way to do such unrolling would be to specialize
+  // the loop for the 'hotspot' case and unroll that hotspot.
+  if (forOp->getLowerBoundMap().getNumResults() != 1)
     return failure();
 
-  // Same operand list for lower and upper bound for now.
-  // TODO(bondhugula): handle bounds with different operand lists.
-  if (!forOp->matchingBoundOperandList())
-    return failure();
-
-  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
-
   // If the trip count is lower than the unroll factor, no unrolled body.
   // TODO(bondhugula): option to specify cleanup loop unrolling.
+  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   if (mayBeConstantTripCount.hasValue() &&
       mayBeConstantTripCount.getValue() < unrollFactor)
     return failure();
@@ -397,21 +413,20 @@
   if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
     FuncBuilder builder(forInst->getBlock(), ++Block::iterator(forInst));
     auto cleanupForInst = builder.clone(*forInst)->cast<AffineForOp>();
-    auto clLbMap = getCleanupLoopLowerBound(forOp, unrollFactor, &builder);
-    assert(clLbMap &&
-           "cleanup loop lower bound map for single result bound maps can "
-           "always be determined");
-    cleanupForInst->setLowerBoundMap(clLbMap);
+    AffineMap cleanupMap;
+    SmallVector<Value *, 4> cleanupOperands;
+    getCleanupLoopLowerBound(forOp, unrollFactor, &cleanupMap, &cleanupOperands,
+                             &builder);
+    assert(cleanupMap &&
+           "cleanup loop lower bound map for single result lower bound maps "
+           "can always be determined");
+    cleanupForInst->setLowerBound(cleanupOperands, cleanupMap);
     // Promote the loop body up if this has turned into a single iteration loop.
     promoteIfSingleIteration(cleanupForInst);
 
-    // Adjust upper bound.
-    auto unrolledUbMap =
-        getUnrolledLoopUpperBound(forOp, unrollFactor, &builder);
-    assert(unrolledUbMap &&
-           "upper bound map can alwayys be determined for an unrolled loop "
-           "with single result bounds");
-    forOp->setUpperBoundMap(unrolledUbMap);
+    // Adjust upper bound of the original loop; this is the same as the lower
+    // bound of the cleanup loop.
+    forOp->setUpperBound(cleanupOperands, cleanupMap);
   }
 
   // Scale the step of loop being unrolled by unroll factor.
diff --git a/test/Transforms/unroll-jam.mlir b/test/Transforms/unroll-jam.mlir
index da4f965..b872cb6 100644
--- a/test/Transforms/unroll-jam.mlir
+++ b/test/Transforms/unroll-jam.mlir
@@ -1,20 +1,21 @@
 // RUN: mlir-opt %s -loop-unroll-jam -unroll-jam-factor=2 | FileCheck %s
 
-// CHECK: [[MAP_PLUS_1:#map[0-9]+]] = (d0) -> (d0 + 1)
-// This should be matched to M1, but M1 is defined later.
-// CHECK: {{#map[0-9]+}} = ()[s0] -> (s0 + 8)
+// CHECK-DAG: [[MAP_PLUS_1:#map[0-9]+]] = (d0) -> (d0 + 1)
+// CHECK-DAG: [[M1:#map[0-9]+]] = ()[s0] -> (s0 + 8)
+// CHECK-DAG: [[MAP_DIV_OFFSET:#map[0-9]+]] = ()[s0] -> (((s0 - 1) floordiv 2) * 2 + 1)
+// CHECK-DAG: [[MAP_MULTI_RES:#map[0-9]+]] = ()[s0, s1] -> ((s0 floordiv 2) * 2, (s1 floordiv 2) * 2, 1024)
 
 // CHECK-LABEL: func @unroll_jam_imperfect_nest() {
 func @unroll_jam_imperfect_nest() {
   // CHECK: %c100 = constant 100 : index
-  // CHECK-NEXT: for %i0 = 0 to 99 step 2 {
+  // CHECK-NEXT: for %i0 = 0 to 100 step 2 {
   for %i = 0 to 101 {
     // CHECK: %0 = "addi32"(%i0, %i0) : (index, index) -> i32
     // CHECK-NEXT: %1 = affine.apply [[MAP_PLUS_1]](%i0)
     // CHECK-NEXT: %2 = "addi32"(%1, %1) : (index, index) -> i32
     %x = "addi32"(%i, %i) : (index, index) -> i32
     for %j = 0 to 17 {
-      // CHECK: %3 = "addi32"(%i0, %i0) : (index, index) -> i32
+      // CHECK:      %3 = "addi32"(%i0, %i0) : (index, index) -> i32
       // CHECK-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32
       // CHECK-NEXT: %5 = affine.apply [[MAP_PLUS_1]](%i0)
       // CHECK-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
@@ -30,31 +31,28 @@
   // cleanup loop (single iteration)
   // CHECK: %11 = "addi32"(%c100, %c100) : (index, index) -> i32
   // CHECK-NEXT: for %i2 = 0 to 17 {
-    // CHECK-NEXT: %12 = "addi32"(%c100, %c100) : (index, index) -> i32
-    // CHECK-NEXT: %13 = "addi32"(%12, %12) : (i32, i32) -> i32
+  // CHECK-NEXT:   %12 = "addi32"(%c100, %c100) : (index, index) -> i32
+  // CHECK-NEXT:   %13 = "addi32"(%12, %12) : (i32, i32) -> i32
   // CHECK-NEXT: }
   // CHECK-NEXT: %14 = "addi32"(%c100, %c100) : (index, index) -> i32
   return
 }
 
-// UNROLL-BY-4-LABEL: func @loop_nest_unknown_count_1(%arg0: index) {
+// CHECK-LABEL: func @loop_nest_unknown_count_1(%arg0: index) {
 func @loop_nest_unknown_count_1(%N : index) {
-  // UNROLL-BY-4-NEXT: for %i0 = 1 to  #map{{[0-9]+}}()[%arg0] step 4 {
-    // UNROLL-BY-4-NEXT: for %i1 = 1 to 100 {
-      // UNROLL-BY-4-NEXT: %0 = "foo"() : () -> i32
-      // UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
-      // UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
-      // UNROLL-BY-4-NEXT: %3 = "foo"() : () -> i32
-    // UNROLL-BY-4-NEXT: }
-  // UNROLL-BY-4-NEXT: }
+  // CHECK-NEXT: for %i0 = 1 to [[MAP_DIV_OFFSET]]()[%arg0] step 2 {
+  // CHECK-NEXT:   for %i1 = 1 to 100 {
+  // CHECK-NEXT:     %0 = "foo"() : () -> i32
+  // CHECK-NEXT:     %1 = "foo"() : () -> i32
+  // CHECK-NEXT:   }
+  // CHECK-NEXT: }
   // A cleanup loop should be generated here.
-  // UNROLL-BY-4-NEXT: for %i2 = #map{{[0-9]+}}()[%arg0] to %arg0 {
-    // UNROLL-BY-4-NEXT: for %i3 = 1 to 100 {
-      // UNROLL-BY-4-NEXT: %4 = "foo"() : () -> i32
-    // UNROLL-BY-4_NEXT: }
-  // UNROLL-BY-4_NEXT: }
-  // Specify the lower bound in a form so that both lb and ub operands match.
-  for %i = ()[s0] -> (1)()[%N] to %N {
+  // CHECK-NEXT: for %i2 = [[MAP_DIV_OFFSET]]()[%arg0] to %arg0 {
+  // CHECK-NEXT:   for %i3 = 1 to 100 {
+  // CHECK-NEXT:     %2 = "foo"() : () -> i32
+  // CHECK-NEXT:   }
+  // CHECK-NEXT: }
+  for %i = 1 to %N {
     for %j = 1 to 100 {
       %x = "foo"() : () -> i32
     }
@@ -62,29 +60,47 @@
   return
 }
 
-// UNROLL-BY-4-LABEL: func @loop_nest_unknown_count_2(%arg0: index) {
+// CHECK-LABEL: func @loop_nest_unknown_count_2(%arg0: index) {
 func @loop_nest_unknown_count_2(%arg : index) {
-  // UNROLL-BY-4-NEXT: for %i0 = %arg0 to  #map{{[0-9]+}}()[%arg0] step 4 {
-    // UNROLL-BY-4-NEXT: for %i1 = 1 to 100 {
-      // UNROLL-BY-4-NEXT: %0 = "foo"(%i0) : (index) -> i32
-      // UNROLL-BY-4-NEXT: %1 = affine.apply #map{{[0-9]+}}(%i0)
-      // UNROLL-BY-4-NEXT: %2 = "foo"(%1) : (index) -> i32
-      // UNROLL-BY-4-NEXT: %3 = affine.apply #map{{[0-9]+}}(%i0)
-      // UNROLL-BY-4-NEXT: %4 = "foo"(%3) : (index) -> i32
-      // UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]+}}(%i0)
-      // UNROLL-BY-4-NEXT: %6 = "foo"(%5) : (index) -> i32
-    // UNROLL-BY-4-NEXT: }
-  // UNROLL-BY-4-NEXT: }
+  // CHECK-NEXT: for %i0 = %arg0 to  [[M1]]()[%arg0] step 2 {
+  // CHECK-NEXT:   for %i1 = 1 to 100 {
+  // CHECK-NEXT:     %0 = "foo"(%i0) : (index) -> i32
+  // CHECK-NEXT:     %1 = affine.apply #map{{[0-9]+}}(%i0)
+  // CHECK-NEXT:     %2 = "foo"(%1) : (index) -> i32
+  // CHECK-NEXT:   }
+  // CHECK-NEXT: }
   // The cleanup loop is a single iteration one and is promoted.
-  // UNROLL-BY-4-NEXT: %7 = affine.apply [[M1:#map{{[0-9]+}}]]()[%arg0]
-  // UNROLL-BY-4-NEXT: for %i3 = 1 to 100 {
-    // UNROLL-BY-4-NEXT: %8 = "foo"() : () -> i32
-  // UNROLL-BY-4_NEXT: }
-  // Specify the lower bound in a form so that both lb and ub operands match.
-  for %i = ()[s0] -> (s0) ()[%arg] to ()[s0] -> (s0+8) ()[%arg] {
+  // CHECK-NEXT: %3 = affine.apply [[M1]]()[%arg0]
+  // CHECK-NEXT: for %i2 = 1 to 100 {
+  // CHECK-NEXT:   %4 = "foo"(%3) : (index) -> i32
+  // CHECK-NEXT: }
+  for %i = %arg to ()[s0] -> (s0+9) ()[%arg] {
     for %j = 1 to 100 {
       %x = "foo"(%i) : (index) -> i32
     }
   }
   return
 }
+
+// CHECK-LABEL: func @loop_nest_symbolic_and_min_upper_bound
+func @loop_nest_symbolic_and_min_upper_bound(%M : index, %N : index, %K : index) {
+  for %i = 0 to min ()[s0, s1] -> (s0, s1, 1024)()[%M, %N] {
+    for %j = 0 to %K {
+      "foo"(%i, %j) : (index, index) -> ()
+    }
+  }
+  return
+}
+// CHECK-NEXT:  for %i0 = 0 to min [[MAP_MULTI_RES]]()[%arg0, %arg1] step 2 {
+// CHECK-NEXT:    for %i1 = 0 to %arg2 {
+// CHECK-NEXT:      "foo"(%i0, %i1) : (index, index) -> ()
+// CHECK-NEXT:      %0 = affine.apply [[MAP_PLUS_1]](%i0)
+// CHECK-NEXT:      "foo"(%0, %i1) : (index, index) -> ()
+// CHECK-NEXT:    }
+// CHECK-NEXT:  }
+// CHECK-NEXT:  for %i2 = max [[MAP_MULTI_RES]]()[%arg0, %arg1] to min #map{{[0-9]+}}()[%arg0, %arg1] {
+// CHECK-NEXT:    for %i3 = 0 to %arg2 {
+// CHECK-NEXT:      "foo"(%i2, %i3) : (index, index) -> ()
+// CHECK-NEXT:    }
+// CHECK-NEXT:  }
+// CHECK-NEXT:  return
diff --git a/test/Transforms/unroll.mlir b/test/Transforms/unroll.mlir
index 17a0c19..4fef9cf 100644
--- a/test/Transforms/unroll.mlir
+++ b/test/Transforms/unroll.mlir
@@ -1,253 +1,244 @@
-// RUN: mlir-opt %s -loop-unroll -unroll-full | FileCheck %s
+// RUN: mlir-opt %s -loop-unroll -unroll-full | FileCheck %s --check-prefix UNROLL-FULL
 // RUN: mlir-opt %s -loop-unroll -unroll-full -unroll-full-threshold=2 | FileCheck %s --check-prefix SHORT
 // RUN: mlir-opt %s -loop-unroll -unroll-factor=4 | FileCheck %s --check-prefix UNROLL-BY-4
 // RUN: mlir-opt %s -loop-unroll -unroll-factor=1 | FileCheck %s --check-prefix UNROLL-BY-1
 
-// CHECK: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
-// CHECK: [[MAP1:#map[0-9]+]] = (d0) -> (d0 + 2)
-// CHECK: [[MAP2:#map[0-9]+]] = (d0) -> (d0 + 3)
-// CHECK: [[MAP3:#map[0-9]+]] = (d0) -> (d0 + 4)
-// CHECK: [[MAP4:#map[0-9]+]] = (d0, d1) -> (d0 + 1)
-// CHECK: [[MAP5:#map[0-9]+]] = (d0, d1) -> (d0 + 3)
-// CHECK: [[MAP6:#map[0-9]+]] = (d0)[s0] -> (d0 + s0 + 1)
-// CHECK: [[MAP7:#map[0-9]+]] = (d0) -> (d0 + 5)
-// CHECK: [[MAP8:#map[0-9]+]] = (d0) -> (d0 + 6)
-// CHECK: [[MAP9:#map[0-9]+]] = (d0) -> (d0 + 7)
-// CHECK: [[MAP10:#map[0-9]+]] = (d0, d1) -> (d0 * 16 + d1)
-// CHECK: [[MAP11:#map[0-9]+]] = (d0) -> (d0 + 8)
-// CHECK: [[MAP12:#map[0-9]+]] = (d0) -> (d0 + 9)
-// CHECK: [[MAP13:#map[0-9]+]] = (d0) -> (d0 + 10)
-// CHECK: [[MAP14:#map[0-9]+]] = (d0) -> (d0 + 15)
-// CHECK: [[MAP15:#map[0-9]+]] = (d0) -> (d0 + 20)
-// CHECK: [[MAP16:#map[0-9]+]] = (d0) -> (d0 + 25)
-// CHECK: [[MAP17:#map[0-9]+]] = (d0) -> (d0 + 30)
-// CHECK: [[MAP18:#map[0-9]+]] = (d0) -> (d0 + 35)
+// UNROLL-FULL-DAG: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
+// UNROLL-FULL-DAG: [[MAP1:#map[0-9]+]] = (d0) -> (d0 + 2)
+// UNROLL-FULL-DAG: [[MAP2:#map[0-9]+]] = (d0) -> (d0 + 3)
+// UNROLL-FULL-DAG: [[MAP3:#map[0-9]+]] = (d0) -> (d0 + 4)
+// UNROLL-FULL-DAG: [[MAP4:#map[0-9]+]] = (d0, d1) -> (d0 + 1)
+// UNROLL-FULL-DAG: [[MAP5:#map[0-9]+]] = (d0, d1) -> (d0 + 3)
+// UNROLL-FULL-DAG: [[MAP6:#map[0-9]+]] = (d0)[s0] -> (d0 + s0 + 1)
 
-// SHORT: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
-// SHORT: [[MAP1:#map[0-9]+]] = (d0) -> (d0 + 2)
-// SHORT: [[MAP2:#map[0-9]+]] = (d0, d1) -> (d0 + 1)
-// SHORT: [[MAP3:#map[0-9]+]] = (d0, d1) -> (d0 + 3)
-// SHORT: [[MAP4:#map[0-9]+]] = (d0)[s0] -> (d0 + s0 + 1)
-// SHORT: [[MAP5:#map[0-9]+]] = (d0, d1) -> (d0 * 16 + d1)
+// SHORT-DAG: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
 
-// UNROLL-BY-4: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
-// UNROLL-BY-4: [[MAP1:#map[0-9]+]] = (d0) -> (d0 + 2)
-// UNROLL-BY-4: [[MAP2:#map[0-9]+]] = (d0) -> (d0 + 3)
-// UNROLL-BY-4: [[MAP3:#map[0-9]+]] = (d0, d1) -> (d0 + 1)
-// UNROLL-BY-4: [[MAP4:#map[0-9]+]] = (d0, d1) -> (d0 + 3)
-// UNROLL-BY-4: [[MAP5:#map[0-9]+]] = (d0)[s0] -> (d0 + s0 + 1)
-// UNROLL-BY-4: [[MAP6:#map[0-9]+]] = (d0, d1) -> (d0 * 16 + d1)
-// UNROLL-BY-4: [[MAP7:#map[0-9]+]] = (d0) -> (d0 + 5)
-// UNROLL-BY-4: [[MAP8:#map[0-9]+]] = (d0) -> (d0 + 10)
-// UNROLL-BY-4: [[MAP9:#map[0-9]+]] = (d0) -> (d0 + 15)
-// UNROLL-BY-4: [[MAP10:#map[0-9]+]] = (d0) -> (0)
-// UNROLL-BY-4: [[MAP11:#map[0-9]+]] = (d0) -> (d0)
-// UNROLL-BY-4: [[MAP12:#map[0-9]+]] = ()[s0] -> (0)
+// UNROLL-BY-4-DAG: [[MAP0:#map[0-9]+]] = (d0) -> (d0 + 1)
+// UNROLL-BY-4-DAG: [[MAP1:#map[0-9]+]] = (d0) -> (d0 + 2)
+// UNROLL-BY-4-DAG: [[MAP2:#map[0-9]+]] = (d0) -> (d0 + 3)
+// UNROLL-BY-4-DAG: [[MAP3:#map[0-9]+]] = (d0, d1) -> (d0 + 1)
+// UNROLL-BY-4-DAG: [[MAP4:#map[0-9]+]] = (d0, d1) -> (d0 + 3)
+// UNROLL-BY-4-DAG: [[MAP5:#map[0-9]+]] = (d0)[s0] -> (d0 + s0 + 1)
+// UNROLL-BY-4-DAG: [[MAP6:#map[0-9]+]] = (d0, d1) -> (d0 * 16 + d1)
+// UNROLL-BY-4-DAG: [[MAP11:#map[0-9]+]] = (d0) -> (d0)
+// UNROLL-BY-4-DAG: [[MAP_TRIP_COUNT_MULTIPLE_FOUR:#map[0-9]+]] = ()[s0, s1, s2] -> (s0 + ((-s0 + s1) floordiv 4) * 4, s0 + ((-s0 + s2) floordiv 4) * 4, s0 + ((-s0 + 1024) floordiv 4) * 4)
 
-// CHECK-LABEL: func @loop_nest_simplest() {
+// UNROLL-FULL-LABEL: func @loop_nest_simplest() {
 func @loop_nest_simplest() {
-  // CHECK: for %i0 = 0 to 100 step 2 {
+  // UNROLL-FULL: for %i0 = 0 to 100 step 2 {
   for %i = 0 to 100 step 2 {
-    // CHECK: %c1_i32 = constant 1 : i32
-    // CHECK-NEXT: %c1_i32_0 = constant 1 : i32
-    // CHECK-NEXT: %c1_i32_1 = constant 1 : i32
-    // CHECK-NEXT: %c1_i32_2 = constant 1 : i32
+    // UNROLL-FULL: %c1_i32 = constant 1 : i32
+    // UNROLL-FULL-NEXT: %c1_i32_0 = constant 1 : i32
+    // UNROLL-FULL-NEXT: %c1_i32_1 = constant 1 : i32
+    // UNROLL-FULL-NEXT: %c1_i32_2 = constant 1 : i32
     for %j = 0 to 4 {
       %x = constant 1 : i32
     }
-  }       // CHECK:  }
-  return  // CHECK:  return
-}         // CHECK }
+  }       // UNROLL-FULL:  }
+  return  // UNROLL-FULL:  return
+}         // UNROLL-FULL: }
 
-// CHECK-LABEL: func @loop_nest_simple_iv_use() {
+// UNROLL-FULL-LABEL: func @loop_nest_simple_iv_use() {
 func @loop_nest_simple_iv_use() {
-  // CHECK: %c0 = constant 0 : index
-  // CHECK-NEXT: for %i0 = 0 to 100 step 2 {
+  // UNROLL-FULL: %c0 = constant 0 : index
+  // UNROLL-FULL-NEXT: for %i0 = 0 to 100 step 2 {
   for %i = 0 to 100 step 2 {
-    // CHECK: %0 = "addi32"(%c0, %c0) : (index, index) -> i32
-    // CHECK: %1 = affine.apply [[MAP0]](%c0)
-    // CHECK-NEXT:  %2 = "addi32"(%1, %1) : (index, index) -> i32
-    // CHECK: %3 = affine.apply [[MAP1]](%c0)
-    // CHECK-NEXT:  %4 = "addi32"(%3, %3) : (index, index) -> i32
-    // CHECK: %5 = affine.apply [[MAP2]](%c0)
-    // CHECK-NEXT:  %6 = "addi32"(%5, %5) : (index, index) -> i32
+    // UNROLL-FULL: %0 = "addi32"(%c0, %c0) : (index, index) -> i32
+    // UNROLL-FULL: %1 = affine.apply [[MAP0]](%c0)
+    // UNROLL-FULL-NEXT:  %2 = "addi32"(%1, %1) : (index, index) -> i32
+    // UNROLL-FULL: %3 = affine.apply [[MAP1]](%c0)
+    // UNROLL-FULL-NEXT:  %4 = "addi32"(%3, %3) : (index, index) -> i32
+    // UNROLL-FULL: %5 = affine.apply [[MAP2]](%c0)
+    // UNROLL-FULL-NEXT:  %6 = "addi32"(%5, %5) : (index, index) -> i32
     for %j = 0 to 4 {
       %x = "addi32"(%j, %j) : (index, index) -> i32
     }
-  }       // CHECK:  }
-  return  // CHECK:  return
-}         // CHECK }
+  }       // UNROLL-FULL:  }
+  return  // UNROLL-FULL:  return
+}         // UNROLL-FULL: }
 
 // Operations in the loop body have results that are used therein.
-// CHECK-LABEL: func @loop_nest_body_def_use() {
+// UNROLL-FULL-LABEL: func @loop_nest_body_def_use() {
 func @loop_nest_body_def_use() {
-  // CHECK: %c0 = constant 0 : index
-  // CHECK-NEXT: for %i0 = 0 to 100 step 2 {
+  // UNROLL-FULL: %c0 = constant 0 : index
+  // UNROLL-FULL-NEXT: for %i0 = 0 to 100 step 2 {
   for %i = 0 to 100 step 2 {
-    // CHECK: %c0_0 = constant 0 : index
+    // UNROLL-FULL: %c0_0 = constant 0 : index
     %c0 = constant 0 : index
-    // CHECK:      %0 = affine.apply [[MAP0]](%c0)
-    // CHECK-NEXT: %1 = "addi32"(%0, %c0_0) : (index, index) -> index
-    // CHECK-NEXT: %2 = affine.apply [[MAP0]](%c0)
-    // CHECK-NEXT: %3 = affine.apply [[MAP0]](%2)
-    // CHECK-NEXT: %4 = "addi32"(%3, %c0_0) : (index, index) -> index
-    // CHECK-NEXT: %5 = affine.apply [[MAP1]](%c0)
-    // CHECK-NEXT: %6 = affine.apply [[MAP0]](%5)
-    // CHECK-NEXT: %7 = "addi32"(%6, %c0_0) : (index, index) -> index
-    // CHECK-NEXT: %8 = affine.apply [[MAP2]](%c0)
-    // CHECK-NEXT: %9 = affine.apply [[MAP0]](%8)
-    // CHECK-NEXT: %10 = "addi32"(%9, %c0_0) : (index, index) -> index
+    // UNROLL-FULL:      %0 = affine.apply [[MAP0]](%c0)
+    // UNROLL-FULL-NEXT: %1 = "addi32"(%0, %c0_0) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %2 = affine.apply [[MAP0]](%c0)
+    // UNROLL-FULL-NEXT: %3 = affine.apply [[MAP0]](%2)
+    // UNROLL-FULL-NEXT: %4 = "addi32"(%3, %c0_0) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %5 = affine.apply [[MAP1]](%c0)
+    // UNROLL-FULL-NEXT: %6 = affine.apply [[MAP0]](%5)
+    // UNROLL-FULL-NEXT: %7 = "addi32"(%6, %c0_0) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %8 = affine.apply [[MAP2]](%c0)
+    // UNROLL-FULL-NEXT: %9 = affine.apply [[MAP0]](%8)
+    // UNROLL-FULL-NEXT: %10 = "addi32"(%9, %c0_0) : (index, index) -> index
     for %j = 0 to 4 {
       %x = "affine.apply" (%j) { map: (d0) -> (d0 + 1) } :
         (index) -> (index)
       %y = "addi32"(%x, %c0) : (index, index) -> index
     }
-  }       // CHECK:  }
-  return  // CHECK:  return
-}         // CHECK }
+  }       // UNROLL-FULL:  }
+  return  // UNROLL-FULL:  return
+}         // UNROLL-FULL: }
 
-// CHECK-LABEL: func @loop_nest_strided() {
+// UNROLL-FULL-LABEL: func @loop_nest_strided() {
 func @loop_nest_strided() {
-  // CHECK: %c2 = constant 2 : index
-  // CHECK-NEXT: %c2_0 = constant 2 : index
-  // CHECK-NEXT: for %i0 = 0 to 100 {
+  // UNROLL-FULL: %c2 = constant 2 : index
+  // UNROLL-FULL-NEXT: %c2_0 = constant 2 : index
+  // UNROLL-FULL-NEXT: for %i0 = 0 to 100 {
   for %i = 0 to 100 {
-    // CHECK:      %0 = affine.apply [[MAP0]](%c2_0)
-    // CHECK-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
-    // CHECK-NEXT: %2 = affine.apply [[MAP1]](%c2_0)
-    // CHECK-NEXT: %3 = affine.apply [[MAP0]](%2)
-    // CHECK-NEXT: %4 = "addi32"(%3, %3) : (index, index) -> index
+    // UNROLL-FULL:      %0 = affine.apply [[MAP0]](%c2_0)
+    // UNROLL-FULL-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %2 = affine.apply [[MAP1]](%c2_0)
+    // UNROLL-FULL-NEXT: %3 = affine.apply [[MAP0]](%2)
+    // UNROLL-FULL-NEXT: %4 = "addi32"(%3, %3) : (index, index) -> index
     for %j = 2 to 6 step 2 {
       %x = "affine.apply" (%j) { map: (d0) -> (d0 + 1) } :
         (index) -> (index)
       %y = "addi32"(%x, %x) : (index, index) -> index
     }
-    // CHECK:      %5 = affine.apply [[MAP0]](%c2)
-    // CHECK-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> index
-    // CHECK-NEXT: %7 = affine.apply [[MAP1]](%c2)
-    // CHECK-NEXT: %8 = affine.apply [[MAP0]](%7)
-    // CHECK-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> index
-    // CHECK-NEXT: %10 = affine.apply [[MAP3]](%c2)
-    // CHECK-NEXT: %11 = affine.apply [[MAP0]](%10)
-    // CHECK-NEXT: %12 = "addi32"(%11, %11) : (index, index) -> index
+    // UNROLL-FULL:      %5 = affine.apply [[MAP0]](%c2)
+    // UNROLL-FULL-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %7 = affine.apply [[MAP1]](%c2)
+    // UNROLL-FULL-NEXT: %8 = affine.apply [[MAP0]](%7)
+    // UNROLL-FULL-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %10 = affine.apply [[MAP3]](%c2)
+    // UNROLL-FULL-NEXT: %11 = affine.apply [[MAP0]](%10)
+    // UNROLL-FULL-NEXT: %12 = "addi32"(%11, %11) : (index, index) -> index
     for %k = 2 to 7 step 2 {
       %z = "affine.apply" (%k) { map: (d0) -> (d0 + 1) } :
         (index) -> (index)
       %w = "addi32"(%z, %z) : (index, index) -> index
     }
-  }       // CHECK:  }
-  return  // CHECK:  return
-}         // CHECK }
+  }       // UNROLL-FULL:  }
+  return  // UNROLL-FULL:  return
+}         // UNROLL-FULL: }
 
-// CHECK-LABEL: func @loop_nest_multiple_results() {
+// UNROLL-FULL-LABEL: func @loop_nest_multiple_results() {
 func @loop_nest_multiple_results() {
-  // CHECK: %c0 = constant 0 : index
-  // CHECK-NEXT: for %i0 = 0 to 100 {
+  // UNROLL-FULL: %c0 = constant 0 : index
+  // UNROLL-FULL-NEXT: for %i0 = 0 to 100 {
   for %i = 0 to 100 {
-    // CHECK: %0 = affine.apply [[MAP4]](%i0, %c0)
-    // CHECK-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
-    // CHECK-NEXT: %2 = affine.apply #map{{.*}}(%i0, %c0)
-    // CHECK-NEXT: %3 = "fma"(%2, %0, %0) : (index, index, index) -> (index, index)
-    // CHECK-NEXT: %4 = affine.apply #map{{.*}}(%c0)
-    // CHECK-NEXT: %5 = affine.apply #map{{.*}}(%i0, %4)
-    // CHECK-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> index
-    // CHECK-NEXT: %7 = affine.apply #map{{.*}}(%i0, %4)
-    // CHECK-NEXT: %8 = "fma"(%7, %5, %5) : (index, index, index) -> (index, index)
+    // UNROLL-FULL: %0 = affine.apply [[MAP4]](%i0, %c0)
+    // UNROLL-FULL-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %2 = affine.apply #map{{.*}}(%i0, %c0)
+    // UNROLL-FULL-NEXT: %3 = "fma"(%2, %0, %0) : (index, index, index) -> (index, index)
+    // UNROLL-FULL-NEXT: %4 = affine.apply #map{{.*}}(%c0)
+    // UNROLL-FULL-NEXT: %5 = affine.apply #map{{.*}}(%i0, %4)
+    // UNROLL-FULL-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %7 = affine.apply #map{{.*}}(%i0, %4)
+    // UNROLL-FULL-NEXT: %8 = "fma"(%7, %5, %5) : (index, index, index) -> (index, index)
     for %j = 0 to 2 step 1 {
       %x = affine.apply (d0, d1) -> (d0 + 1) (%i, %j)
       %y = "addi32"(%x, %x) : (index, index) -> index
       %z = affine.apply (d0, d1) -> (d0 + 3) (%i, %j)
       %w = "fma"(%z, %x, %x) : (index, index, index) -> (index, index)
     }
-  }       // CHECK:  }
-  return  // CHECK:  return
-}         // CHECK }
+  }       // UNROLL-FULL:  }
+  return  // UNROLL-FULL:  return
+}         // UNROLL-FULL: }
 
 
 // Imperfect loop nest. Unrolling innermost here yields a perfect nest.
-// CHECK-LABEL: func @loop_nest_seq_imperfect(%arg0: memref<128x128xf32>) {
+// UNROLL-FULL-LABEL: func @loop_nest_seq_imperfect(%arg0: memref<128x128xf32>) {
 func @loop_nest_seq_imperfect(%a : memref<128x128xf32>) {
-  // CHECK: %c0 = constant 0 : index
-  // CHECK-NEXT: %c128 = constant 128 : index
+  // UNROLL-FULL: %c0 = constant 0 : index
+  // UNROLL-FULL-NEXT: %c128 = constant 128 : index
   %c128 = constant 128 : index
-  // CHECK: for %i0 = 0 to 100 {
+  // UNROLL-FULL: for %i0 = 0 to 100 {
   for %i = 0 to 100 {
-    // CHECK: %0 = "vld"(%i0) : (index) -> i32
+    // UNROLL-FULL: %0 = "vld"(%i0) : (index) -> i32
     %ld = "vld"(%i) : (index) -> i32
-    // CHECK: %1 = affine.apply [[MAP0]](%c0)
-    // CHECK-NEXT: %2 = "vmulf"(%c0, %1) : (index, index) -> index
-    // CHECK-NEXT: %3 = "vaddf"(%2, %2) : (index, index) -> index
-    // CHECK-NEXT: %4 = affine.apply [[MAP0]](%c0)
-    // CHECK-NEXT: %5 = affine.apply [[MAP0]](%4)
-    // CHECK-NEXT: %6 = "vmulf"(%4, %5) : (index, index) -> index
-    // CHECK-NEXT: %7 = "vaddf"(%6, %6) : (index, index) -> index
-    // CHECK-NEXT: %8 = affine.apply [[MAP1]](%c0)
-    // CHECK-NEXT: %9 = affine.apply [[MAP0]](%8)
-    // CHECK-NEXT: %10 = "vmulf"(%8, %9) : (index, index) -> index
-    // CHECK-NEXT: %11 = "vaddf"(%10, %10) : (index, index) -> index
-    // CHECK-NEXT: %12 = affine.apply [[MAP2]](%c0)
-    // CHECK-NEXT: %13 = affine.apply [[MAP0]](%12)
-    // CHECK-NEXT: %14 = "vmulf"(%12, %13) : (index, index) -> index
-    // CHECK-NEXT: %15 = "vaddf"(%14, %14) : (index, index) -> index
+    // UNROLL-FULL: %1 = affine.apply [[MAP0]](%c0)
+    // UNROLL-FULL-NEXT: %2 = "vmulf"(%c0, %1) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %3 = "vaddf"(%2, %2) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %4 = affine.apply [[MAP0]](%c0)
+    // UNROLL-FULL-NEXT: %5 = affine.apply [[MAP0]](%4)
+    // UNROLL-FULL-NEXT: %6 = "vmulf"(%4, %5) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %7 = "vaddf"(%6, %6) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %8 = affine.apply [[MAP1]](%c0)
+    // UNROLL-FULL-NEXT: %9 = affine.apply [[MAP0]](%8)
+    // UNROLL-FULL-NEXT: %10 = "vmulf"(%8, %9) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %11 = "vaddf"(%10, %10) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %12 = affine.apply [[MAP2]](%c0)
+    // UNROLL-FULL-NEXT: %13 = affine.apply [[MAP0]](%12)
+    // UNROLL-FULL-NEXT: %14 = "vmulf"(%12, %13) : (index, index) -> index
+    // UNROLL-FULL-NEXT: %15 = "vaddf"(%14, %14) : (index, index) -> index
     for %j = 0 to 4 {
       %x = "affine.apply" (%j) { map: (d0) -> (d0 + 1) } :
         (index) -> (index)
        %y = "vmulf"(%j, %x) : (index, index) -> index
        %z = "vaddf"(%y, %y) : (index, index) -> index
     }
-    // CHECK: %16 = "scale"(%c128, %i0) : (index, index) -> index
+    // UNROLL-FULL: %16 = "scale"(%c128, %i0) : (index, index) -> index
     %addr = "scale"(%c128, %i) : (index, index) -> index
-    // CHECK: "vst"(%16, %i0) : (index, index) -> ()
+    // UNROLL-FULL: "vst"(%16, %i0) : (index, index) -> ()
     "vst"(%addr, %i) : (index, index) -> ()
-  }       // CHECK }
-  return  // CHECK:  return
+  }       // UNROLL-FULL: }
+  return  // UNROLL-FULL:  return
 }
 
-// CHECK-LABEL: func @loop_nest_seq_multiple() {
+// UNROLL-FULL-LABEL: func @loop_nest_seq_multiple() {
 func @loop_nest_seq_multiple() {
-  // CHECK: c0 = constant 0 : index
-  // CHECK-NEXT: %c0_0 = constant 0 : index
-  // CHECK-NEXT: %0 = affine.apply [[MAP0]](%c0_0)
-  // CHECK-NEXT: "mul"(%0, %0) : (index, index) -> ()
-  // CHECK-NEXT: %1 = affine.apply [[MAP0]](%c0_0)
-  // CHECK-NEXT: %2 = affine.apply [[MAP0]](%1)
-  // CHECK-NEXT: "mul"(%2, %2) : (index, index) -> ()
-  // CHECK-NEXT: %3 = affine.apply [[MAP1]](%c0_0)
-  // CHECK-NEXT: %4 = affine.apply [[MAP0]](%3)
-  // CHECK-NEXT: "mul"(%4, %4) : (index, index) -> ()
-  // CHECK-NEXT: %5 = affine.apply [[MAP2]](%c0_0)
-  // CHECK-NEXT: %6 = affine.apply [[MAP0]](%5)
-  // CHECK-NEXT: "mul"(%6, %6) : (index, index) -> ()
+  // UNROLL-FULL: c0 = constant 0 : index
+  // UNROLL-FULL-NEXT: %c0_0 = constant 0 : index
+  // UNROLL-FULL-NEXT: %0 = affine.apply [[MAP0]](%c0_0)
+  // UNROLL-FULL-NEXT: "mul"(%0, %0) : (index, index) -> ()
+  // UNROLL-FULL-NEXT: %1 = affine.apply [[MAP0]](%c0_0)
+  // UNROLL-FULL-NEXT: %2 = affine.apply [[MAP0]](%1)
+  // UNROLL-FULL-NEXT: "mul"(%2, %2) : (index, index) -> ()
+  // UNROLL-FULL-NEXT: %3 = affine.apply [[MAP1]](%c0_0)
+  // UNROLL-FULL-NEXT: %4 = affine.apply [[MAP0]](%3)
+  // UNROLL-FULL-NEXT: "mul"(%4, %4) : (index, index) -> ()
+  // UNROLL-FULL-NEXT: %5 = affine.apply [[MAP2]](%c0_0)
+  // UNROLL-FULL-NEXT: %6 = affine.apply [[MAP0]](%5)
+  // UNROLL-FULL-NEXT: "mul"(%6, %6) : (index, index) -> ()
   for %j = 0 to 4 {
     %x = "affine.apply" (%j) { map: (d0) -> (d0 + 1) } :
       (index) -> (index)
     "mul"(%x, %x) : (index, index) -> ()
   }
 
-  // CHECK: %c99 = constant 99 : index
+  // UNROLL-FULL: %c99 = constant 99 : index
   %k = constant 99 : index
-  // CHECK: for %i0 = 0 to 100 step 2 {
+  // UNROLL-FULL: for %i0 = 0 to 100 step 2 {
   for %m = 0 to 100 step 2 {
-    // CHECK: %7 = affine.apply [[MAP0]](%c0)
-    // CHECK-NEXT: %8 = affine.apply [[MAP6]](%c0)[%c99]
-    // CHECK-NEXT: %9 = affine.apply [[MAP0]](%c0)
-    // CHECK-NEXT: %10 = affine.apply [[MAP0]](%9)
-    // CHECK-NEXT: %11 = affine.apply [[MAP6]](%9)[%c99]
-    // CHECK-NEXT: %12 = affine.apply [[MAP1]](%c0)
-    // CHECK-NEXT: %13 = affine.apply [[MAP0]](%12)
-    // CHECK-NEXT: %14 = affine.apply [[MAP6]](%12)[%c99]
-    // CHECK-NEXT: %15 = affine.apply [[MAP2]](%c0)
-    // CHECK-NEXT: %16 = affine.apply [[MAP0]](%15)
-    // CHECK-NEXT: %17 = affine.apply [[MAP6]](%15)[%c99]
+    // UNROLL-FULL: %7 = affine.apply [[MAP0]](%c0)
+    // UNROLL-FULL-NEXT: %8 = affine.apply [[MAP6]](%c0)[%c99]
+    // UNROLL-FULL-NEXT: %9 = affine.apply [[MAP0]](%c0)
+    // UNROLL-FULL-NEXT: %10 = affine.apply [[MAP0]](%9)
+    // UNROLL-FULL-NEXT: %11 = affine.apply [[MAP6]](%9)[%c99]
+    // UNROLL-FULL-NEXT: %12 = affine.apply [[MAP1]](%c0)
+    // UNROLL-FULL-NEXT: %13 = affine.apply [[MAP0]](%12)
+    // UNROLL-FULL-NEXT: %14 = affine.apply [[MAP6]](%12)[%c99]
+    // UNROLL-FULL-NEXT: %15 = affine.apply [[MAP2]](%c0)
+    // UNROLL-FULL-NEXT: %16 = affine.apply [[MAP0]](%15)
+    // UNROLL-FULL-NEXT: %17 = affine.apply [[MAP6]](%15)[%c99]
     for %n = 0 to 4 {
       %y = "affine.apply" (%n) { map: (d0) -> (d0 + 1) } :
         (index) -> (index)
       %z = "affine.apply" (%n, %k) { map: (d0) [s0] -> (d0 + s0 + 1) } :
         (index, index) -> (index)
-    }     // CHECK }
-  }       // CHECK }
-  return  // CHECK:  return
-}         // CHECK }
+    }     // UNROLL-FULL }
+  }       // UNROLL-FULL }
+  return  // UNROLL-FULL:  return
+}         // UNROLL-FULL: }
+
+// UNROLL-FULL-LABEL: func @loop_nest_unroll_full() {
+func @loop_nest_unroll_full() {
+  // UNROLL-FULL-NEXT: %0 = "foo"() : () -> i32
+  // UNROLL-FULL-NEXT: %1 = "bar"() : () -> i32
+  // UNROLL-FULL-NEXT:  return
+  for %i = 0 to 1 {
+    %x = "foo"() : () -> i32
+    %y = "bar"() : () -> i32
+  }
+  return
+} // UNROLL-FULL: }
 
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func @loop_nest_outer_unroll() {
@@ -269,8 +260,8 @@
   return  // SHORT:  return
 }         // SHORT }
 
-// We aren't doing any file check here. We just need this test case to
-// successfully run. Both %i0 and i1 will get unrolled here with the min trip
+// We are doing a minimal FileCheck here. We just need this test case to
+// successfully run. Both %x and %y will get unrolled here with the min trip
 // count threshold set to 2.
 // SHORT-LABEL: func @loop_nest_seq_long() -> i32 {
 func @loop_nest_seq_long() -> i32 {
@@ -284,7 +275,9 @@
 
   %zero_idx = constant 0 : index
 
+  // SHORT: for %i0 = 0 to 512
   for %n0 = 0 to 512 {
+    // SHORT: for %i1 = 0 to 8
     for %n1 = 0 to 8 {
       store %one,  %A[%n0, %n1] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
       store %two,  %B[%n0, %n1] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
@@ -292,22 +285,25 @@
     }
   }
 
-  for %i0 = 0 to 2 {
-    for %i1 = 0 to 2 {
+  for %x = 0 to 2 {
+    for %y = 0 to 2 {
+      // SHORT: for %i2
       for %i2 = 0 to 8 {
-        %b2 = "affine.apply" (%i1, %i2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
-        %x = load %B[%i0, %b2] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
-        "op1"(%x) : (i32) -> ()
+        // SHORT-NOT: for %i3
+        // SHORT: %{{[0-9]+}} = affine.apply
+        %b2 = "affine.apply" (%y, %i2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
+        %z = load %B[%x, %b2] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
+        "op1"(%z) : (i32) -> ()
       }
       for %j1 = 0 to 8 {
         for %j2 = 0 to 8 {
-          %a2 = "affine.apply" (%i1, %j2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
+          %a2 = "affine.apply" (%y, %j2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
           %v203 = load %A[%j1, %a2] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
           "op2"(%v203) : (i32) -> ()
         }
         for %k2 = 0 to 8 {
           %s0 = "op3"() : () -> i32
-          %c2 = "affine.apply" (%i0, %k2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
+          %c2 = "affine.apply" (%x, %k2) {map: (d0, d1) -> (16*d0 + d1)} : (index, index) -> index
           %s1 =  load %C[%j1, %c2] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
           %s2 = "addi32"(%s0, %s1) : (i32, i32) -> i32
           store %s2, %C[%j1, %c2] : memref<512 x 512 x i32, (d0, d1) -> (d0, d1), 2>
@@ -353,22 +349,22 @@
 func @unroll_unit_stride_cleanup() {
   // UNROLL-BY-4: for %i0 = 0 to 100 {
   for %i = 0 to 100 {
-    // UNROLL-BY-4: for [[L1:%i[0-9]+]] = 0 to 7 step 4 {
-    // UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32
-    // UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32
-    // UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]+}}([[L1]])
-    // UNROLL-BY-4-NEXT: %3 = "addi32"(%2, %2) : (index, index) -> i32
-    // UNROLL-BY-4-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32
-    // UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]+}}([[L1]])
-    // UNROLL-BY-4-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
-    // UNROLL-BY-4-NEXT: %7 = "addi32"(%6, %6) : (i32, i32) -> i32
-    // UNROLL-BY-4-NEXT: %8 = affine.apply #map{{[0-9]+}}([[L1]])
-    // UNROLL-BY-4-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> i32
-    // UNROLL-BY-4-NEXT: %10 = "addi32"(%9, %9) : (i32, i32) -> i32
+    // UNROLL-BY-4: for [[L1:%i[0-9]+]] = 0 to 8 step 4 {
+    // UNROLL-BY-4-NEXT:   %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32
+    // UNROLL-BY-4-NEXT:   %1 = "addi32"(%0, %0) : (i32, i32) -> i32
+    // UNROLL-BY-4-NEXT:   %2 = affine.apply #map{{[0-9]+}}([[L1]])
+    // UNROLL-BY-4-NEXT:   %3 = "addi32"(%2, %2) : (index, index) -> i32
+    // UNROLL-BY-4-NEXT:   %4 = "addi32"(%3, %3) : (i32, i32) -> i32
+    // UNROLL-BY-4-NEXT:   %5 = affine.apply #map{{[0-9]+}}([[L1]])
+    // UNROLL-BY-4-NEXT:   %6 = "addi32"(%5, %5) : (index, index) -> i32
+    // UNROLL-BY-4-NEXT:   %7 = "addi32"(%6, %6) : (i32, i32) -> i32
+    // UNROLL-BY-4-NEXT:   %8 = affine.apply #map{{[0-9]+}}([[L1]])
+    // UNROLL-BY-4-NEXT:   %9 = "addi32"(%8, %8) : (index, index) -> i32
+    // UNROLL-BY-4-NEXT:   %10 = "addi32"(%9, %9) : (i32, i32) -> i32
     // UNROLL-BY-4-NEXT: }
     // UNROLL-BY-4-NEXT: for [[L2:%i[0-9]+]] = 8 to 10 {
-    // UNROLL-BY-4-NEXT: %11 = "addi32"([[L2]], [[L2]]) : (index, index) -> i32
-    // UNROLL-BY-4-NEXT: %12 = "addi32"(%11, %11) : (i32, i32) -> i32
+    // UNROLL-BY-4-NEXT:   %11 = "addi32"([[L2]], [[L2]]) : (index, index) -> i32
+    // UNROLL-BY-4-NEXT:   %12 = "addi32"(%11, %11) : (i32, i32) -> i32
     // UNROLL-BY-4-NEXT: }
     for %j = 0 to 10 {
       %x = "addi32"(%j, %j) : (index, index) -> i32
@@ -382,7 +378,7 @@
 func @unroll_non_unit_stride_cleanup() {
   // UNROLL-BY-4: for %i0 = 0 to 100 {
   for %i = 0 to 100 {
-    // UNROLL-BY-4: for [[L1:%i[0-9]+]] = 2 to 37 step 20 {
+    // UNROLL-BY-4: for [[L1:%i[0-9]+]] = 2 to 42 step 20 {
     // UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32
     // UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32
     // UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]+}}([[L1]])
@@ -408,6 +404,7 @@
 }
 
 // Both the unrolled loop and the cleanup loop are single iteration loops.
+// UNROLL-BY-4-LABEL: func @loop_nest_single_iteration_after_unroll
 func @loop_nest_single_iteration_after_unroll(%N: index) {
   // UNROLL-BY-4: %c0 = constant 0 : index
   // UNROLL-BY-4: %c4 = constant 4 : index
@@ -435,7 +432,7 @@
 // UNROLL-BY-4-LABEL: func @loop_nest_operand1() {
 func @loop_nest_operand1() {
 // UNROLL-BY-4:      for %i0 = 0 to 100 step 2 {
-// UNROLL-BY-4-NEXT:   for %i1 = [[MAP10]](%i0) to #map{{[0-9]+}}(%i0) step 4
+// UNROLL-BY-4-NEXT:   for %i1 = 0 to #map{{[0-9]+}}(%i0) step 4
 // UNROLL-BY-4-NEXT:      %0 = "foo"() : () -> i32
 // UNROLL-BY-4-NEXT:      %1 = "foo"() : () -> i32
 // UNROLL-BY-4-NEXT:      %2 = "foo"() : () -> i32
@@ -444,7 +441,7 @@
 // UNROLL-BY-4-NEXT: }
 // UNROLL-BY-4-NEXT: return
   for %i = 0 to 100 step 2 {
-    for %j = (d0) -> (0) (%i) to (d0) -> (d0 - d0 mod 4) (%i) {
+    for %j = 0 to (d0) -> (d0 - d0 mod 4) (%i) {
       %x = "foo"() : () -> i32
     }
   }
@@ -491,11 +488,11 @@
   return
 }
 
-// UNROLL-BY-4-LABEL: func @loop_nest_operand4(%arg0: index) {
-func @loop_nest_operand4(%N : index) {
+// UNROLL-BY-4-LABEL: func @loop_nest_symbolic_bound(%arg0: index) {
+func @loop_nest_symbolic_bound(%N : index) {
   // UNROLL-BY-4: for %i0 = 0 to 100 {
   for %i = 0 to 100 {
-    // UNROLL-BY-4: for %i1 = [[MAP12]]()[%arg0] to #map{{[0-9]+}}()[%arg0] step 4 {
+    // UNROLL-BY-4: for %i1 = 0 to #map{{[0-9]+}}()[%arg0] step 4 {
     // UNROLL-BY-4: %0 = "foo"() : () -> i32
     // UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
     // UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
@@ -505,25 +502,56 @@
     // UNROLL-BY-4-NEXT: for %i2 = #map{{[0-9]+}}()[%arg0] to %arg0 {
     // UNROLL-BY-4-NEXT: %4 = "foo"() : () -> i32
     // UNROLL-BY-4_NEXT: }
-    // Specify the lower bound so that both lb and ub operands match.
-    for %j = ()[s0] -> (0)()[%N] to %N {
+    for %j = 0 to %N {
       %x = "foo"() : () -> i32
     }
   }
   return
 }
 
-// CHECK-LABEL: func @loop_nest_unroll_full() {
-func @loop_nest_unroll_full() {
-  // CHECK-NEXT: %0 = "foo"() : () -> i32
-  // CHECK-NEXT: %1 = "bar"() : () -> i32
-  // CHECK-NEXT:  return
-  for %i = 0 to 1 {
-    %x = "foo"() : () -> i32
-    %y = "bar"() : () -> i32
+// UNROLL-BY-4-LABEL: func @loop_nest_symbolic_and_min_upper_bound
+func @loop_nest_symbolic_and_min_upper_bound(%M : index, %N : index, %K : index) {
+  for %i = %M to min ()[s0, s1] -> (s0, s1, 1024)()[%N, %K] {
+    "foo"() : () -> ()
   }
   return
-} // CHECK }
+}
+// UNROLL-BY-4-NEXT:  for %i0 = %arg0 to min [[MAP_TRIP_COUNT_MULTIPLE_FOUR]]()[%arg0, %arg1, %arg2] step 4 {
+// UNROLL-BY-4-NEXT:    "foo"() : () -> ()
+// UNROLL-BY-4-NEXT:    "foo"() : () -> ()
+// UNROLL-BY-4-NEXT:    "foo"() : () -> ()
+// UNROLL-BY-4-NEXT:    "foo"() : () -> ()
+// UNROLL-BY-4-NEXT:  }
+// UNROLL-BY-4-NEXT:  for %i1 = max [[MAP_TRIP_COUNT_MULTIPLE_FOUR]]()[%arg0, %arg1, %arg2] to min #map{{[0-9]+}}()[%arg1, %arg2] {
+// UNROLL-BY-4-NEXT:    "foo"() : () -> ()
+// UNROLL-BY-4-NEXT:  }
+// UNROLL-BY-4-NEXT:  return
+
+// The trip count here is a multiple of four, but this can be inferred only
+// through composition. Check for no cleanup loop.
+// UNROLL-BY-4-LABEL: func @loop_nest_non_trivial_multiple_unroll_factor
+func @loop_nest_non_trivial_multiple_unroll_factor(%M : index, %N : index) {
+  %T = affine.apply (d0) -> (4*d0 + 1)(%M)
+  %K = affine.apply (d0) -> (d0 - 1) (%T)
+  for %i = 0 to min (d0, d1) -> (4 * d0, d1, 1024)(%N, %K) {
+    "foo"() : () -> ()
+  }
+  return
+}
+// UNROLL-BY-4: for %i0 = 0 to min
+// UNROLL-BY-4-NOT: for
+// UNROLL-BY-4: return
+
+// Commented out due to b/128340045
+// xUNROLL-BY-4-LABEL: func @loop_nest_non_trivial_multiple_unroll_factor
+// func @loop_nest_non_trivial_multiple_unroll_factor(%M : index, %N : index) {
+//  %K = affine.apply (d0) -> (4*d0) (%M)
+//  for %i = 0 to min ()[s0, s1] -> (4 * s0, s1, 1024)()[%N, %K] {
+//    "foo"() : () -> ()
+//  }
+//  return
+//}
+
 
 // UNROLL-BY-1-LABEL: func @unroll_by_one_should_promote_single_iteration_loop()
 func @unroll_by_one_should_promote_single_iteration_loop() {