Loop unrolling update.

- Handle non-operation statements (if/for statements) in loops being unrolled,
  so that unrolling of non-innermost loops works.
- Update uses in unrolled bodies to use the results of new operations that may
  be introduced in the unrolled bodies.

Unrolling now works for all kinds of loop nests - perfect nests, imperfect
nests, loops at any depth, and with any kind of operation in the body. (IfStmt
cloning is not implemented yet, so loops containing 'if' statements are not
supported and are untested.)

Added missing dump/print methods for StmtBlock.

TODO: add test case for outer loop unrolling.
PiperOrigin-RevId: 207314286
diff --git a/include/mlir/IR/Builders.h b/include/mlir/IR/Builders.h
index 795dde4..b6cc934 100644
--- a/include/mlir/IR/Builders.h
+++ b/include/mlir/IR/Builders.h
@@ -253,6 +253,9 @@
     insertPoint = block->begin();
   }
 
+  /// Get the current insertion point of the builder.
+  StmtBlock::iterator getInsertionPoint() const { return insertPoint; }
+
   OperationStmt *createOperation(Identifier name, ArrayRef<MLValue *> operands,
                                  ArrayRef<Type *> resultTypes,
                                  ArrayRef<NamedAttribute> attributes) {
@@ -262,7 +265,7 @@
     return op;
   }
 
-  OperationStmt *cloneOperation(const OperationStmt &srcOpStmt) {
+  OperationStmt *clone(const OperationStmt &srcOpStmt) {
     auto *op = srcOpStmt.clone();
     block->getStatements().insert(insertPoint, op);
     return op;
@@ -274,6 +277,29 @@
     return OpTy::build(this, args...);
   }
 
+  ForStmt *clone(const ForStmt &srcForStmt) {
+    auto *forStmt = srcForStmt.clone();
+    block->getStatements().insert(insertPoint, forStmt);
+    return forStmt;
+  }
+
+  IfStmt *clone(const IfStmt &srcIfStmt) {
+    auto *ifStmt = srcIfStmt.clone();
+    block->getStatements().insert(insertPoint, ifStmt);
+    return ifStmt;
+  }
+
+  Statement *clone(const Statement &stmt) {
+    switch (stmt.getKind()) {
+    case Statement::Kind::Operation:
+      return clone(cast<const OperationStmt>(stmt));
+    case Statement::Kind::If:
+      return clone(cast<const IfStmt>(stmt));
+    case Statement::Kind::For:
+      return clone(cast<const ForStmt>(stmt));
+    }
+  }
+
   // Creates for statement. When step is not specified, it is set to 1.
   ForStmt *createFor(AffineConstantExpr *lowerBound,
                      AffineConstantExpr *upperBound,
@@ -285,15 +311,6 @@
     return stmt;
   }
 
-  // TODO: subsume with a generate create<ConstantInt>() method.
-  OperationStmt *createConstInt32Op(int value) {
-    std::pair<Identifier, Attribute *> namedAttr(
-        Identifier::get("value", context), getIntegerAttr(value));
-    auto *mlconst = createOperation(Identifier::get("constant", context), {},
-                                    {getIntegerType(32)}, {namedAttr});
-    return mlconst;
-  }
-
 private:
   StmtBlock *block = nullptr;
   StmtBlock::iterator insertPoint;
diff --git a/include/mlir/IR/StandardOps.h b/include/mlir/IR/StandardOps.h
index 93f9818..8cfe432 100644
--- a/include/mlir/IR/StandardOps.h
+++ b/include/mlir/IR/StandardOps.h
@@ -164,6 +164,17 @@
 ///
 class ConstantIntOp : public ConstantOp {
 public:
+  template <class Builder>
+  static OpPointer<ConstantIntOp> build(Builder *builder, int64_t value,
+                                        unsigned width) {
+    std::pair<Identifier, Attribute *> namedAttr(
+        builder->getIdentifier("value"), builder->getIntegerAttr(value));
+    auto *type = builder->getIntegerType(width);
+
+    return OpPointer<ConstantIntOp>(ConstantIntOp(builder->createOperation(
+        builder->getIdentifier("constant"), {}, type, {namedAttr})));
+  }
+
   int64_t getValue() const {
     return getAttrOfType<IntegerAttr>("value")->getValue();
   }
diff --git a/include/mlir/IR/Statement.h b/include/mlir/IR/Statement.h
index 83e3ff8..2326d50 100644
--- a/include/mlir/IR/Statement.h
+++ b/include/mlir/IR/Statement.h
@@ -48,6 +48,9 @@
   /// Remove this statement from its block and delete it.
   void eraseFromBlock();
 
+  /// Deep clone this statement.
+  Statement *clone() const;
+
   /// Returns the statement block that contains this statement.
   StmtBlock *getBlock() const { return block; }
 
diff --git a/include/mlir/IR/Statements.h b/include/mlir/IR/Statements.h
index 1894437..3e9f06c 100644
--- a/include/mlir/IR/Statements.h
+++ b/include/mlir/IR/Statements.h
@@ -209,6 +209,9 @@
     clear();
   }
 
+  /// Deep clone this for stmt.
+  ForStmt *clone() const;
+
   AffineConstantExpr *getLowerBound() const { return lowerBound; }
   AffineConstantExpr *getUpperBound() const { return upperBound; }
   AffineConstantExpr *getStep() const { return step; }
@@ -270,6 +273,9 @@
 
   ~IfStmt();
 
+  /// Deep clone this IfStmt.
+  IfStmt *clone() const;
+
   IfClause *getThenClause() const { return thenClause; }
   IfClause *getElseClause() const { return elseClause; }
   bool hasElseClause() const { return elseClause != nullptr; }
diff --git a/include/mlir/IR/StmtBlock.h b/include/mlir/IR/StmtBlock.h
index 609aabb..a8a1f20 100644
--- a/include/mlir/IR/StmtBlock.h
+++ b/include/mlir/IR/StmtBlock.h
@@ -22,8 +22,9 @@
 #ifndef MLIR_IR_STMTBLOCK_H
 #define MLIR_IR_STMTBLOCK_H
 
-#include "mlir/Support/LLVM.h"
 #include "mlir/IR/Statement.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace mlir {
 class MLFunction;
@@ -101,6 +102,9 @@
     return &StmtBlock::statements;
   }
 
+  void print(raw_ostream &os) const;
+  void dump() const;
+
 protected:
   StmtBlock(StmtBlockKind kind) : kind(kind) {}
 
diff --git a/lib/IR/AsmPrinter.cpp b/lib/IR/AsmPrinter.cpp
index d71fb92..6875120 100644
--- a/lib/IR/AsmPrinter.cpp
+++ b/lib/IR/AsmPrinter.cpp
@@ -1223,6 +1223,15 @@
 
 void Statement::dump() const { print(llvm::errs()); }
 
+void StmtBlock::print(raw_ostream &os) const {
+  MLFunction *function = findFunction();
+  ModuleState state(function->getContext());
+  ModulePrinter modulePrinter(os, state);
+  MLFunctionPrinter(function, modulePrinter).print(this);
+}
+
+void StmtBlock::dump() const { print(llvm::errs()); }
+
 void Function::print(raw_ostream &os) const {
   ModuleState state(getContext());
   ModulePrinter(os, state).print(this);
diff --git a/lib/IR/Statement.cpp b/lib/IR/Statement.cpp
index 978137b..44e44c8 100644
--- a/lib/IR/Statement.cpp
+++ b/lib/IR/Statement.cpp
@@ -77,6 +77,19 @@
   return nlc.numNestedLoops == 1;
 }
 
+Statement *Statement::clone() const {
+  switch (kind) {
+  case Kind::Operation:
+    return cast<OperationStmt>(this)->clone();
+  case Kind::If:
+    llvm_unreachable("cloning for if's not implemented yet");
+    return cast<IfStmt>(this)->clone();
+  case Kind::For:
+    llvm_unreachable("cloning for loops not implemented yet");
+    return cast<ForStmt>(this)->clone();
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // ilist_traits for Statement
 //===----------------------------------------------------------------------===//
@@ -227,6 +240,15 @@
       StmtBlock(StmtBlockKind::For), lowerBound(lowerBound),
       upperBound(upperBound), step(step) {}
 
+ForStmt *ForStmt::clone() const {
+  auto *stmt = new ForStmt(getLowerBound(), getUpperBound(), getStep(),
+                           Statement::findFunction()->getContext());
+  for (auto &s : getStatements()) {
+    stmt->getStatements().push_back(s.clone());
+  }
+  return stmt;
+}
+
 //===----------------------------------------------------------------------===//
 // IfStmt
 //===----------------------------------------------------------------------===//
@@ -236,3 +258,8 @@
   if (elseClause)
     delete elseClause;
 }
+
+IfStmt *IfStmt::clone() const {
+  llvm_unreachable("cloning for if's not implemented yet");
+  return nullptr;
+}
diff --git a/lib/Transforms/LoopUnroll.cpp b/lib/Transforms/LoopUnroll.cpp
index fe110d2..27bb43f 100644
--- a/lib/Transforms/LoopUnroll.cpp
+++ b/lib/Transforms/LoopUnroll.cpp
@@ -26,6 +26,7 @@
 #include "mlir/IR/Module.h"
 #include "mlir/IR/OperationSet.h"
 #include "mlir/IR/Pass.h"
+#include "mlir/IR/StandardOps.h"
 #include "mlir/IR/Statements.h"
 #include "mlir/IR/StmtVisitor.h"
 #include "mlir/Transforms/Passes.h"
@@ -96,61 +97,94 @@
     runOnForStmt(forStmt);
 }
 
-/// Replace an IV with a constant value.
-static void replaceIterator(Statement *stmt, const ForStmt &iv,
-                            MLValue *constVal) {
-  struct ReplaceIterator : public StmtWalker<ReplaceIterator> {
-    // IV to be replaced.
-    const ForStmt *iv;
-    // Constant to be replaced with.
-    MLValue *constVal;
+/// Replace all uses of 'oldVal' with 'newVal' in 'stmt'
+static void replaceAllStmtUses(Statement *stmt, MLValue *oldVal,
+                               MLValue *newVal) {
+  struct ReplaceUseWalker : public StmtWalker<ReplaceUseWalker> {
+    // Value to be replaced.
+    MLValue *oldVal;
+    // Value to be replaced with.
+    MLValue *newVal;
 
-    ReplaceIterator(const ForStmt &iv, MLValue *constVal)
-        : iv(&iv), constVal(constVal){};
+    ReplaceUseWalker(MLValue *oldVal, MLValue *newVal)
+        : oldVal(oldVal), newVal(newVal){};
 
     void visitOperationStmt(OperationStmt *os) {
       for (auto &operand : os->getStmtOperands()) {
-        if (operand.get() == static_cast<const MLValue *>(iv)) {
-          operand.set(constVal);
-        }
+        if (operand.get() == oldVal)
+          operand.set(newVal);
       }
     }
   };
 
-  ReplaceIterator ri(iv, constVal);
+  ReplaceUseWalker ri(oldVal, newVal);
   ri.walk(stmt);
 }
 
-/// Unrolls this loop completely.
+/// Unroll this 'for stmt' / loop completely.
 void LoopUnroll::runOnForStmt(ForStmt *forStmt) {
   auto lb = forStmt->getLowerBound()->getValue();
   auto ub = forStmt->getUpperBound()->getValue();
   auto step = forStmt->getStep()->getValue();
-  auto trip_count = (ub - lb + 1) / step;
 
+  // Builder to add constants needed for the unrolled iterator.
   auto *mlFunc = forStmt->Statement::findFunction();
   MLFuncBuilder funcTopBuilder(mlFunc);
   funcTopBuilder.setInsertionPointAtStart(mlFunc);
 
+  // Builder to insert the unrolled bodies.
   MLFuncBuilder builder(forStmt->getBlock());
-  for (int i = 0; i < trip_count; i++) {
-    auto *ivUnrolledVal = funcTopBuilder.createConstInt32Op(i)->getResult(0);
-    for (auto &stmt : forStmt->getStatements()) {
-      switch (stmt.getKind()) {
-      case Statement::Kind::For:
-        llvm_unreachable("unrolling loops that have only operations");
-        break;
-      case Statement::Kind::If:
-        llvm_unreachable("unrolling loops that have only operations");
-        break;
-      case Statement::Kind::Operation:
-        auto *cloneOp = builder.cloneOperation(*cast<OperationStmt>(&stmt));
-        // TODO(bondhugula): only generate constants when the IV actually
-        // appears in the body.
-        replaceIterator(cloneOp, *forStmt, ivUnrolledVal);
-        break;
+  // Set insertion point to right after where the for stmt ends.
+  builder.setInsertionPoint(forStmt->getBlock(),
+                            ++StmtBlock::iterator(forStmt));
+
+  // Unroll the contents of 'forStmt'.
+  for (int i = lb; i <= ub; i += step) {
+    // TODO(bondhugula): generate constants only when IV actually appears.
+    auto constOp = funcTopBuilder.create<ConstantIntOp>(i, 32);
+    auto *ivConst = cast<OperationStmt>(constOp->getOperation())->getResult(0);
+
+    // Iterator pointing to just before 'this' (i^th) unrolled iteration.
+    StmtBlock::iterator beforeUnrolledBody = --builder.getInsertionPoint();
+
+    // Pairs of <old op stmt result whose uses need to be replaced,
+    // new result generated by the corresponding cloned op stmt>.
+    SmallVector<std::pair<MLValue *, MLValue *>, 8> oldNewResultPairs;
+
+    for (auto &loopBodyStmt : forStmt->getStatements()) {
+      auto *cloneStmt = builder.clone(loopBodyStmt);
+      // Replace all uses of the IV in the clone with constant iteration value.
+      replaceAllStmtUses(cloneStmt, forStmt, ivConst);
+
+      // Whenever we have an op stmt, we'll have a new ML Value defined: replace
+      // uses of the old result with this one.
+      if (auto *opStmt = dyn_cast<OperationStmt>(&loopBodyStmt)) {
+        if (opStmt->getNumResults()) {
+          auto *cloneOpStmt = cast<OperationStmt>(cloneStmt);
+          for (unsigned i = 0, e = opStmt->getNumResults(); i < e; i++) {
+            // Store old/new result pairs.
+            // TODO *only* if needed later: storing of old/new results can be
+            // avoided by cloning the statement list in the reverse direction
+            // (and running the IR builder in reverse, i.e. using
+            // iplist.insertAfter()). That way, a newly created result can be
+            // immediately propagated to all its uses, which would already have
+            // been cloned/inserted.
+            oldNewResultPairs.push_back(std::make_pair(
+                &opStmt->getStmtResult(i), &cloneOpStmt->getStmtResult(i)));
+          }
+        }
+      }
+    }
+    // Replace uses of the old op results with the new results in the
+    // just-unrolled body.
+    StmtBlock::iterator endOfUnrolledBody = builder.getInsertionPoint();
+    for (auto it = ++beforeUnrolledBody; it != endOfUnrolledBody; it++) {
+      for (unsigned i = 0; i < oldNewResultPairs.size(); i++) {
+        replaceAllStmtUses(&(*it), oldNewResultPairs[i].first,
+                           oldNewResultPairs[i].second);
       }
     }
   }
+  // Erase the original for stmt from the block.
   forStmt->eraseFromBlock();
 }
diff --git a/test/Transforms/unroll.mlir b/test/Transforms/unroll.mlir
index 6a7d9cf..aef6a01 100644
--- a/test/Transforms/unroll.mlir
+++ b/test/Transforms/unroll.mlir
@@ -1,11 +1,11 @@
 // RUN: %S/../../mlir-opt %s -o - -unroll-innermost-loops | FileCheck %s
 
-// CHECK-LABEL: mlfunc @loops1() {
-mlfunc @loops1() {
-  // CHECK: %c0_i32 = constant 0 : i32
-  // CHECK-NEXT: %c1_i32 = constant 1 : i32
+// CHECK-LABEL: mlfunc @loop_nest_simplest() {
+mlfunc @loop_nest_simplest() {
+  // CHECK: %c1_i32 = constant 1 : i32
   // CHECK-NEXT: %c2_i32 = constant 2 : i32
   // CHECK-NEXT: %c3_i32 = constant 3 : i32
+  // CHECK-NEXT: %c4_i32 = constant 4 : i32
   // CHECK-NEXT: for %i0 = 1 to 100 step 2 {
   for %i = 1 to 100 step 2 {
     // CHECK: %c1_i32_0 = constant 1 : i32
@@ -19,40 +19,162 @@
   return  // CHECK:  return
 }         // CHECK }
 
-// CHECK-LABEL: mlfunc @loops2() {
-mlfunc @loops2() {
+// CHECK-LABEL: mlfunc @loop_nest_simple_iv_use() {
+mlfunc @loop_nest_simple_iv_use() {
+  // CHECK: %c1_i32 = constant 1 : i32
+  // CHECK-NEXT: %c2_i32 = constant 2 : i32
+  // CHECK-NEXT: %c3_i32 = constant 3 : i32
+  // CHECK-NEXT: %c4_i32 = constant 4 : i32
+  // CHECK-NEXT: for %i0 = 1 to 100 step 2 {
+  for %i = 1 to 100 step 2 {
+    // CHECK:       %0 = "addi32"(%c1_i32, %c1_i32) : (i32, i32) -> i32
+    // CHECK-NEXT:  %1 = "addi32"(%c2_i32, %c2_i32) : (i32, i32) -> i32
+    // CHECK-NEXT:  %2 = "addi32"(%c3_i32, %c3_i32) : (i32, i32) -> i32
+    // CHECK-NEXT:  %3 = "addi32"(%c4_i32, %c4_i32) : (i32, i32) -> i32
+    for %j = 1 to 4 {
+      %x = "addi32"(%j, %j) : (affineint, affineint) -> i32
+    }
+  }       // CHECK:  }
+  return  // CHECK:  return
+}         // CHECK }
+
+// CHECK-LABEL: mlfunc @loop_nest_strided() {
+mlfunc @loop_nest_strided() {
+  // CHECK: %c3_i32 = constant 3 : i32
+  // CHECK-NEXT: %c5_i32 = constant 5 : i32
+  // CHECK-NEXT: %c7_i32 = constant 7 : i32
+  // CHECK-NEXT: %c3_i32_0 = constant 3 : i32
+  // CHECK-NEXT: %c5_i32_1 = constant 5 : i32
+  // CHECK-NEXT: for %i0 = 1 to 100 {
+  for %i = 1 to 100 {
+    // CHECK:      %0 = affine_apply (d0) -> (d0 + 1)(%c3_i32_0)
+    // CHECK-NEXT: %1 = "addi32"(%0, %0) : (affineint, affineint) -> affineint
+    // CHECK-NEXT: %2 = affine_apply (d0) -> (d0 + 1)(%c5_i32_1)
+    // CHECK-NEXT: %3 = "addi32"(%2, %2) : (affineint, affineint) -> affineint
+    for %j = 3 to 6 step 2 {
+      %x = "affine_apply" (%j) { map: (d0) -> (d0 + 1) } :
+        (affineint) -> (affineint)
+      %y = "addi32"(%x, %x) : (affineint, affineint) -> affineint
+    }
+    // CHECK:      %4 = affine_apply (d0) -> (d0 + 1)(%c3_i32)
+    // CHECK-NEXT: %5 = "addi32"(%4, %4) : (affineint, affineint) -> affineint
+    // CHECK-NEXT: %6 = affine_apply (d0) -> (d0 + 1)(%c5_i32)
+    // CHECK-NEXT: %7 = "addi32"(%6, %6) : (affineint, affineint) -> affineint
+    // CHECK-NEXT: %8 = affine_apply (d0) -> (d0 + 1)(%c7_i32)
+    // CHECK-NEXT: %9 = "addi32"(%8, %8) : (affineint, affineint) -> affineint
+    for %k = 3 to 7 step 2 {
+      %z = "affine_apply" (%k) { map: (d0) -> (d0 + 1) } :
+        (affineint) -> (affineint)
+      %w = "addi32"(%z, %z) : (affineint, affineint) -> affineint
+    }
+  }       // CHECK:  }
+  return  // CHECK:  return
+}         // CHECK }
+
+// Operations in the loop body have results that are used therein.
+// CHECK-LABEL: mlfunc @loop_nest_body_def_use() {
+mlfunc @loop_nest_body_def_use() {
   // CHECK: %c0_i32 = constant 0 : i32
   // CHECK-NEXT: %c1_i32 = constant 1 : i32
   // CHECK-NEXT: %c2_i32 = constant 2 : i32
   // CHECK-NEXT: %c3_i32 = constant 3 : i32
-  // CHECK-NEXT: %c0_i32_0 = constant 0 : i32
-  // CHECK-NEXT: %c1_i32_1 = constant 1 : i32
-  // CHECK-NEXT: %c2_i32_2 = constant 2 : i32
-  // CHECK-NEXT: %c3_i32_3 = constant 3 : i32
   // CHECK-NEXT: for %i0 = 1 to 100 step 2 {
   for %i = 1 to 100 step 2 {
-     // CHECK: %0 = affine_apply (d0) -> (d0 + 1)(%c0_i32_0)
-    // CHECK-NEXT: %1 = affine_apply (d0) -> (d0 + 1)(%c1_i32_1)
-    // CHECK-NEXT: %2 = affine_apply (d0) -> (d0 + 1)(%c2_i32_2)
-    // CHECK-NEXT: %3 = affine_apply (d0) -> (d0 + 1)(%c3_i32_3)
+    // CHECK: %c0 = constant 0 : affineint
+    %c0 = constant 0 : affineint
+    // CHECK:      %0 = affine_apply (d0) -> (d0 + 1)(%c0_i32)
+    // CHECK-NEXT: %1 = "addi32"(%0, %c0) : (affineint, affineint) -> affineint
+    // CHECK-NEXT: %2 = affine_apply (d0) -> (d0 + 1)(%c1_i32)
+    // CHECK-NEXT: %3 = "addi32"(%2, %c0) : (affineint, affineint) -> affineint
+    // CHECK-NEXT: %4 = affine_apply (d0) -> (d0 + 1)(%c2_i32)
+    // CHECK-NEXT: %5 = "addi32"(%4, %c0) : (affineint, affineint) -> affineint
+    // CHECK-NEXT: %6 = affine_apply (d0) -> (d0 + 1)(%c3_i32)
+    // CHECK-NEXT: %7 = "addi32"(%6, %c0) : (affineint, affineint) -> affineint
+    for %j = 0 to 3 {
+      %x = "affine_apply" (%j) { map: (d0) -> (d0 + 1) } :
+        (affineint) -> (affineint)
+      %y = "addi32"(%x, %c0) : (affineint, affineint) -> affineint
+    }
+  }       // CHECK:  }
+  return  // CHECK:  return
+}         // CHECK }
+
+
+// Imperfect loop nest. Unrolling innermost here yields a perfect nest.
+// CHECK-LABEL: mlfunc @loop_nest_seq_imperfect(memref<128x128xf32>) {
+mlfunc @loop_nest_seq_imperfect(%a : memref<128x128xf32>) {
+  // CHECK: %c1_i32 = constant 1 : i32
+  // CHECK-NEXT: %c2_i32 = constant 2 : i32
+  // CHECK-NEXT: %c3_i32 = constant 3 : i32
+  // CHECK-NEXT: %c4_i32 = constant 4 : i32
+  // CHECK-NEXT: %c128 = constant 128 : affineint
+  %c128 = constant 128 : affineint
+  // CHECK: for %i0 = 1 to 100 {
+  for %i = 1 to 100 {
+    // CHECK: %0 = "vld"(%i0) : (affineint) -> i32
+    %ld = "vld"(%i) : (affineint) -> i32
+    // CHECK: %1 = affine_apply (d0) -> (d0 + 1)(%c1_i32)
+    // CHECK-NEXT: %2 = "vmulf"(%c1_i32, %1) : (i32, affineint) -> affineint
+    // CHECK-NEXT: %3 = "vaddf"(%2, %2) : (affineint, affineint) -> affineint
+    // CHECK-NEXT: %4 = affine_apply (d0) -> (d0 + 1)(%c2_i32)
+    // CHECK-NEXT: %5 = "vmulf"(%c2_i32, %4) : (i32, affineint) -> affineint
+    // CHECK-NEXT: %6 = "vaddf"(%5, %5) : (affineint, affineint) -> affineint
+    // CHECK-NEXT: %7 = affine_apply (d0) -> (d0 + 1)(%c3_i32)
+    // CHECK-NEXT: %8 = "vmulf"(%c3_i32, %7) : (i32, affineint) -> affineint
+    // CHECK-NEXT: %9 = "vaddf"(%8, %8) : (affineint, affineint) -> affineint
+    // CHECK-NEXT: %10 = affine_apply (d0) -> (d0 + 1)(%c4_i32)
+    // CHECK-NEXT: %11 = "vmulf"(%c4_i32, %10) : (i32, affineint) -> affineint
+    // CHECK-NEXT: %12 = "vaddf"(%11, %11) : (affineint, affineint) -> affineint
     for %j = 1 to 4 {
       %x = "affine_apply" (%j) { map: (d0) -> (d0 + 1) } :
         (affineint) -> (affineint)
+       %y = "vmulf"(%j, %x) : (affineint, affineint) -> affineint
+       %z = "vaddf"(%y, %y) : (affineint, affineint) -> affineint
     }
-  }    // CHECK:  }
+    // CHECK: %13 = "scale"(%c128, %i0) : (affineint, affineint) -> affineint
+    %addr = "scale"(%c128, %i) : (affineint, affineint) -> affineint
+    // CHECK: "vst"(%13, %i0) : (affineint, affineint) -> ()
+    "vst"(%addr, %i) : (affineint, affineint) -> ()
+  }       // CHECK }
+  return  // CHECK:  return
+}
+
+// CHECK-LABEL: mlfunc @loop_nest_seq_multiple() {
+mlfunc @loop_nest_seq_multiple() {
+  // CHECK: %c1_i32 = constant 1 : i32
+  // CHECK-NEXT: %c2_i32 = constant 2 : i32
+  // CHECK-NEXT: %c3_i32 = constant 3 : i32
+  // CHECK-NEXT: %c4_i32 = constant 4 : i32
+  // CHECK-NEXT: %c0_i32 = constant 0 : i32
+  // CHECK-NEXT: %c1_i32_0 = constant 1 : i32
+  // CHECK-NEXT: %c2_i32_1 = constant 2 : i32
+  // CHECK-NEXT: %c3_i32_2 = constant 3 : i32
+  // CHECK-NEXT: %0 = affine_apply (d0) -> (d0 + 1)(%c0_i32)
+  // CHECK-NEXT: "mul"(%0, %0) : (affineint, affineint) -> ()
+  // CHECK-NEXT: %1 = affine_apply (d0) -> (d0 + 1)(%c1_i32_0)
+  // CHECK-NEXT: "mul"(%1, %1) : (affineint, affineint) -> ()
+  // CHECK-NEXT: %2 = affine_apply (d0) -> (d0 + 1)(%c2_i32_1)
+  // CHECK-NEXT: "mul"(%2, %2) : (affineint, affineint) -> ()
+  // CHECK-NEXT: %3 = affine_apply (d0) -> (d0 + 1)(%c3_i32_2)
+  // CHECK-NEXT: "mul"(%3, %3) : (affineint, affineint) -> ()
+  for %j = 0 to 3 {
+    %x = "affine_apply" (%j) { map: (d0) -> (d0 + 1) } :
+      (affineint) -> (affineint)
+    "mul"(%x, %x) : (affineint, affineint) -> ()
+  }
 
   // CHECK: %c99 = constant 99 : affineint
   %k = "constant"(){value: 99} : () -> affineint
-  // CHECK: for %i1 = 1 to 100 step 2 {
+  // CHECK: for %i0 = 1 to 100 step 2 {
   for %m = 1 to 100 step 2 {
-    // CHECK: %4 = affine_apply (d0) -> (d0 + 1)(%c0_i32)
-    // CHECK-NEXT: %5 = affine_apply (d0)[s0] -> (d0 + s0 + 1)(%c0_i32)[%c99]
-    // CHECK-NEXT: %6 = affine_apply (d0) -> (d0 + 1)(%c1_i32)
-    // CHECK-NEXT: %7 = affine_apply (d0)[s0] -> (d0 + s0 + 1)(%c1_i32)[%c99]
-    // CHECK-NEXT: %8 = affine_apply (d0) -> (d0 + 1)(%c2_i32)
-    // CHECK-NEXT: %9 = affine_apply (d0)[s0] -> (d0 + s0 + 1)(%c2_i32)[%c99]
-    // CHECK-NEXT: %10 = affine_apply (d0) -> (d0 + 1)(%c3_i32)
-    // CHECK-NEXT: %11 = affine_apply (d0)[s0] -> (d0 + s0 + 1)(%c3_i32)[%c99]
+    // CHECK: %4 = affine_apply (d0) -> (d0 + 1)(%c1_i32)
+    // CHECK-NEXT: %5 = affine_apply (d0)[s0] -> (d0 + s0 + 1)(%c1_i32)[%c99]
+    // CHECK-NEXT: %6 = affine_apply (d0) -> (d0 + 1)(%c2_i32)
+    // CHECK-NEXT: %7 = affine_apply (d0)[s0] -> (d0 + s0 + 1)(%c2_i32)[%c99]
+    // CHECK-NEXT: %8 = affine_apply (d0) -> (d0 + 1)(%c3_i32)
+    // CHECK-NEXT: %9 = affine_apply (d0)[s0] -> (d0 + s0 + 1)(%c3_i32)[%c99]
+    // CHECK-NEXT: %10 = affine_apply (d0) -> (d0 + 1)(%c4_i32)
+    // CHECK-NEXT: %11 = affine_apply (d0)[s0] -> (d0 + s0 + 1)(%c4_i32)[%c99]
     for %n = 1 to 4 {
       %y = "affine_apply" (%n) { map: (d0) -> (d0 + 1) } :
         (affineint) -> (affineint)