#include <memory>
#include <sstream>
#include <stdexcept>
#include <unordered_map>
#include <gtest/gtest.h>
#include <test/cpp/tensorexpr/padded_buffer.h>
#include <torch/csrc/jit/tensorexpr/analysis.h>
#include <torch/csrc/jit/tensorexpr/bounds_inference.h>
#include <torch/csrc/jit/tensorexpr/eval.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_printer.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
static void verifyConstBounds(
const TensorAccessBoundsInfo& access_info,
const std::vector<std::pair<int, int>>& ref) {
size_t ndim = ref.size();
ASSERT_EQ(access_info.start.size(), ndim);
ASSERT_EQ(access_info.stop.size(), ndim);
for (size_t i = 0; i < ndim; i++) {
if (ref[i].first >= 0) { // Negative values are used to skip the check
ASSERT_TRUE(access_info.start[i]->isConstant());
int start_i = immediateAs<int>(access_info.start[i]);
ASSERT_EQ(start_i, ref[i].first);
}
if (ref[i].second >= 0) {
ASSERT_TRUE(access_info.stop[i]->isConstant());
int stop_i = immediateAs<int>(access_info.stop[i]);
ASSERT_EQ(stop_i, ref[i].second);
}
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, _1) {
// Verify that bounds inference works for the following example:
// for i in 0..100:
// b[i] = a[i]
// For this loop bounds inference should yield the following:
// {{b, kStore, 0, 99}, {a, kLoad, 0, 99}}
KernelScope kernel_scope;
ExprHandle n(100);
Placeholder a(BufHandle("a", {n}, kFloat));
Tensor* b =
Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
LoopNest l({b});
auto bounds_info = inferBounds(l.root_stmt());
// We should have two entries: one for 'b' and one for 'a'.
ASSERT_EQ(bounds_info.size(), 2);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{0, 99}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 99}});
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, _2) {
// Verify that bounds inference works for the following example:
// for i in 0..n:
// b[i] = a[i]
// For this loop bounds inference should yield the following:
// {{b, kStore, 0, n-1}, {a, kLoad, 0, n-1}}
KernelScope kernel_scope;
VarHandle n("n", kInt);
Placeholder a(BufHandle("a", {n}, kFloat));
Tensor* b =
Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
LoopNest l({b});
auto bounds_info = inferBounds(l.root_stmt());
// We should have two entries: one for 'b' and one for 'a'.
ASSERT_EQ(bounds_info.size(), 2);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{0, -1}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(b->buf())[0], {{0, -1}});
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, _3) {
// Verify that bounds inference works for the following example:
// for i in 0..100:
// b[i] = a[i] * a[i+10]
// For this loop bounds inference should yield the following:
// {{b, kStore, 0, 99}, {a, kLoad, 0, 109}}
KernelScope kernel_scope;
ExprHandle n(100);
Placeholder a(BufHandle("a", {n + 10}, kFloat));
Tensor* b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) {
return a.load(i) * a.load(i + 10);
});
LoopNest l({b});
auto bounds_info = inferBounds(l.root_stmt());
// We should have two entries: one for 'b' and one for 'a'.
ASSERT_EQ(bounds_info.size(), 2);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{0, 109}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 99}});
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, _4) {
// Verify that bounds inference works for the following example:
//
// for y in 0..200:
// for x in 0..320:
// b[y,x] = x*y
// for y in 0..200:
// for x in 0..320:
// c[y,x] = a[y,x] * b[y,x]
KernelScope kernel_scope;
ExprHandle W(320);
ExprHandle H(200);
Placeholder a(BufHandle("a", {H, W}, kFloat));
Tensor* b = Compute(
"b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
return x * y;
});
Tensor* c = Compute(
"c", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
return a.load(y, x) * b->load(y, x);
});
LoopNest l({c});
std::vector<For*> loops = l.getLoopStmtsFor(c);
Stmt* body = l.getLoopBodyFor(c);
{
// Infer bounds on the top-level loop scope
auto bounds_info = inferBounds(loops[0]);
ASSERT_EQ(bounds_info.size(), 3);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{0, 199}, {0, 319}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 199}, {0, 319}});
ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 199}, {0, 319}});
}
{
// Infer bounds on the inner loop scope
auto bounds_info = inferBounds(loops[1]);
ASSERT_EQ(bounds_info.size(), 3);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {0, 319}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {0, 319}});
ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {0, 319}});
}
{
// Infer bounds on the inner loop body's scope
auto bounds_info = inferBounds(body);
ASSERT_EQ(bounds_info.size(), 3);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {-1, -1}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {-1, -1}});
ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {-1, -1}});
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, _5) {
// Verify that bounds inference works for the following example:
// for i in 0..100:
// b[i] = a[i]
//
// ==> split ==>
//
// for i_outer in 0..100/16:
// for i_inner in 0..16:
// b[i_outer * 16 + i_inner] = a[i_outer * 16 + i_inner]
// for i_tail in 0..100%16:
// b[i_tail + (100/16)*16] = a[i_tail + (100/16)*16];
KernelScope kernel_scope;
ExprHandle n(100);
Placeholder a(BufHandle("a", {n}, kFloat));
Tensor* b =
Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
LoopNest l({b});
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
For* inner;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
For* tail;
std::vector<For*> loops = l.getLoopStmtsFor(b);
LoopNest::splitWithTail(loops[0], 16, &inner, &tail);
For* outer = loops[0];
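// With a split factor of 16, the outer loop runs 100/16 = 6 full iterations,
// covering indices [0, 95]; the tail loop covers the remaining
// 100 % 16 = 4 indices, [96, 99].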
{
// Verify inferred bounds for the outer loop
auto bounds_info = inferBounds(outer);
ASSERT_EQ(bounds_info.size(), 2);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{0, 95}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 95}});
}
{
// Verify inferred bounds for the tail loop
auto bounds_info = inferBounds(tail);
ASSERT_EQ(bounds_info.size(), 2);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{96, 99}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(b->buf())[0], {{96, 99}});
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, _6) {
// Verify that bounds inference works for the following example:
//
// for y in 0..200:
// for x in 0..320:
// b[y,x] = x*y
// for y in 0..20:
// for x in 0..32:
// c[y,x] = a[y+100,x+100] * b[y*2,x*5]
KernelScope kernel_scope;
ExprHandle W(320);
ExprHandle H(200);
ExprHandle CW(32);
ExprHandle CH(20);
Placeholder a(BufHandle("a", {H, W}, kFloat));
Tensor* b = Compute(
"b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
return x * y;
});
Tensor* c = Compute(
"c", {{CH, "y"}, {CW, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
return a.load(y + 100, x + 100) * b->load(y * 2, x * 5);
});
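// Expected access ranges, from the index expressions:
// a: rows y+100 for y in [0, 19] -> [100, 119]; cols x+100 for x in [0, 31]
// -> [100, 131].
// b: rows y*2 for y in [0, 19] -> [0, 38]; cols x*5 for x in [0, 31]
// -> [0, 155].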
LoopNest l({c});
std::vector<For*> loops = l.getLoopStmtsFor(c);
Stmt* body = l.getLoopBodyFor(c);
{
// Infer bounds on the top-level loop scope
auto bounds_info = inferBounds(loops[0]);
ASSERT_EQ(bounds_info.size(), 3);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{100, 119}, {100, 131}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 38}, {0, 155}});
ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 19}, {0, 31}});
}
{
// Infer bounds on the inner loop scope
auto bounds_info = inferBounds(loops[1]);
ASSERT_EQ(bounds_info.size(), 3);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {100, 131}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {0, 155}});
ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {0, 31}});
}
{
// Infer bounds on the inner loop body's scope
auto bounds_info = inferBounds(body);
ASSERT_EQ(bounds_info.size(), 3);
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {-1, -1}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {-1, -1}});
ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {-1, -1}});
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, Adjacent) {
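// Verify that bounds inference works when two adjacent loops read
// non-overlapping slices of the same buffer:
// for x in 0..6:
// b[x] = a[x]
// for x in 0..6:
// c[x] = a[x+6]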
KernelScope kernel_scope;
ExprHandle H(6);
Placeholder a(BufHandle("a", {20}, kFloat));
Tensor* b =
Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); });
Tensor* c = Compute(
"c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H); });
LoopNest l({b, c});
std::vector<For*> loops = NodeFinder<For>::find(l.root_stmt());
{
// Infer bounds on the first top-level loop (the one computing 'b')
auto bounds_info = inferBounds(loops[0]);
ASSERT_EQ(bounds_info.size(), 2);
// reads from a[0:5], writes to b[0:5]
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{0, 5}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 5}});
}
{
// Infer bounds on the second top-level loop (the one computing 'c')
auto bounds_info = inferBounds(loops[1]);
ASSERT_EQ(bounds_info.size(), 2);
// reads from a[0+6:5+6], writes to c[0:5]
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{6, 11}});
ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 5}});
}
{
// Infer bounds on the whole program (the root statement).
auto bounds_info = inferBounds(l.root_stmt());
ASSERT_EQ(bounds_info.size(), 3);
// This should be the union of the two bounds above, but this time the two
// accesses of 'a' can be merged into a single bound.
ASSERT_EQ(bounds_info.at(a.data()).size(), 1);
ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
verifyConstBounds(bounds_info.at(a.data())[0], {{0, 11}});
ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 5}});
ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 5}});
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, MultipleTopLoopLoad) {
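// Verify that bounds inference merges loads of the same buffer across several
// top-level loops:
// for x in 0..64:
// b[x] = a[x]
// for x in 0..32:
// c[x] = a[x+10]
// for x in 0..96:
// d[x] = a[x+2]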
KernelScope kernel_scope;
Placeholder a(BufHandle("a", {100}, kFloat));
Tensor* b =
Compute("b", {{64, "x"}}, [&](const VarHandle& x) { return a.load(x); });
Tensor* c = Compute(
"c", {{32, "x"}}, [&](const VarHandle& x) { return a.load(x + 10); });
Tensor* d = Compute(
"d", {{96, "x"}}, [&](const VarHandle& x) { return a.load(x + 2); });
LoopNest l({b, c, d});
auto bounds_info = inferBounds(l.root_stmt());
ASSERT_EQ(bounds_info.size(), 4);
// a only read.
{
auto bounds = bounds_info[a.data()];
ASSERT_EQ(bounds.size(), 1);
// One dimension.
auto bound = bounds[0];
ASSERT_EQ(bound.kind, TensorAccessKind::kLoad);
// Bounds:
// start: min over the three loads of (loop start + offset) = 0 + 0 = 0 (b).
// stop: max over the three loads of (loop stop + offset - 1) =
// 96 + 2 - 1 = 97 (d).
verifyConstBounds(bound, {{0, 97}});
}
// b, c, d only written.
{
auto bounds = bounds_info[b->buf()];
ASSERT_EQ(bounds.size(), 1);
auto bound = bounds[0];
ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
// Just the loop extents for b.
verifyConstBounds(bound, {{0, 63}});
}
{
auto bounds = bounds_info[c->buf()];
ASSERT_EQ(bounds.size(), 1);
auto bound = bounds[0];
ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
// Just the loop extents for c.
verifyConstBounds(bound, {{0, 31}});
}
{
auto bounds = bounds_info[d->buf()];
ASSERT_EQ(bounds.size(), 1);
auto bound = bounds[0];
ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
// Just the loop extents for d.
verifyConstBounds(bound, {{0, 95}});
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, MultipleTopLoopStore) {
KernelScope kernel_scope;
BufHandle a("a", {100}, kFloat);
BufHandle b("b", {100}, kFloat);
BufHandle c("c", {100}, kFloat);
BufHandle d("d", {100}, kFloat);
VarHandle x("x", kInt);
// Same as above, but with the offsets on the Stores instead of the Loads.
// This can't be expressed through the Compute API without transforms we don't
// have yet.
Stmt* stmt = Block::make(
{For::make(x, 0, 64, Store::make(b, {x}, Load::make(a, {x}))),
For::make(x, 0, 32, Store::make(c, {x + 10}, Load::make(a, {x}))),
For::make(x, 0, 96, Store::make(d, {x + 2}, Load::make(a, {x})))});
auto bounds_info = inferBounds(stmt);
ASSERT_EQ(bounds_info.size(), 4);
// a only read.
{
auto bounds = bounds_info[a.node()];
ASSERT_EQ(bounds.size(), 1);
// One dimension.
auto bound = bounds[0];
ASSERT_EQ(bound.kind, TensorAccessKind::kLoad);
// Bounds: there are no offsets, so this is just the max loop bounds.
verifyConstBounds(bound, {{0, 95}});
}
// b, c, d only written.
{
auto bounds = bounds_info[b.node()];
ASSERT_EQ(bounds.size(), 1);
auto bound = bounds[0];
ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
// The bounds should be {offset, offset + extent - 1} for the b loop.
// The b loop has no offset, so this is just the loop extent: {0, 63}.
verifyConstBounds(bound, {{0, 63}});
}
{
auto bounds = bounds_info[c.node()];
ASSERT_EQ(bounds.size(), 1);
auto bound = bounds[0];
ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
// The bounds should be {offset, offset + extent - 1} for the c loop.
// The offset is 10 and the extent is 32, giving {10, 41}.
verifyConstBounds(bound, {{10, 41}});
}
{
auto bounds = bounds_info[d.node()];
ASSERT_EQ(bounds.size(), 1);
auto bound = bounds[0];
ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
// The bounds should be {offset, offset + extent - 1} for the d loop.
// The offset is 2 and the extent is 96, giving {2, 97}.
verifyConstBounds(bound, {{2, 97}});
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, CacheReads) {
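// Verify that LoopNest::cacheAccesses introduces bounds for the new cache
// buffer without changing the bounds of any pre-existing buffer.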
KernelScope kernel_scope;
Tensor* A = Compute(
"A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
return i * j;
});
Tensor* B = Compute(
"B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
return A->load(i + 30, j + 3);
});
Tensor* C = Compute(
"C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
});
LoopNest l({B, C});
auto bounds_info_before = inferBounds(l.root_stmt());
Stmt* j_loop = l.getLoopStmtsFor(B)[1];
LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
auto bounds_info_after = inferBounds(l.root_stmt());
// cacheAccesses should not change the existing bounds, but should add a new
// entry for the cache buffer.
for (auto& pair : bounds_info_after) {
auto beforeIt = bounds_info_before.find(pair.first);
if (beforeIt != bounds_info_before.end()) {
// Same number of TensorAccessBoundsInfos.
ASSERT_EQ(pair.second.size(), beforeIt->second.size());
for (size_t i = 0; i < pair.second.size(); ++i) {
TensorAccessBoundsInfo& after = pair.second[i];
TensorAccessBoundsInfo& before = beforeIt->second[i];
// Same number of dimensions.
ASSERT_EQ(before.start.size(), after.start.size());
// Bounds are equal.
for (size_t j = 0; j < before.start.size(); ++j) {
ASSERT_TRUE(exprEquals(before.start[j], after.start[j]));
ASSERT_TRUE(exprEquals(before.stop[j], after.stop[j]));
}
}
} else {
// This should be the cache.
ASSERT_EQ(pair.first->name_hint(), "A_local");
// Should have both a load and a store.
ASSERT_EQ(pair.second.size(), 2);
TensorAccessBoundsInfo& first = pair.second[0];
TensorAccessBoundsInfo& second = pair.second[1];
ASSERT_NE(first.kind, second.kind);
// 2 dimensions.
ASSERT_EQ(first.start.size(), second.start.size());
ASSERT_EQ(first.start.size(), 2);
// bounds for load and store are equal.
for (size_t j = 0; j < first.start.size(); ++j) {
ASSERT_TRUE(exprEquals(first.start[j], second.start[j]));
ASSERT_TRUE(exprEquals(first.stop[j], second.stop[j]));
}
}
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, Flattened) {
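// Verify that after prepareForCodegen flattens the 3-D tensor 'b', bounds
// inference reports a single dimension covering all 3*4*5 elements.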
KernelScope kernel_scope;
Tensor* b = Compute(
"b",
{{3, "z"}, {4, "y"}, {5, "x"}},
[&](const VarHandle& z, const VarHandle& y, const VarHandle& x) {
return x * y + z;
});
LoopNest l({b});
// Flatten indices.
l.prepareForCodegen();
auto bounds_info = inferBounds(l.root_stmt());
// There's only one buffer.
ASSERT_EQ(bounds_info.size(), 1);
auto& TABI = bounds_info[b->buf()][0];
ASSERT_EQ(TABI.kind, TensorAccessKind::kStore);
// Flattened bounds should have a single dimension.
ASSERT_EQ(TABI.start.size(), 1);
ASSERT_EQ(TABI.stop.size(), 1);
// Bounds should be 0 -> (3*4*5)-1
ASSERT_TRUE(exprEquals(TABI.start[0], new IntImm(0)));
ASSERT_TRUE(exprEquals(TABI.stop[0], new IntImm(3 * 4 * 5 - 1)));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, GetPotentialHazards) {
KernelScope kernel_scope;
BufHandle a("A", {5}, kInt);
BufHandle b("B", {5}, kInt);
BufHandle c("C", {5}, kInt);
VarHandle x("x", kInt);
VarHandle y("y", kInt);
using namespace analysis;
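// Hazard kinds: RAW = read-after-write (true dependence), WAR =
// write-after-read (anti-dependence), WAW = write-after-write (output
// dependence).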
{
/*
* A[0] = B[0];
* B[0] = 3; WAR on B
* A[0] = B[0]; WAW on A, RAW on B
* C[0] = 5;
*/
Store* store1 = Store::make(a, {0}, Load::make(b, {0}));
Store* store2 = Store::make(b, {0}, 3);
Store* store3 = Store::make(a, {0}, Load::make(b, {0}));
Store* store4 = Store::make(c, {0}, 5);
Stmt* stmt = Block::make({store1, store2, store3, store4});
MemDependencyChecker analyzer;
stmt->accept(&analyzer);
ASSERT_EQ(
HazardKind::WriteAfterRead,
getPotentialHazards(analyzer, store1, store2));
ASSERT_EQ(
HazardKind::ReadAfterWrite,
getPotentialHazards(analyzer, store2, store3));
ASSERT_EQ(
HazardKind::WriteAfterWrite,
getPotentialHazards(analyzer, store1, store3));
// Fourth store has no dependencies
ASSERT_EQ(
HazardKind::NoDependency,
getPotentialHazards(analyzer, store1, store4));
ASSERT_EQ(
HazardKind::NoDependency,
getPotentialHazards(analyzer, store2, store4));
ASSERT_EQ(
HazardKind::NoDependency,
getPotentialHazards(analyzer, store3, store4));
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) {
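// A and B are computed purely from their loop indices and share no buffers,
// so their loop nests should have no dependencies.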
KernelScope kernel_scope;
Tensor* A = Compute(
"A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
return i * j;
});
Tensor* B = Compute(
"B", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
return (i + 1) * (j + 1);
});
LoopNest l({A, B});
using namespace analysis;
MemDependencyChecker analyzer;
l.root_stmt()->accept(&analyzer);
For* loopRootA = l.getLoopStmtsFor(A)[0];
For* loopRootB = l.getLoopStmtsFor(B)[0];
// No dependencies between loops.
ASSERT_EQ(
HazardKind::NoDependency,
getPotentialHazards(analyzer, loopRootA, loopRootB));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, GetPotentialHazardsLoopCall) {
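// B loads from A, so the loop nest computing B has a read-after-write hazard
// on the loop nest computing A.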
KernelScope kernel_scope;
Tensor* A = Compute(
"A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
return i * j;
});
Tensor* B = Compute(
"B", {{64, "i"}, {64, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
return A->load(i, j) + 5;
});
LoopNest l({A, B});
using namespace analysis;
MemDependencyChecker analyzer;
l.root_stmt()->accept(&analyzer);
For* loopRootA = l.getLoopStmtsFor(A)[0];
For* loopRootB = l.getLoopStmtsFor(B)[0];
ASSERT_EQ(
HazardKind::ReadAfterWrite,
getPotentialHazards(analyzer, loopRootA, loopRootB));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, GetPotentialHazardsLoopSplit) {
KernelScope kernel_scope;
Tensor* A = Compute(
"A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
return i * j;
});
LoopNest l({A});
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
For *inner, *tail;
// Splitting with a tail by a factor that doesn't evenly divide the loop
// extent creates a tail loop which also writes to A.
For* outer = l.getLoopStmtsFor(A)[0];
// The `outer` loop is transformed into the outer loop of the split.
LoopNest::splitWithTail(outer, 5, &inner, &tail);
using namespace analysis;
MemDependencyChecker analyzer;
l.root_stmt()->accept(&analyzer);
ASSERT_EQ(
HazardKind::WriteAfterWrite, getPotentialHazards(analyzer, outer, tail));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, HasConflictingOverlapSameBufferWithPartialOverlap) {
KernelScope kernel_scope;
// Input IR:
// for (int j = 10; j < 100; j++) {
// A[j] = 10 * j;
// }
// for (int k = 10; k < 100; k++) {
// A[k-1] = 20 * k;
// }
BufHandle a_buf("A", {200}, kInt);
VarHandle j("j", kInt);
VarHandle k("k", kInt);
auto forJ = For::make(j, 10, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK =
For::make(k, 10, 100, Store::make(a_buf, {k - 1}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
tensorexpr::analysis::MemDependencyChecker analyzer;
par->accept(&analyzer);
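// forJ writes A[10, 99] and forK writes A[9, 98]; the ranges partially
// overlap.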
ASSERT_TRUE(hasConflictingOverlap(analyzer, forJ, forK));
ASSERT_TRUE(hasConflictingOverlap(analyzer, forK, forJ));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlap) {
KernelScope kernel_scope;
// Input IR:
// for (int j = 10; j < 100; j++) {
// A[j] = 10 * j;
// }
// for (int k = 10; k < 100; k++) {
// A[k] = 20 * k;
// }
BufHandle a_buf("A", {200}, kInt);
VarHandle j("j", kInt);
VarHandle k("k", kInt);
auto forJ = For::make(j, 10, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 10, 100, Store::make(a_buf, {k}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
tensorexpr::analysis::MemDependencyChecker analyzer;
par->accept(&analyzer);
ASSERT_TRUE(hasConflictingOverlap(analyzer, forJ, forK));
ASSERT_TRUE(hasConflictingOverlap(analyzer, forK, forJ));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlapRAW) {
KernelScope kernel_scope;
// Input IR:
// for (int j = 10; j < 100; j++) {
// A[j] = 10 * j;
// }
// for (int k = 10; k < 100; k++) {
// B[k] = A[k];
// }
BufHandle a_buf("A", {200}, kInt);
BufHandle b_buf("B", {200}, kInt);
VarHandle j("j", kInt);
VarHandle k("k", kInt);
auto forJ = For::make(j, 10, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK =
For::make(k, 10, 100, Store::make(b_buf, {k}, Load::make(a_buf, {k})));
auto par = Block::make({forJ, forK});
tensorexpr::analysis::MemDependencyChecker analyzer;
par->accept(&analyzer);
ASSERT_TRUE(hasConflictingOverlap(analyzer, forJ, forK));
ASSERT_TRUE(hasConflictingOverlap(analyzer, forK, forJ));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, HasConflictingOverlapSameBufferNotOverlapping) {
KernelScope kernel_scope;
// Input IR:
// for (int j = 10; j < 100; j++) {
// A[j] = 10 * j;
// }
// for (int k = 10; k < 100; k++) {
// A[k+100] = 20 * k;
// }
BufHandle a_buf("A", {200}, kInt);
VarHandle j("j", kInt);
VarHandle k("k", kInt);
auto forJ = For::make(j, 10, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK =
For::make(k, 10, 100, Store::make(a_buf, {k + 100}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
tensorexpr::analysis::MemDependencyChecker analyzer;
par->accept(&analyzer);
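// forJ writes A[10, 99] and forK writes A[110, 199]; the ranges are disjoint.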
ASSERT_FALSE(hasConflictingOverlap(analyzer, forJ, forK));
ASSERT_FALSE(hasConflictingOverlap(analyzer, forK, forJ));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, HasConflictingOverlap2DBufferWithOverlap) {
KernelScope kernel_scope;
// Input IR:
// for (int i = 0; i < 20; i++) {
// for (int j = 0; j < 100; j++) {
// A[i,j] = i * j * 500;
// }
// }
// for (int m = 0; m < 20; m++) {
// for (int n = 0; n < 50; n++) {
// A[m+1,n] = m + n * 100;
// }
// }
BufHandle a_buf("A", {20, 100}, kInt);
BufHandle b_buf("B", {20, 50}, kInt);
VarHandle i("i", kInt);
VarHandle j("j", kInt);
VarHandle m("m", kInt);
VarHandle n("n", kInt);
auto storeA1 = Store::make(a_buf, {i, j}, Mul::make(Mul::make(i, j), 500));
auto forJ = For::make(j, 0, 100, storeA1);
auto forI = For::make(i, 0, 20, forJ);
auto storeA2 =
Store::make(a_buf, {m + 1, n}, Add::make(m, Mul::make(n, 100)));
auto forN = For::make(n, 0, 50, storeA2);
auto forM = For::make(m, 0, 20, forN);
auto par = Block::make({forI, forM});
tensorexpr::analysis::MemDependencyChecker analyzer;
par->accept(&analyzer);
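// The first nest writes rows [0, 19] of A and the second writes rows [1, 20];
// the written regions overlap.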
ASSERT_TRUE(hasConflictingOverlap(analyzer, forI, forM));
ASSERT_TRUE(hasConflictingOverlap(analyzer, forM, forI));
ASSERT_TRUE(hasConflictingOverlap(analyzer, forJ, forN));
ASSERT_TRUE(hasConflictingOverlap(analyzer, forN, forJ));
ASSERT_TRUE(hasConflictingOverlap(analyzer, storeA1, storeA2));
ASSERT_TRUE(hasConflictingOverlap(analyzer, storeA2, storeA1));
ASSERT_TRUE(hasConflictingOverlap(analyzer, forJ, storeA2));
ASSERT_TRUE(hasConflictingOverlap(analyzer, storeA1, forM));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, HasConflictingOverlap2DBufferWithNoOverlap) {
KernelScope kernel_scope;
// Input IR:
// for (int i = 0; i < 20; i++) {
// for (int j = 0; j < 100; j++) {
// A[i,j] = i * j * 500;
// }
// }
// for (int m = 0; m < 20; m++) {
// for (int n = 0; n < 50; n++) {
// A[m+20,n+100] = m + n * 100;
// }
// }
BufHandle a_buf("A", {20, 100}, kInt);
BufHandle b_buf("B", {20, 50}, kInt);
VarHandle i("i", kInt);
VarHandle j("j", kInt);
VarHandle m("m", kInt);
VarHandle n("n", kInt);
auto storeA1 = Store::make(a_buf, {i, j}, Mul::make(Mul::make(i, j), 500));
auto forJ = For::make(j, 0, 100, storeA1);
auto forI = For::make(i, 0, 20, forJ);
auto storeA2 =
Store::make(a_buf, {m + 20, n + 100}, Add::make(m, Mul::make(n, 100)));
auto forN = For::make(n, 0, 50, storeA2);
auto forM = For::make(m, 0, 20, forN);
auto par = Block::make({forI, forM});
tensorexpr::analysis::MemDependencyChecker analyzer;
par->accept(&analyzer);
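// The second nest writes rows [20, 39] and columns [100, 149] of A, disjoint
// from the rows [0, 19] and columns [0, 99] written by the first.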
ASSERT_FALSE(hasConflictingOverlap(analyzer, forI, forM));
ASSERT_FALSE(hasConflictingOverlap(analyzer, forM, forI));
ASSERT_FALSE(hasConflictingOverlap(analyzer, forJ, forN));
ASSERT_FALSE(hasConflictingOverlap(analyzer, forN, forJ));
ASSERT_FALSE(hasConflictingOverlap(analyzer, storeA1, storeA2));
ASSERT_FALSE(hasConflictingOverlap(analyzer, storeA2, storeA1));
ASSERT_FALSE(hasConflictingOverlap(analyzer, forJ, storeA2));
ASSERT_FALSE(hasConflictingOverlap(analyzer, storeA1, forM));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, HasConflictingOverlapDifferentBuffers) {
KernelScope kernel_scope;
// Input IR:
// for (int i = 0; i < 20; i++) {
// for (int j = 0; j < 100; j++) {
// A[i,j] = i * j * 500;
// }
// }
// for (int m = 0; m < 20; m++) {
// for (int n = 0; n < 50; n++) {
// B[m,n] = m + n * 100;
// }
// }
BufHandle a_buf("A", {20, 100}, kInt);
BufHandle b_buf("B", {20, 50}, kInt);
VarHandle i("i", kInt);
VarHandle j("j", kInt);
VarHandle m("m", kInt);
VarHandle n("n", kInt);
auto storeA1 = Store::make(a_buf, {i, j}, Mul::make(Mul::make(i, j), 500));
auto forJ = For::make(j, 0, 100, storeA1);
auto forI = For::make(i, 0, 20, forJ);
auto storeA2 = Store::make(b_buf, {m, n}, Add::make(m, Mul::make(n, 100)));
auto forN = For::make(n, 0, 50, storeA2);
auto forM = For::make(m, 0, 20, forN);
auto par = Block::make({forI, forM});
tensorexpr::analysis::MemDependencyChecker analyzer;
par->accept(&analyzer);
ASSERT_FALSE(hasConflictingOverlap(analyzer, forI, forM));
ASSERT_FALSE(hasConflictingOverlap(analyzer, forM, forI));
ASSERT_FALSE(hasConflictingOverlap(analyzer, forJ, forN));
ASSERT_FALSE(hasConflictingOverlap(analyzer, forN, forJ));
ASSERT_FALSE(hasConflictingOverlap(analyzer, storeA1, storeA2));
ASSERT_FALSE(hasConflictingOverlap(analyzer, storeA2, storeA1));
ASSERT_FALSE(hasConflictingOverlap(analyzer, forJ, storeA2));
ASSERT_FALSE(hasConflictingOverlap(analyzer, storeA1, forM));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, HasConflictingOverlapDueToRAWDependence) {
KernelScope kernel_scope;
// Input IR:
// for (int j = 0; j < 100; j++) {
// A[j] = 10 * j;
// }
// for (int k = 0; k < 100; k++) {
// B[k] = 20 * A[99-k];
// }
BufHandle a_buf("A", {100}, kInt);
BufHandle b_buf("B", {100}, kInt);
VarHandle j("j", kInt);
VarHandle k("k", kInt);
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(
k,
0,
100,
Store::make(
b_buf, {k}, Mul::make(20, Load::make(a_buf, {ExprHandle(99) - k}))));
auto par = Block::make({forJ, forK});
tensorexpr::analysis::MemDependencyChecker analyzer;
par->accept(&analyzer);
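// forK reads A[99-k] for k in [0, 99], i.e. A[0, 99], which forJ writes:
// a read-after-write conflict.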
ASSERT_TRUE(hasConflictingOverlap(analyzer, forJ, forK));
ASSERT_TRUE(hasConflictingOverlap(analyzer, forK, forJ));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, HasConflictingOverlapDueToWARDependence) {
KernelScope kernel_scope;
// Input IR:
// for (int k = 0; k < 100; k++) {
// B[k] = 20 * A[99-k];
// }
// for (int j = 0; j < 100; j++) {
// A[j] = 10 * j;
// }
BufHandle a_buf("A", {100}, kInt);
BufHandle b_buf("B", {100}, kInt);
VarHandle j("j", kInt);
VarHandle k("k", kInt);
auto forK = For::make(
k,
0,
100,
Store::make(
b_buf, {k}, Mul::make(20, Load::make(a_buf, {ExprHandle(99) - k}))));
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto par = Block::make({forK, forJ});
tensorexpr::analysis::MemDependencyChecker analyzer;
par->accept(&analyzer);
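// forJ overwrites A[0, 99], which forK previously read: a write-after-read
// conflict.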
ASSERT_TRUE(hasConflictingOverlap(analyzer, forJ, forK));
ASSERT_TRUE(hasConflictingOverlap(analyzer, forK, forJ));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, HasConflictingOverlapWithLoads) {
KernelScope kernel_scope;
// Input IR:
// for (int k = 10; k < 100; k++) {
// B[k] = 20 * A[99-k];
// }
// for (int j = 10; j < 100; j++) {
// C[j] = 10 * A[j];
// }
BufHandle a_buf("A", {100}, kInt);
BufHandle b_buf("B", {100}, kInt);
BufHandle c_buf("C", {100}, kInt);
VarHandle j("j", kInt);
VarHandle k("k", kInt);
auto forK = For::make(
k,
10,
100,
Store::make(
b_buf, {k}, Mul::make(20, Load::make(a_buf, {ExprHandle(99) - k}))));
auto forJ = For::make(
j,
10,
100,
Store::make(c_buf, {j}, Mul::make(10, Load::make(a_buf, {j}))));
auto par = Block::make({forK, forJ});
tensorexpr::analysis::MemDependencyChecker analyzer;
par->accept(&analyzer);
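// Both loops only read from A and write to distinct buffers (B and C);
// overlapping loads do not conflict.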
ASSERT_FALSE(hasConflictingOverlap(analyzer, forJ, forK));
ASSERT_FALSE(hasConflictingOverlap(analyzer, forK, forJ));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(BoundsInference, IsOverlapping) {
KernelScope kernel_scope;
// Input IR:
// for (int i = 0; i < 100; i++) {
// A[i] = i * 10; // storeA1
// B[i] = A[99-i] * 20; // loadA1
// C[i] = A[i + 100] * 10; // loadA2
// A[i + 50] = i * 50; // storeA2
// A[i + 150] = i * 150; // storeA3
// }
BufHandle a_buf("A", {300}, kInt);
BufHandle b_buf("B", {100}, kInt);
BufHandle c_buf("C", {100}, kInt);
VarHandle i("i", kInt);
auto storeA1 = Store::make(a_buf, {i}, i * 10);
auto loadA1 = Load::make(a_buf, {ExprHandle(99) - i});
auto storeB = Store::make(b_buf, {i}, Mul::make(loadA1, 20));
auto loadA2 = Load::make(a_buf, {i + 100});
auto storeC = Store::make(c_buf, {i}, Mul::make(loadA2, 10));
auto storeA2 = Store::make(a_buf, {i + 50}, i * 50);
auto storeA3 = Store::make(a_buf, {i + 150}, i * 150);
auto forI = For::make(
i, 0, 100, Block::make({storeA1, storeB, storeC, storeA2, storeA3}));
tensorexpr::analysis::MemDependencyChecker analyzer;
forI->accept(&analyzer);
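// Over i in [0, 99]: storeA1 writes A[0, 99]; loadA1 reads A[0, 99];
// loadA2 reads A[100, 199]; storeA2 writes A[50, 149]; storeA3 writes
// A[150, 249].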
ASSERT_TRUE(
isOverlapping(analyzer, storeA1, dynamic_cast<Load*>(loadA1.node())));
ASSERT_FALSE(
isOverlapping(analyzer, storeA1, dynamic_cast<Load*>(loadA2.node())));
ASSERT_TRUE(isOverlapping(analyzer, storeA1, storeA2));
ASSERT_FALSE(isOverlapping(analyzer, storeA1, storeA3));
}
} // namespace jit
} // namespace torch