| #include <test/cpp/tensorexpr/test_base.h> |
| #include <memory> |
| #include <sstream> |
| #include <stdexcept> |
| #include <unordered_map> |
| |
| #include <test/cpp/tensorexpr/padded_buffer.h> |
| #include <torch/csrc/jit/tensorexpr/analysis.h> |
| #include <torch/csrc/jit/tensorexpr/bounds_inference.h> |
| #include <torch/csrc/jit/tensorexpr/buffer.h> |
| #include <torch/csrc/jit/tensorexpr/eval.h> |
| #include <torch/csrc/jit/tensorexpr/function.h> |
| #include <torch/csrc/jit/tensorexpr/ir.h> |
| #include <torch/csrc/jit/tensorexpr/ir_printer.h> |
| #include <torch/csrc/jit/tensorexpr/ir_simplifier.h> |
| #include <torch/csrc/jit/tensorexpr/loopnest.h> |
| #include <torch/csrc/jit/tensorexpr/tensor.h> |
| #include <torch/csrc/jit/testing/file_check.h> |
| |
| namespace torch { |
| namespace jit { |
| |
| using namespace torch::jit::tensorexpr; |
| |
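| // Exercise splitWithTail twice on the same axis: first on the original x loop, |
| // then on the x_outer loop produced by the first split. No assertions here; the |
| // test just checks that the transformations succeed. |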
| void testExprSimple01() { |
| KernelScope kernel_scope; |
| Tensor* tensor = Compute( |
| "f", {{16, "X"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { |
| return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y; |
| }); |
| LoopNest l({tensor}); |
| For* x_outer; |
| For* x_inner; |
| For* x_tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.splitWithTail(loops[0], 2, &x_outer, &x_inner, &x_tail); |
| |
| For* x_2; |
| For* x_1; |
| For* x_tail_2; |
| l.splitWithTail(x_outer, 2, &x_2, &x_1, &x_tail_2); |
| } |
| |
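| // Lowering a small 2-D Compute should produce a printable statement of modest size. |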
| void testExprLower01() { |
| KernelScope kernel_scope; |
| Tensor* tensor = Compute( |
| "f", {{16, "x"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { |
| return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y; |
| }); |
| LoopNest l({tensor}); |
| Stmt* stmt = l.root_stmt(); |
| std::ostringstream oss; |
| oss << *stmt; |
| ASSERT_GT(oss.str().size(), 20); |
| ASSERT_LT(oss.str().size(), 200); |
| } |
| |
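| // splitWithTail with a factor (4) that does not evenly divide the extent (26): the |
| // result is compared textually against a hand-built reference nest and then |
| // evaluated to check the numerical result. |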
| void testExprSimple02() { |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x, const ExprHandle& y) { |
| return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y; |
| }; |
| Tensor* tensor = Compute("f", {{26, "x"}, {5, "y"}}, func); |
| LoopNest l({tensor}); |
| For* x_outer; |
| For* x_inner; |
| For* x_tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.splitWithTail(loops[0], 4, &x_outer, &x_inner, &x_tail); |
| |
| Stmt* stmt = l.root_stmt(); |
| std::ostringstream oss; |
| oss << *stmt; |
| ASSERT_GT(oss.str().size(), 200); |
| ASSERT_LT(oss.str().size(), 600); |
| |
| { |
| // Compare against a reference loop structure. |
| VarHandle x_outer("x_outer", kInt); |
| VarHandle x_inner("x_inner", kInt); |
| VarHandle y("y", kInt); |
| VarHandle x_tail("x_tail", kInt); |
| BufHandle f("f", {26, 5}, kFloat); |
| ExprHandle x_1 = x_outer * 4 + x_inner; |
| ExprHandle x_outer_end = (ExprHandle(26) - 0) / 4; |
| For* stmt1 = For::make( |
| x_outer, |
| 0, |
| x_outer_end, |
| For::make( |
| x_inner, |
| 0, |
| 4, |
| For::make(y, 0, 5, Store::make(f, {x_1, y}, func(x_1, y), 1)))); |
| ExprHandle x_2 = x_tail + x_outer_end * 4; |
| For* stmt2 = For::make( |
| x_tail, |
| 0, |
| (ExprHandle(26) - 0) % 4, |
| For::make(y, 0, 5, Store::make(f, {x_2, y}, func(x_2, y), 1))); |
| Stmt* stmt = Block::make({stmt1, stmt2}); |
| |
| std::ostringstream oss_ref; |
| oss_ref << *stmt; |
| ASSERT_EQ(oss.str(), oss_ref.str()); |
| } |
| |
| { |
| PaddedBuffer<float> f_v(26, 5, "f_v"); |
| PaddedBuffer<float> f_ref(26, 5, "f_res"); |
| |
| stmt = FlattenIndexes(stmt); |
| SimpleIREvaluator ir_eval(stmt, tensor); |
| ir_eval(f_v); |
| |
| for (int x = 0; x < 26; x++) { |
| for (int y = 0; y < 5; y++) { |
| f_ref(x, y) = 1 + x * x + y * y; |
| } |
| } |
| |
| ExpectAllNear(f_v, f_ref, 1e-5); |
| } |
| } |
| |
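| // Helpers for the slicing tests below: simplify the loop nest's root statement and |
| // assert that its top-level statements are For loops with the given constant bounds. |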
| Block* getSimplifiedBody(const LoopNest& l) { |
| Stmt* stmt = l.root_stmt(); |
| Stmt* simplified = IRSimplifier::simplify(stmt); |
| return dynamic_cast<Block*>(simplified); |
| } |
| |
| void assertForRange(For* f, int expected_start, int expected_stop) { |
| ASSERT_NE(f, nullptr); |
| const IntImm* start = dynamic_cast<const IntImm*>(f->start()); |
| ASSERT_NE(start, nullptr); |
| ASSERT_EQ(start->value(), expected_start); |
| const IntImm* stop = dynamic_cast<const IntImm*>(f->stop()); |
| ASSERT_NE(stop, nullptr); |
| ASSERT_EQ(stop->value(), expected_stop); |
| } |
| |
| void assertForRanges( |
| Block* body, |
| const std::vector<std::pair<int, int>>& start_stops) { |
| ASSERT_EQ(body->nstmts(), start_stops.size()); |
| |
| auto it = body->begin(); |
| for (size_t i = 0; i < start_stops.size(); i++, it++) { |
| For* loop = dynamic_cast<For*>(*it); |
| assertForRange(loop, start_stops[i].first, start_stops[i].second); |
| } |
| } |
| |
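| // Slicing the head of a loop bound to a GPU block index: exactly one of the |
| // resulting loops (here the tail) should keep the binding, while the new head loop |
| // gets default loop options. |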
| void testExprSliceHeadWithLoopOptions() { |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{10, "x"}}, func); |
| LoopNest l({tensor}); |
| For* head; |
| For* tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.setGPUBlockIndex(loops[0], LoopOptions::IDX_Y); |
| l.sliceHead(loops[0], 2, &head, &tail); |
| |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 2}, {0, 8}}); |
| |
| ASSERT_TRUE(tail->loop_options().is_gpu_block_index()); |
| ASSERT_EQ(tail->loop_options().gpu_block_index(), LoopOptions::IDX_Y); |
| |
| ASSERT_TRUE(head->loop_options().isDefault()); |
| } |
| |
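| // Same idea for sliceTail: the binding placed on the tail loop is expected to |
| // survive on tail_head, while head and tail_tail keep default options. |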
| void testExprSliceTailWithLoopOptions() { |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{10, "x"}}, func); |
| LoopNest l({tensor}); |
| For* head; |
| For* tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.sliceTail(loops[0], 4, &head, &tail); |
| |
| For* tail_head; |
| For* tail_tail; |
| l.setGPUBlockIndex(tail, LoopOptions::IDX_Y); |
| l.sliceTail(tail, 2, &tail_head, &tail_tail); |
| |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 6}, {0, 2}, {8, 10}}); |
| |
| ASSERT_TRUE(tail_head->loop_options().is_gpu_block_index()); |
| ASSERT_EQ(tail_head->loop_options().gpu_block_index(), LoopOptions::IDX_Y); |
| |
| ASSERT_TRUE(head->loop_options().isDefault()); |
| ASSERT_TRUE(tail_tail->loop_options().isDefault()); |
| } |
| |
| void testExprSliceHeadWhenFactorEqualsSize() { |
| // When factor equals the For loop's original size, keep using the original |
| // For loop. |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{10, "x"}}, func); |
| LoopNest l({tensor}); |
| For* head; |
| For* tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.sliceHead(loops[0], 10, &head, &tail); |
| |
| ASSERT_EQ(head, loops[0]); |
| ASSERT_EQ(tail, nullptr); |
| |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 10}}); |
| } |
| |
| void testExprSliceHeadWhenFactorLargerThanSize() { |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{10, "x"}}, func); |
| LoopNest l({tensor}); |
| For* head; |
| For* tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.sliceHead(loops[0], 100, &head, &tail); |
| |
| ASSERT_EQ(head, loops[0]); |
| ASSERT_EQ(tail, nullptr); |
| |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 10}}); |
| } |
| |
| void testExprSliceHead() { |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{10, "x"}}, func); |
| LoopNest l({tensor}); |
| For* head; |
| For* tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.sliceHead(loops[0], 4, &head, &tail); |
| |
| ASSERT_NE(head, nullptr); |
| ASSERT_NE(head, loops[0]); |
| ASSERT_NE(tail, nullptr); |
| ASSERT_NE(tail, loops[0]); |
| |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 4}, {4, 10}}); |
| } |
| |
| void testExprSliceHeadWithNonZeroStart() { |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{10, "x"}}, func); |
| LoopNest l({tensor}); |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| |
| For* head; |
| For* tail; |
| l.sliceTail(loops[0], 4, &head, &tail); |
| // head: [0, 6) |
| // tail: [6, 10) |
| |
| For* tail_head; |
| For* tail_tail; |
| l.sliceHead(tail, 2, &tail_head, &tail_tail); |
| // tail_head: [6, 8) |
| // tail_tail: [8, 10) |
| |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 6}, {6, 8}, {8, 10}}); |
| } |
| |
| void testExprSliceTailWhenFactorEqualsSize() { |
| // When factor equals the For loop's original size, keep using the original |
| // For loop. |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{10, "x"}}, func); |
| LoopNest l({tensor}); |
| For* head; |
| For* tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.sliceTail(loops[0], 10, &head, &tail); |
| |
| ASSERT_EQ(head, nullptr); |
| ASSERT_EQ(tail, loops[0]); |
| |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 10}}); |
| } |
| |
| void testExprSliceTailWhenFactorLargerThanSize() { |
| // When factor is larger than the For loop's original size, keep using the |
| // original For loop. |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{10, "x"}}, func); |
| LoopNest l({tensor}); |
| For* head; |
| For* tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.sliceTail(loops[0], 100, &head, &tail); |
| |
| ASSERT_EQ(head, nullptr); |
| ASSERT_EQ(tail, loops[0]); |
| |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 10}}); |
| } |
| |
| void testExprSliceTail() { |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{10, "x"}}, func); |
| LoopNest l({tensor}); |
| For* head; |
| For* tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.sliceTail(loops[0], 4, &head, &tail); |
| |
| ASSERT_NE(head, nullptr); |
| ASSERT_NE(head, loops[0]); |
| ASSERT_NE(tail, nullptr); |
| ASSERT_NE(tail, loops[0]); |
| |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 6}, {6, 10}}); |
| } |
| |
| void testExprSplitAndSlice() { |
| // 0: splitWithTail |
| // 1: sliceTail on inner loop |
| // 2: sliceHead on outer loop |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{100, "x"}}, func); |
| LoopNest l({tensor}); |
| |
| For* outer; |
| For* inner; |
| For* tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| // outer: [0, 4) |
| // inner: [0, 21) |
| // tail: [0, 16), covering the last 16 elements (indices 84..99) |
| l.splitWithTail(loops[0], 21, &outer, &inner, &tail); |
| |
| For* inner_head; |
| For* inner_tail; |
| l.sliceTail(inner, 2, &inner_head, &inner_tail); |
| |
| For* outer_head; |
| For* outer_tail; |
| l.sliceHead(outer, 2, &outer_head, &outer_tail); |
| |
| // for (int x_outer = 0; x_outer < 2; x_outer++) { |
| // for (int x_inner = 0; x_inner < 19; x_inner++) { |
| // f[21 * x_outer + x_inner] = 1.f + float(21 * x_outer + x_inner); |
| // } |
| // for (int x_inner = 19; x_inner < 21; x_inner++) { |
| // f[21 * x_outer + x_inner] = 1.f + float(21 * x_outer + x_inner); |
| // } |
| // } |
| // for (int x_outer = 2; x_outer < 4; x_outer++) { |
| // for (int x_inner = 0; x_inner < 19; x_inner++) { |
| // f[21 * x_outer + x_inner] = 1.f + float(21 * x_outer + x_inner); |
| // } |
| // for (int x_inner = 19; x_inner < 21; x_inner++) { |
| // f[21 * x_outer + x_inner] = 1.f + float(21 * x_outer + x_inner); |
| // } |
| // } |
| // for (int x_tail = 0; x_tail < 16; x_tail++) { |
| // f[x_tail + 84] = 1.f + float(x_tail + 84); |
| // } |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 2}, {2, 4}, {0, 16}}); |
| |
| auto biter = body->begin(); |
| |
| For* loop = dynamic_cast<For*>(*biter++); |
| assertForRanges(loop->body(), {{0, 19}, {19, 21}}); |
| |
| loop = dynamic_cast<For*>(*biter); |
| assertForRanges(loop->body(), {{0, 19}, {19, 21}}); |
| } |
| |
| void testExprSliceAndNormalize() { |
| // 0: sliceHead |
| // 1: normalize tail |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{10, "x"}}, func); |
| LoopNest l({tensor}); |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| |
| For* head; |
| For* tail; |
| l.sliceHead(loops[0], 2, &head, &tail); |
| // head: [0, 2) |
| // tail: [2, 10) |
| |
| For* normalized_tail; |
| LoopNest::normalize(tail, &normalized_tail); |
| // normalized_tail: [0, 8) |
| |
| Block* body = getSimplifiedBody(l); |
| assertForRanges(body, {{0, 2}, {0, 8}}); |
| } |
| |
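| // Evaluate an expression of a single variable at a concrete value using the |
| // simple IR evaluator. |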
| template <typename T> |
| T evalExpr(const ExprHandle& expr, const VarHandle& var, T value) { |
| ExprEval<SimpleIREvaluator> eval(expr, {var}); |
| return eval.value<T>(value); |
| } |
| |
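| // With a symbolic loop extent, slicing must still produce consistent (possibly |
| // empty) head/middle/tail ranges for every concrete dimension value. |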
| void testExprSliceWithVariableDimension() { |
| auto testWithDimension = |
| [](int dimension, |
| const std::vector<std::pair<int, int>>& expected_for_ranges) { |
| KernelScope kernel_scope; |
| VarHandle dim("dim", kInt); |
| Tensor* tensor = |
| Compute("f", {{dim, "x"}}, [](const ExprHandle& x) { return x; }); |
| LoopNest l({tensor}); |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| |
| For* head; |
| For* tail; |
| l.sliceHead(loops[0], 2, &head, &tail); |
| |
| For* tail_head; |
| For* tail_tail; |
| l.sliceTail(tail, 2, &tail_head, &tail_tail); |
| |
| Block* body = getSimplifiedBody(l); |
| ASSERT_EQ(expected_for_ranges.size(), 3); |
| auto it = body->begin(); |
| for (auto& start_stop : expected_for_ranges) { |
| For* loop = dynamic_cast<For*>(*it++); |
| int start = evalExpr<int>(ExprHandle(loop->start()), dim, dimension); |
| int stop = evalExpr<int>(ExprHandle(loop->stop()), dim, dimension); |
| ASSERT_EQ(start, start_stop.first); |
| ASSERT_EQ(stop, start_stop.second); |
| } |
| }; |
| |
| testWithDimension(1, {{0, 1}, {1, 1}, {1, 1}}); |
| testWithDimension(2, {{0, 2}, {2, 2}, {2, 2}}); |
| testWithDimension(3, {{0, 2}, {2, 2}, {2, 3}}); |
| testWithDimension(4, {{0, 2}, {2, 2}, {2, 4}}); |
| testWithDimension(5, {{0, 2}, {2, 3}, {3, 5}}); |
| testWithDimension(10, {{0, 2}, {2, 8}, {8, 10}}); |
| } |
| |
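| // Split 199 by 17 (leaving a tail of 12), then split the resulting outer loop by 7 |
| // (leaving a tail of 4); the simplified nest should expose exactly three top-level |
| // loops with the bounds asserted below. |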
| void testExprSplitWithTail() { |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x) { |
| return ExprHandle(1.0f) + cast<float>(x); |
| }; |
| Tensor* tensor = Compute("f", {{199, "x"}}, func); |
| LoopNest l({tensor}); |
| For* x_outer; |
| For* x_inner; |
| For* x_tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.splitWithTail(loops[0], 17, &x_outer, &x_inner, &x_tail); |
| |
| For* a; |
| For* b; |
| For* c; |
| l.splitWithTail(x_outer, 7, &a, &b, &c); |
| |
| Stmt* stmt = l.root_stmt(); |
| Stmt* simplified = IRSimplifier::simplify(stmt); |
| Block* body = dynamic_cast<Block*>(simplified); |
| ASSERT_EQ(body->nstmts(), 3); |
| auto biter = body->begin(); |
| |
| // Verify that the split loops are ordered correctly. |
| For* loop = dynamic_cast<For*>(*biter++); |
| assertForRange(loop, 0, 7); |
| |
| loop = dynamic_cast<For*>(*biter++); |
| assertForRange(loop, 0, 4); |
| |
| loop = dynamic_cast<For*>(*biter); |
| assertForRange(loop, 0, 12); |
| } |
| |
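| // When the split factor divides the extent exactly (24 / 4), no tail loop should be |
| // generated; the result should match a reference nest containing only the main loop. |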
| void testExprSplitWithTailNone() { |
| KernelScope kernel_scope; |
| auto func = [](const ExprHandle& x, const ExprHandle& y) { |
| return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y; |
| }; |
| Tensor* tensor = Compute("f", {{24, "x"}, {5, "y"}}, func); |
| LoopNest l({tensor}); |
| For* x_outer; |
| For* x_inner; |
| For* x_tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.splitWithTail(loops[0], 4, &x_outer, &x_inner, &x_tail); |
| |
| Stmt* stmt = l.root_stmt(); |
| std::ostringstream oss; |
| oss << *stmt; |
| ASSERT_GT(oss.str().size(), 200); |
| ASSERT_LT(oss.str().size(), 600); |
| |
| { |
| // Compare against a reference loop structure. |
| VarHandle x_outer("x_outer", kInt); |
| VarHandle x_inner("x_inner", kInt); |
| VarHandle y("y", kInt); |
| VarHandle x_tail("x_tail", kInt); |
| BufHandle f("f", {24, 5}, kFloat); |
| ExprHandle x_1 = x_outer * 4 + x_inner; |
| ExprHandle x_outer_end = (ExprHandle(24) - 0) / 4; |
| Stmt* stmt = new Block({For::make( |
| x_outer, |
| 0, |
| x_outer_end, |
| For::make( |
| x_inner, |
| 0, |
| 4, |
| For::make(y, 0, 5, Store::make(f, {x_1, y}, func(x_1, y), 1))))}); |
| |
| std::ostringstream oss_ref; |
| oss_ref << *stmt; |
| ASSERT_EQ(oss.str(), oss_ref.str()); |
| } |
| |
| { |
| PaddedBuffer<float> f_v(24, 5, "f_v"); |
| PaddedBuffer<float> f_ref(24, 5, "f_res"); |
| |
| SimpleIREvaluator ir_eval(stmt, tensor); |
| ir_eval(f_v); |
| |
| for (int x = 0; x < 24; x++) { |
| for (int y = 0; y < 5; y++) { |
| f_ref(x, y) = 1 + x * x + y * y; |
| } |
| } |
| |
| ExpectAllNear(f_v, f_ref, 1e-5); |
| } |
| } |
| |
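| // splitWithMask rounds the loop up to a multiple of the factor and masks the |
| // out-of-range iterations instead of emitting a tail loop; the result must still |
| // match the unsplit computation. |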
| void testExprSplitWithMask01() { |
| KernelScope kernel_scope; |
| const int M = 26; |
| const int N = 5; |
| Buffer a_buf("a", kFloat, {M, N}); |
| Buffer b_buf("b", kFloat, {M, N}); |
| Tensor* tensor = Compute( |
| "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { |
| return a_buf(m, n) + b_buf(m, n) + 1.0f; |
| }); |
| For* n_outer; |
| For* n_inner; |
| |
| LoopNest l({tensor}); |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
| l.splitWithMask(loops[1], 4, &n_outer, &n_inner); |
| |
| Stmt* stmt = l.root_stmt(); |
| |
| PaddedBuffer<float> a_v(M, N, "a"); |
| PaddedBuffer<float> b_v(M, N, "b"); |
| PaddedBuffer<float> c_v(M, N, "c"); |
| PaddedBuffer<float> c_ref(M, N, "c_ref"); |
| for (int m = 0; m < M; m++) { |
| for (int n = 0; n < N; n++) { |
| a_v(m, n) = 2 * m; |
| b_v(m, n) = 3 * n; |
| c_ref(m, n) = a_v(m, n) + b_v(m, n) + 1.0f; |
| } |
| } |
| |
| SimpleIREvaluator(stmt, a_buf, b_buf, tensor)(a_v, b_v, c_v); |
| |
| ExpectAllNear(c_v, c_ref, 1e-5); |
| } |
| |
| void testSplitWithTailWithLoopOptions() { |
| KernelScope kernel_scope; |
| const int M = 21; |
| Buffer a_buf("a", kFloat, {M}); |
| Buffer b_buf("b", kFloat, {M}); |
| Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { |
| return a_buf(m) + b_buf(m) + 1.0f; |
| }); |
| For *outer, *inner, *tail; |
| |
| LoopNest l({tensor}); |
| auto loops = NodeFinder<For>::find(l.root_stmt()); |
| ASSERT_GT(loops.size(), 0); |
| l.setGPUBlockIndex(loops[0], LoopOptions::IDX_Y); |
| l.splitWithTail(loops[0], 4, &outer, &inner, &tail); |
| ASSERT_NE(outer, nullptr); |
| ASSERT_NE(inner, nullptr); |
| ASSERT_NE(tail, nullptr); |
| |
| // Outer loop carries loop axis bindings. |
| ASSERT_TRUE(outer->loop_options().is_gpu_block_index()); |
| ASSERT_EQ(outer->loop_options().gpu_block_index(), LoopOptions::IDX_Y); |
| |
| // Inner loop has none. |
| ASSERT_TRUE(inner->loop_options().isDefault()); |
| |
| // Tail loop has none. |
| ASSERT_TRUE(tail->loop_options().isDefault()); |
| } |
| |
| void testSplitWithMaskWithLoopOptions() { |
| KernelScope kernel_scope; |
| const int M = 21; |
| Buffer a_buf("a", kFloat, {M}); |
| Buffer b_buf("b", kFloat, {M}); |
| Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { |
| return a_buf(m) + b_buf(m) + 1.0f; |
| }); |
| For *outer, *inner; |
| |
| LoopNest l({tensor}); |
| auto loops = NodeFinder<For>::find(l.root_stmt()); |
| l.setGPUBlockIndex(loops[0], LoopOptions::IDX_Y); |
| l.splitWithMask(loops[0], 4, &outer, &inner); |
| |
| // Outer loop carries loop axis bindings. |
| ASSERT_TRUE(outer->loop_options().is_gpu_block_index()); |
| ASSERT_EQ(outer->loop_options().gpu_block_index(), LoopOptions::IDX_Y); |
| |
| // Inner loop has none. |
| ASSERT_TRUE(inner->loop_options().isDefault()); |
| } |
| |
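| // Broadcast add of two buffers with different shapes; Backup/CheckBackup also |
| // verifies that the evaluation does not clobber the input buffers. |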
| void testScheduleBroadcastAddBuffer() { |
| KernelScope kernel_scope; |
| const int M = 4; |
| const int N = 5; |
| const int K = 6; |
| Buffer a_buf("a", kFloat, {M, N}); |
| Buffer b_buf("b", kFloat, {N, K}); |
| Tensor* c = Compute( |
| "broadcast_add", |
| {{M, "m"}, {N, "n"}, {K, "k"}}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return a_buf(m, n) + b_buf(n, k); |
| }); |
| LoopNest l({c}); |
| Stmt* stmt = l.root_stmt(); |
| |
| PaddedBuffer<float> a_v(M, N, "a_v"); |
| for (int m = 0; m < M; m++) { |
| for (int n = 0; n < N; n++) { |
| a_v(m, n) = 7 * m * n; |
| } |
| } |
| a_v.Backup(); |
| |
| PaddedBuffer<float> b_v(N, K, "b_v"); |
| for (int n = 0; n < N; n++) { |
| for (int k = 0; k < K; k++) { |
| b_v(n, k) = 11 * n * k; |
| } |
| } |
| b_v.Backup(); |
| |
| PaddedBuffer<float> c_v(M, N, K, "c_buf"); |
| SimpleIREvaluator ir_eval(stmt, a_buf, b_buf, c); |
| ir_eval(a_v, b_v, c_v); |
| |
| a_v.CheckBackup(); |
| b_v.CheckBackup(); |
| PaddedBuffer<float> c_ref(M, N, K, "c_ref"); |
| for (int m = 0; m < M; m++) { |
| for (int n = 0; n < N; n++) { |
| for (int k = 0; k < K; k++) { |
| c_ref(m, n, k) = 7 * m * n + 11 * n * k; |
| } |
| } |
| } |
| ExpectAllNear(c_v, c_ref, 1e-5); |
| } |
| |
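| // A tensor defined in terms of another tensor's call(): the intermediate tensor c |
| // is not an output, so it must be computed and consumed inside the generated |
| // statement. |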
| void testScheduleFunctionCall01() { |
| KernelScope kernel_scope; |
| const int M = 4; |
| const int N = 5; |
| const int K = 6; |
| Buffer a_buf("a", kFloat, {M, N}); |
| Buffer b_buf("b", kFloat, {N, K}); |
| Tensor* c = Compute( |
| "broadcast_add", |
| {{M, "m"}, {N, "n"}, {K, "k"}}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return a_buf(m, n) + b_buf(n, k); |
| }); |
| Tensor* d = Compute( |
| "d", |
| {{M, "m"}, {N, "n"}, {K, "k"}}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return c->call(m, n, k) + 1; |
| }); |
| |
| LoopNest l({d}); |
| l.prepareForCodegen(); |
| Stmt* stmt = l.root_stmt(); |
| std::ostringstream oss; |
| oss << *stmt; |
| ASSERT_GT(oss.str().size(), 100); |
| |
| PaddedBuffer<float> a_v(M, N); |
| PaddedBuffer<float> b_v(N, K); |
| PaddedBuffer<float> c_v(M, N, K); |
| PaddedBuffer<float> d_v(M, N, K); |
| PaddedBuffer<float> d_ref(M, N, K); |
| |
| for (int i = 0; i < M; i++) { |
| for (int j = 0; j < N; j++) { |
| a_v(i, j) = i * i; |
| } |
| } |
| for (int i = 0; i < N; i++) { |
| for (int j = 0; j < K; j++) { |
| b_v(i, j) = j * j; |
| } |
| } |
| for (int i = 0; i < M; i++) { |
| for (int j = 0; j < N; j++) { |
| for (int k = 0; k < K; k++) { |
| d_ref(i, j, k) = a_v(i, j) + b_v(j, k) + 1; |
| } |
| } |
| } |
| |
| SimpleIREvaluator eval(stmt, a_buf, b_buf, d); |
| eval(a_v, b_v, d_v); |
| |
| ExpectAllNear(d_v, d_ref, 1e-5); |
| } |
| |
| static std::string remove_space(const std::string& str) { |
| std::string str_new = str; |
| str_new.erase( |
| remove_if(str_new.begin(), str_new.end(), isspace), str_new.end()); |
| return str_new; |
| } |
| |
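| // Inline x and/or y into z in the given order, then check that the result matches |
| // a directly written, fully inlined computation (numerically, and textually when |
| // both tensors are inlined). |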
| void InlineFunc01Helper(const std::vector<std::string>& inline_order) { |
| KernelScope kernel_scope; |
| const int M = 4; |
| const int N = 5; |
| const int K = 6; |
| Buffer a_buf("a", kFloat, {M, N}); |
| Buffer b_buf("b", kFloat, {N, K}); |
| Buffer c_buf("c", kFloat, {M, N}); |
| Buffer d_buf("d", kFloat, {M, K}); |
| |
| Tensor* x = Compute( |
| "x", |
| {{M, "m1"}, {N, "n1"}, {K, "k1"}}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return a_buf(m, n) * b_buf(n, k); |
| }); |
| Tensor* y = Compute( |
| "y", |
| {{M, "m2"}, {N, "n2"}, {K, "k2"}}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return c_buf(m, n) * d_buf(m, k) + x->call(m, n, k); |
| }); |
| Tensor* z = Compute( |
| "z", |
| {{M, "m3"}, {N, "n3"}, {K, "k3"}}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return x->call(m, n, k) + y->call(m, n, k); |
| }); |
| |
| LoopNest l({z}); |
| for (const std::string& order : inline_order) { |
| if (order == "x") { |
| l.computeInline(l.getLoopBodyFor(x)); |
| } else if (order == "y") { |
| l.computeInline(l.getLoopBodyFor(y)); |
| } else { |
| throw std::runtime_error("Invalid order: " + order); |
| } |
| } |
| l.prepareForCodegen(); |
| Stmt* stmt = l.root_stmt(); |
| |
| std::ostringstream oss; |
| oss << *stmt; |
| std::string str1 = remove_space(oss.str()); |
| |
| { |
| PaddedBuffer<float> a_v(M, N); |
| PaddedBuffer<float> b_v(N, K); |
| PaddedBuffer<float> c_v(M, N); |
| PaddedBuffer<float> d_v(M, K); |
| |
| for (int i = 0; i < M; i++) { |
| for (int j = 0; j < N; j++) { |
| a_v(i, j) = i * i; |
| } |
| } |
| for (int i = 0; i < N; i++) { |
| for (int j = 0; j < K; j++) { |
| b_v(i, j) = j * j; |
| } |
| } |
| for (int i = 0; i < M; i++) { |
| for (int j = 0; j < N; j++) { |
| c_v(i, j) = i + j; |
| } |
| } |
| for (int i = 0; i < M; i++) { |
| for (int j = 0; j < K; j++) { |
| d_v(i, j) = i * j; |
| } |
| } |
| |
| PaddedBuffer<float> z_v(M, N, K); |
| PaddedBuffer<float> z_ref(M, N, K); |
| for (int m = 0; m < M; m++) { |
| for (int n = 0; n < N; n++) { |
| for (int k = 0; k < K; k++) { |
| z_ref(m, n, k) = a_v(m, n) * b_v(n, k) * 2 + c_v(m, n) * d_v(m, k); |
| } |
| } |
| } |
| |
| SimpleIREvaluator eval(stmt, a_buf, b_buf, c_buf, d_buf, z); |
| eval(a_v, b_v, c_v, d_v, z_v); |
| ExpectAllNear(z_v, z_ref, 1e-5); |
| } |
| |
| if (inline_order.size() == 2) { |
| Tensor* z2 = Compute( |
| "z", |
| {{M, "m3"}, {N, "n3"}, {K, "k3"}}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return a_buf(m, n) * b_buf(n, k) + |
| (c_buf(m, n) * d_buf(m, k) + a_buf(m, n) * b_buf(n, k)); |
| }); |
| LoopNest l2({z2}); |
| l2.prepareForCodegen(); |
| Stmt* stmt2 = l2.root_stmt(); |
| |
| std::ostringstream oss2; |
| oss2 << *stmt2; |
| std::string str2 = remove_space(oss2.str()); |
| |
| ASSERT_EQ(str1, str2); |
| ASSERT_GT(str1.size(), 100); |
| } |
| } |
| |
| void testScheduleInlineFunc01() { |
| InlineFunc01Helper({"x", "y"}); |
| InlineFunc01Helper({"y", "x"}); |
| InlineFunc01Helper({"x"}); |
| InlineFunc01Helper({"y"}); |
| InlineFunc01Helper({}); |
| } |
| |
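| // Two chained pointwise tensors, both kept as outputs, mimicking what the fuser |
| // produces. |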
| void testScheduleFuserStyle() { |
| KernelScope kernel_scope; |
| const int kVectorSize = 8; |
| const int kVectorCount = 128; |
| const int kTotalSize = kVectorSize * kVectorCount; |
| |
| Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); |
| |
| Tensor* b = Compute( |
| "f", {{kTotalSize, "i"}}, [&](const std::vector<VarHandle>& axes) { |
| return a_buf(axes[0]) + 11.0f; |
| }); |
| |
| Tensor* c = Compute( |
| "g", {{kTotalSize, "i"}}, [&](const std::vector<VarHandle>& axes) { |
| return b->call(axes[0]) + 1.0f; |
| }); |
| |
| LoopNest l({b, c}); |
| l.prepareForCodegen(); |
| Stmt* s = l.root_stmt(); |
| |
| std::vector<float> a_data(kTotalSize, 7.0f); |
| std::vector<float> b_data(kTotalSize, 0.0f); |
| std::vector<float> c_data(kTotalSize, 0.0f); |
| SimpleIREvaluator(s, a_buf, b, c)(a_data, b_data, c_data); |
| |
| for (int i = 0; i < kTotalSize; i++) { |
| ASSERT_EQ(b_data[i], 18.0f); |
| ASSERT_EQ(c_data[i], 19.0f); |
| } |
| } |
| |
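| // Chain of three elementwise adds with the intermediates inlined away; only the |
| // final tensor g is materialized. |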
| void testScheduleFuserThreeArg() { |
| KernelScope kernel_scope; |
| const int kVectorSize = 8; |
| const int kVectorCount = 128; |
| const int kTotalSize = kVectorSize * kVectorCount; |
| |
| Buffer a(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); |
| Buffer b(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); |
| Buffer c(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); |
| Buffer d(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); |
| |
| Tensor* e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) { |
| return a(i) + b(i); |
| }); |
| Tensor* f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) { |
| return (*e)(i) + c(i); |
| }); |
| Tensor* g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) { |
| return (*f)(i) + d(i); |
| }); |
| |
| LoopNest l({g}); |
| l.computeInline(l.getLoopBodyFor(e)); |
| l.computeInline(l.getLoopBodyFor(f)); |
| l.prepareForCodegen(); |
| Stmt* s = l.root_stmt(); |
| |
| std::vector<float> a_data(kTotalSize, 1.0f); |
| std::vector<float> b_data(kTotalSize, 2.0f); |
| std::vector<float> c_data(kTotalSize, 3.0f); |
| std::vector<float> d_data(kTotalSize, 4.0f); |
| std::vector<float> g_data(kTotalSize, 0.0f); |
| SimpleIREvaluator(s, a, b, c, d, g)(a_data, b_data, c_data, d_data, g_data); |
| |
| for (int i = 0; i < kTotalSize; i++) { |
| ASSERT_EQ(g_data[i], 10.0f); |
| } |
| } |
| |
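| // Shapes are symbolic (m and n are passed at call time); the same statement is |
| // evaluated for several concrete sizes. |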
| void testScheduleDynamicShape2D() { |
| KernelScope kernel_scope; |
| auto testWithSize = [](int32_t M, int32_t N) { |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| Buffer a(BufHandle("a", {m, n}, kFloat)); |
| Buffer b(BufHandle("b", {m, n}, kFloat)); |
| Tensor* c = Compute( |
| "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { |
| return a(i, j) + b(i, j); |
| }); |
| LoopNest l({c}); |
| Stmt* s = l.root_stmt(); |
| SimpleIREvaluator cg(s, {a, b, c, m, n}); |
| std::vector<float> aData(M * N, 1.0f); |
| std::vector<float> bData(M * N, 2.0f); |
| std::vector<float> cData(M * N, 0.0f); |
| cg.call({aData, bData, cData, M, N}); |
| ExpectAllNear(cData, std::vector<float>(M * N, 3.0f), 1e-7); |
| }; |
| testWithSize(1, 8); |
| testWithSize(16, 32); |
| testWithSize(37, 11); |
| } |
| |
| void testLoopNestComputeAt_1() { |
| // Verify that compute_at works on the following example: |
| // |
| // for (int i_a = 0; i_a < N; i_a++) { |
| // A[i_a] = i_a * i_a |
| // } |
| // for (int i_b = 0; i_b < N; i_b++) { |
| // B[i_b] = A[i_b] |
| // } |
| // |
| // After the transformation the i_b loop should have an allocation for a temp |
| // buffer and that buffer should be used in computation of B. No use of A |
| // should be in that loop after the transformation. Also, computation of A |
| // should not be inlined into B. Instead, it should be computed into the temp, |
| // and the temp should be used in B. |
| KernelScope kernel_scope; |
| VarHandle N("N", kInt); |
| Tensor* A = Compute( |
| "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; }); |
| Tensor* B = Compute( |
| "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A->call(i_b); }); |
| LoopNest l({B}); |
| std::vector<For*> loops = l.getLoopStmtsFor(B); |
| l.computeAt(l.getLoopBodyFor(A), loops[0]); |
| l.prepareForCodegen(); |
| Stmt* s = l.root_stmt(); |
| |
| std::ostringstream oss; |
| oss << *s; |
| |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: for (int i_b = 0; i_b < N; i_b++) |
| # CHECK: Allocate(temp, int, {1}) |
| # CHECK: temp[ |
| # CHECK-NOT: A[ |
| # CHECK: B[i_b] = temp[0] |
| # CHECK: Free(temp))IR"; |
| |
| torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| |
| // Now check that the loop still produces the correct result. |
| std::vector<int> b_data(100, 0); |
| SimpleIREvaluator cg(s, {B, N}); |
| cg.call({b_data, 100}); |
| |
| std::vector<int> b_ref(100, 0); |
| for (int i = 0; i < 100; i++) { |
| b_ref[i] = i * i; |
| } |
| assertAllEqual(b_data, b_ref); |
| } |
| |
| void testLoopNestComputeAt_2() { |
| // Verify that compute_at works on the following example: |
| // |
| // for (int py = 0; py < H+1; py++) { |
| // for (int px = 0; px < W+1; px++) { |
| // p[py, px] = py*px |
| // } |
| // } |
| // for (int cy = 0; cy < H; cy++) { |
| // for (int cx = 0; cx < W; cx++) { |
| // c[cy, cx] = p[cy, cx] + p[cy+1, cx] + |
| // p[cy, cx+1] + p[cy+1, cx+1] |
| // } |
| // } |
| KernelScope kernel_scope; |
| |
| const int kW = 16, kH = 16; |
| VarHandle W("W", kInt); |
| VarHandle H("H", kInt); |
| Tensor* p = Compute( |
| "prod", |
| {{H + 1, "py"}, {W + 1, "px"}}, |
| [&](const VarHandle& py, const VarHandle& px) { return px * py; }); |
| Tensor* c = Compute( |
| "cons", |
| {{H, "cy"}, {W, "cx"}}, |
| [&](const VarHandle& y, const VarHandle& x) { |
| return p->call(y, x) + p->call(y + 1, x) + p->call(y, x + 1) + |
| p->call(y + 1, x + 1); |
| }); |
| |
| std::vector<int> c_ref(kW * kH, 0); |
| for (int y = 0; y < kH; y++) { |
| for (int x = 0; x < kW; x++) { |
| c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1); |
| } |
| } |
| |
| { |
| // First let's try to compute P at axis cy (the outer loop) |
| LoopNest l({c}); |
| std::vector<For*> loops = l.getLoopStmtsFor(c); |
| l.computeAt(l.getLoopBodyFor(p), loops[0]); |
| l.prepareForCodegen(); |
| Stmt* s = l.root_stmt(); |
| |
| std::ostringstream oss; |
| oss << *s; |
| |
| // Check the IR we produced |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: for (int cy = 0; cy < H; cy++) |
| # CHECK: Allocate(temp, int, {2, W + 1}) |
| # CHECK: for |
| # CHECK: for |
| # CHECK: for (int cx = 0; cx < W; cx++) |
| # CHECK-NOT: prod[ |
| # CHECK: cons[ |
| # CHECK: Free(temp))IR"; |
| torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| |
| // Now check that the loop still produces the correct result. |
| std::vector<int> c_data(kW * kH, 0); |
| SimpleIREvaluator cg(s, {c, W, H}); |
| cg.call({c_data, kW, kH}); |
| |
| assertAllEqual(c_data, c_ref); |
| } |
| { |
| // Now let's try to compute P at axis cx (the inner loop) |
| LoopNest l({c}); |
| std::vector<For*> loops = l.getLoopStmtsFor(c); |
| l.computeAt(l.getLoopBodyFor(p), loops[1]); |
| l.prepareForCodegen(); |
| Stmt* s = l.root_stmt(); |
| |
| std::ostringstream oss; |
| oss << *s; |
| |
| // Check the IR we produced |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: for (int cy = 0; cy < H; cy++) |
| # CHECK: for (int cx = 0; cx < W; cx++) |
| # CHECK: Allocate(temp, int, {2, 2}) |
| # CHECK: for |
| # CHECK: for |
| # CHECK-NOT: prod[ |
| # CHECK: cons[ |
| # CHECK: Free(temp))IR"; |
| torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| |
| // Now check that the loop still produces the correct result. |
| std::vector<int> c_data(kW * kH, 0); |
| SimpleIREvaluator cg(s, {c, W, H}); |
| cg.call({c_data, kW, kH}); |
| |
| assertAllEqual(c_data, c_ref); |
| } |
| } |
| |
| void testLoopNestComputeAt_3() { |
| // Verify that compute_at works on the following example: |
| // |
| // A(x,y) = x*y |
| // B(x,y) = A(x, y) |
| // C(x,y) = B(x+1, y) |
| // D(x,y) = A(x, y+1) + C(x, y) |
| // |
| // i.e. when 'A' comes to 'D' directly and indirectly through 'C'. |
| KernelScope kernel_scope; |
| |
| const int kW = 16, kH = 16; |
| VarHandle W("W", kInt); |
| VarHandle H("H", kInt); |
| Tensor* A = Compute( |
| "A", |
| {{H + 1, "ay"}, {W + 1, "ax"}}, |
| [&](const VarHandle& ay, const VarHandle& ax) { return ax * ay; }); |
| Tensor* B = Compute( |
| "B", |
| {{H + 1, "by"}, {W + 1, "bx"}}, |
| [&](const VarHandle& by, const VarHandle& bx) { |
| return A->call(by, bx); |
| }); |
| Tensor* C = Compute( |
| "C", |
| {{H, "cy"}, {W, "cx"}}, |
| [&](const VarHandle& cy, const VarHandle& cx) { |
| return B->call(cy, cx + 1); |
| }); |
| Tensor* D = Compute( |
| "D", |
| {{H, "dy"}, {W, "dx"}}, |
| [&](const VarHandle& dy, const VarHandle& dx) { |
| return A->call(dy + 1, dx) + C->call(dy, dx); |
| }); |
| |
| std::vector<int> c_ref(kW * kH, 0); |
| for (int y = 0; y < kH; y++) { |
| for (int x = 0; x < kW; x++) { |
| c_ref[y * kW + x] = (y + 1) * x + y * (x + 1); |
| } |
| } |
| |
| { |
| // First let's try to compute A at axis dy (the outer loop) |
| LoopNest l({D}); |
| std::vector<For*> loops = l.getLoopStmtsFor(D); |
| l.computeAt(l.getLoopBodyFor(A), loops[0]); |
| l.prepareForCodegen(); |
| Stmt* s = l.root_stmt(); |
| |
| std::ostringstream oss; |
| oss << *s; |
| |
| // Check the IR we produced |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: for (int ay = 0; ay < H + 1; ay++) |
| # CHECK: for (int ax = 0; ax < W + 1; ax++) |
| # CHECK: A[ |
| # CHECK: for (int by = 0; by < H + 1; by++) |
| # CHECK: for (int bx = 0; bx < W + 1; bx++) |
| # CHECK: B[ |
| # CHECK: for (int cy = 0; cy < H; cy++) |
| # CHECK: for (int cx = 0; cx < W; cx++) |
| # CHECK: C[ |
| # CHECK: for (int dy = 0; dy < H; dy++) |
| # CHECK: Allocate(temp, int, {1, W}) |
| # CHECK: for (int dx = 0; dx < W; dx++) |
| # CHECK-NOT: A[)IR"; |
| torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| |
| // Now check that the loop still produces the correct result. |
| std::vector<int> c_data(kW * kH, 0); |
| SimpleIREvaluator cg(s, {D, W, H}); |
| cg.call({c_data, kW, kH}); |
| |
| assertAllEqual(c_data, c_ref); |
| } |
| { |
| // Now let's try to compute A at axis dx (the inner loop) |
| LoopNest l({D}); |
| std::vector<For*> loops = l.getLoopStmtsFor(D); |
| l.computeAt(l.getLoopBodyFor(A), loops[1]); |
| l.prepareForCodegen(); |
| Stmt* s = l.root_stmt(); |
| |
| std::ostringstream oss; |
| oss << *s; |
| |
| // Check the IR we produced |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: for (int ay = 0; ay < H + 1; ay++) |
| # CHECK: for (int ax = 0; ax < W + 1; ax++) |
| # CHECK: A[ |
| # CHECK: for (int by = 0; by < H + 1; by++) |
| # CHECK: for (int bx = 0; bx < W + 1; bx++) |
| # CHECK: B[ |
| # CHECK: for (int cy = 0; cy < H; cy++) |
| # CHECK: for (int cx = 0; cx < W; cx++) |
| # CHECK: C[ |
| # CHECK: for (int dy = 0; dy < H; dy++) |
| # CHECK: for (int dx = 0; dx < W; dx++) |
| # CHECK: Allocate(temp, int, {1, 1}) |
| # CHECK-NOT: A[)IR"; |
| torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| |
| // Now check that the loop still produces the correct result. |
| std::vector<int> c_data(kW * kH, 0); |
| SimpleIREvaluator cg(s, {D, W, H}); |
| cg.call({c_data, kW, kH}); |
| |
| assertAllEqual(c_data, c_ref); |
| } |
| } |
| |
| void testLoopNestComputeAt_4() { |
| // TODO: Verify that computeAt works with reduction axis |
| } |
| |
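| // Records loop variable names in the order the For statements are visited, |
| // e.g. "x,y," for a 2-D nest. |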
| class LoopOrderHelper : public IRVisitor { |
| std::stringstream ordering; |
| |
| public: |
| std::string getOrder(Stmt* s) { |
| ordering.str(""); |
| s->accept(this); |
| return ordering.str(); |
| } |
| |
| void visit(const For* v) { |
| ordering << v->var()->name_hint() << ","; |
| IRVisitor::visit(v); |
| } |
| }; |
| |
| void testLoopNestReorderAxis1() { |
| KernelScope kernel_scope; |
| Tensor* tensor = Compute( |
| "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { |
| return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y; |
| }); |
| LoopNest l({tensor}); |
| Stmt* stmt1 = Stmt::clone(l.root_stmt()); |
| |
| std::vector<int> stmt1_output(6, 0); |
| SimpleIREvaluator cg(stmt1, {tensor}); |
| cg.call({stmt1_output}); |
| |
| auto loops = l.getLoopStmtsFor(tensor); |
| l.reorderAxis(loops[0], loops[1]); |
| Stmt* stmt2 = Stmt::clone(l.root_stmt()); |
| |
| ASSERT_NE(stmt1, stmt2); |
| LoopOrderHelper loopOrderHelper; |
| std::string order1 = loopOrderHelper.getOrder(stmt1); |
| std::string order2 = loopOrderHelper.getOrder(stmt2); |
| |
| ASSERT_EQ(order1, "x,y,"); |
| ASSERT_EQ(order2, "y,x,"); |
| |
| std::vector<int> stmt2_output(6, 0); |
| SimpleIREvaluator cg2(stmt2, {tensor}); |
| cg2.call({stmt2_output}); |
| |
| for (int i = 0; i < 6; ++i) { |
| ASSERT_EQ(stmt1_output[i], stmt2_output[i]); |
| } |
| |
| // Reorder them back. |
| loops = l.getLoopStmtsFor(tensor); |
| l.reorderAxis(loops[0], loops[1]); |
| Stmt* stmt3 = l.root_stmt(); |
| |
| std::string order3 = loopOrderHelper.getOrder(stmt3); |
| ASSERT_EQ(order3, order1); |
| |
| std::ostringstream oss1, oss2; |
| oss1 << *stmt1; |
| oss2 << *stmt3; |
| |
| // Should be identical to the unreordered statement. |
| ASSERT_EQ(oss1.str(), oss2.str()); |
| } |
| |
| void testLoopNestReorderPartialAxes() { |
| KernelScope kernel_scope; |
| Tensor* tensor = Compute( |
| "f", |
| {{2, "x"}, {3, "y"}, {4, "z"}}, |
| [](const VarHandle& x, const VarHandle& y, const VarHandle& z) { |
| return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y + |
| cast<float>(z) * z; |
| }); |
| LoopNest l({tensor}); |
| |
| LoopOrderHelper loopOrderHelper; |
| Stmt* stmt1 = Stmt::clone(l.root_stmt()); |
| ASSERT_EQ(loopOrderHelper.getOrder(stmt1), "x,y,z,"); |
| |
| std::vector<int> stmt1_output(24, 0); |
| SimpleIREvaluator cg(stmt1, {tensor}); |
| cg.call({stmt1_output}); |
| |
| auto loops = l.getLoopStmtsFor(tensor); |
| l.reorderAxis(loops[0], loops[1]); |
| ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,x,z,"); |
| |
| Stmt* stmt2 = Stmt::clone(l.root_stmt()); |
| |
| std::vector<int> stmt2_output(24, 0); |
| SimpleIREvaluator cg2(stmt2, {tensor}); |
| cg2.call({stmt2_output}); |
| |
| for (int i = 0; i < 24; ++i) { |
| ASSERT_EQ(stmt1_output[i], stmt2_output[i]); |
| } |
| |
| loops = l.getLoopStmtsFor(tensor); |
| l.reorderAxis(loops[1], loops[2]); |
| ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,z,x,"); |
| |
| Stmt* stmt3 = Stmt::clone(l.root_stmt()); |
| |
| std::vector<int> stmt3_output(24, 0); |
| SimpleIREvaluator cg3(stmt3, {tensor}); |
| cg3.call({stmt3_output}); |
| |
| for (int i = 0; i < 24; ++i) { |
| ASSERT_EQ(stmt1_output[i], stmt3_output[i]); |
| } |
| } |
| |
| void testLoopNestReorderInternalAxis() { |
| KernelScope kernel_scope; |
| Tensor* tensor = Compute( |
| "f", |
| {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, |
| [](const VarHandle& w, |
| const VarHandle& x, |
| const VarHandle& y, |
| const VarHandle& z) { |
| return ExprHandle(1.0f) + w + cast<float>(x) * x + cast<float>(y) * y + |
| cast<float>(z) * z; |
| }); |
| LoopNest l({tensor}); |
| |
| LoopOrderHelper loopOrderHelper; |
| Stmt* stmt1 = Stmt::clone(l.root_stmt()); |
| ASSERT_EQ(loopOrderHelper.getOrder(stmt1), "w,x,y,z,"); |
| |
| std::vector<int> stmt1_output(24, 0); |
| SimpleIREvaluator cg(stmt1, {tensor}); |
| cg.call({stmt1_output}); |
| |
| auto loops = l.getLoopStmtsFor(tensor); |
| l.reorderAxis(loops[2], loops[1]); |
| ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "w,y,x,z,"); |
| |
| Stmt* stmt2 = l.root_stmt(); |
| |
| std::vector<int> stmt2_output(24, 0); |
| SimpleIREvaluator cg2(stmt2, {tensor}); |
| cg2.call({stmt2_output}); |
| |
| for (int i = 0; i < 24; ++i) { |
| ASSERT_EQ(stmt1_output[i], stmt2_output[i]); |
| } |
| } |
| |
| void testLoopNestReorderEnclosingAxis() { |
| KernelScope kernel_scope; |
| Tensor* tensor = Compute( |
| "f", |
| {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, |
| [](const VarHandle& w, |
| const VarHandle& x, |
| const VarHandle& y, |
| const VarHandle& z) { |
| return ExprHandle(1.0f) + w + cast<float>(x) * x + cast<float>(y) * y + |
| cast<float>(z) * z; |
| }); |
| LoopNest l({tensor}); |
| |
| LoopOrderHelper loopOrderHelper; |
| Stmt* stmt1 = Stmt::clone(l.root_stmt()); |
| |
| std::vector<int> stmt1_output(24, 0); |
| SimpleIREvaluator cg(stmt1, {tensor}); |
| cg.call({stmt1_output}); |
| |
| auto loops = l.getLoopStmtsFor(tensor); |
| l.reorderAxis(loops[0], loops[3]); |
| ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "z,x,y,w,"); |
| |
| Stmt* stmt2 = l.root_stmt(); |
| |
| std::vector<int> stmt2_output(24, 0); |
| SimpleIREvaluator cg2(stmt2, {tensor}); |
| cg2.call({stmt2_output}); |
| |
| for (int i = 0; i < 24; ++i) { |
| ASSERT_EQ(stmt1_output[i], stmt2_output[i]); |
| } |
| } |
| |
| void testLoopNestReorderSameAxis() { |
| KernelScope kernel_scope; |
| Tensor* tensor = Compute( |
| "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { |
| return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y; |
| }); |
| LoopNest l({tensor}); |
| Stmt* stmt1 = Stmt::clone(l.root_stmt()); |
| |
| auto loops = l.getLoopStmtsFor(tensor); |
| l.reorderAxis(loops[1], loops[1]); |
| Stmt* stmt2 = Stmt::clone(l.root_stmt()); |
| |
| std::ostringstream oss, oss2; |
| oss << *stmt1; |
| oss2 << *stmt2; |
| ASSERT_EQ(oss.str(), oss2.str()); |
| } |
| |
| void testLoopNestReorderExtraStatements() { |
| /* We're going for a structure like this: |
| * for x in ... |
| * Stmt 1 |
| * for y in ... |
| * Stmt 2 |
| * for z in ... |
| * Stmt 3 |
| * Stmt 4 |
| */ |
| |
| KernelScope kernel_scope; |
| |
| Tensor* tensor = Compute( |
| "f", |
| {{2, "x"}, {3, "y"}, {4, "z"}}, |
| [](const VarHandle& x, const VarHandle& y, const VarHandle& z) { |
| return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y + |
| cast<float>(z) * z; |
| }); |
| LoopNest l({tensor}); |
| |
| Buffer extra(BufHandle("res", {6, 3}, kFloat)); |
| |
| auto loops = l.getLoopStmtsFor(tensor); |
| |
| VarHandle i = VarHandle(loops[0]->var()); |
| |
| Stmt* store_1 = Store::make(extra, {i, 0}, ExprHandle(1.f), 1); |
| Stmt* store_2 = Store::make(extra, {i, 1}, ExprHandle(2.f), 1); |
| // Stmt 3 is the tensor's own computation (the Function body); store_3 below is Stmt 4. |
| Stmt* store_3 = Store::make(extra, {i, 2}, ExprHandle(4.f), 1); |
| |
| loops[0]->body()->prepend_stmt(store_1); |
| loops[1]->body()->prepend_stmt(store_2); |
| loops[1]->body()->append_stmt(store_3); |
| Stmt* stmt1 = Stmt::clone(l.root_stmt()); |
| |
| std::vector<int> extra1(6, 0); |
| std::vector<int> res1(24, 0); |
| SimpleIREvaluator cg(stmt1, {tensor, extra}); |
| cg.call({res1, extra1}); |
| |
| /* Then we reorder loop y and z, we want it to look like: |
| * |
| * for x in ... |
| * Stmt 1 |
| * for y in ... |
| * Stmt 2 |
| * for z in ... |
| * for y in ... |
| * Stmt 3 |
| * for y in ... |
| * Stmt 4 |
| * |
| * We need extra loops because we don't have dependency info about stmt 3 |
| * and 4. |
| * |
| */ |
| |
| l.reorderAxis(loops[1], loops[2]); |
| Stmt* stmt2 = Stmt::clone(l.root_stmt()); |
| |
| std::ostringstream oss; |
| oss << *l.root_stmt(); |
| |
| // Check the IR we produced |
| const std::string& verification_pattern1 = |
| R"IR( |
| # CHECK: for (int x |
| # CHECK: res[x, 0] = 1 |
| # CHECK: for (int y |
| # CHECK: res[x, 1] = 2 |
| # CHECK: for (int z |
| # CHECK: for (int y |
| # CHECK: f[ |
| # CHECK: for (int y |
| # CHECK: res[x, 2] = 4 |
| )IR"; |
| torch::jit::testing::FileCheck().run(verification_pattern1, oss.str()); |
| |
| std::vector<int> extra2(6, 0); |
| std::vector<int> res2(24, 0); |
| SimpleIREvaluator cg2(stmt2, {tensor, extra}); |
| cg2.call({res2, extra2}); |
| |
| for (int i = 0; i < 24; ++i) { |
| ASSERT_EQ(res1[i], res2[i]); |
| } |
| for (int i = 0; i < 6; ++i) { |
| ASSERT_EQ(extra1[i], extra2[i]); |
| } |
| |
| /* Now reorder x and the y above stmt 3: |
| * |
| * |
| * for x in ... |
| * Stmt 1 |
| * for y in ... |
| * Stmt 2 |
| * |
| * for y in ... |
| * for z in ... |
| * for x in ... |
| * Stmt 3 |
| * |
| * for x in ... |
| * for y in ... |
| * Stmt 4 |
| * |
| * |
| */ |
| loops = l.getLoopStmtsFor(tensor); |
| l.reorderAxis(loops[0], loops[2]); |
| Stmt* stmt3 = Stmt::clone(l.root_stmt()); |
| |
| std::ostringstream oss2; |
| oss2 << *stmt3; |
| |
| // Check the IR we produced |
| const std::string& verification_pattern2 = |
| R"IR( |
| # CHECK: for (int x |
| # CHECK: res[x, 0] = 1 |
| # CHECK: for (int y |
| # CHECK: res[x, 1] = 2 |
| # CHECK: for (int y |
| # CHECK: for (int z |
| # CHECK: for (int x |
| # CHECK: f[ |
| # CHECK: for (int x |
| # CHECK: for (int y |
| # CHECK: res[x, 2] = 4 |
| )IR"; |
| torch::jit::testing::FileCheck().run(verification_pattern2, oss2.str()); |
| |
| std::vector<int> extra3(6, 0); |
| std::vector<int> res3(24, 0); |
| SimpleIREvaluator cg3(stmt3, {tensor, extra}); |
| cg3.call({res3, extra3}); |
| |
| for (int i = 0; i < 24; ++i) { |
| ASSERT_EQ(res1[i], res3[i]); |
| } |
| for (int i = 0; i < 6; ++i) { |
| ASSERT_EQ(extra1[i], extra3[i]); |
| } |
| } |
| |
| void LoopNestReorderTestHelper( |
| bool prepend, |
| bool append, |
| int index1, |
| int index2) { |
| KernelScope kernel_scope; |
| |
| Tensor* c = Compute( |
| "5d", |
| {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, |
| [](const std::vector<VarHandle>&) { return -1; }); |
| LoopNest l({c}); |
| |
| Buffer extra(BufHandle("extra", {5}, kInt)); |
| |
| auto loops = l.getLoopStmtsFor(c); |
| int j = 0; |
| for (auto* l : loops) { |
| // Add an increment at each layer of the loop which counts the number of |
| // times the loop executes. |
| Load* load = new Load(extra, {new IntImm(j)}, new IntImm(1)); |
| Add* add = new Add(load, new IntImm(1)); |
| Stmt* store = Store::make(extra, {j}, ExprHandle(add), 1); |
| if (prepend) { |
| l->body()->prepend_stmt(store); |
| } |
| if (append) { |
| l->body()->append_stmt(Stmt::clone(store)); |
| } |
| |
| j++; |
| } |
| |
| Stmt* stmt1 = Stmt::clone(l.root_stmt()); |
| |
| std::vector<int> extra1(5, 0); |
| std::vector<int> res1(2 * 3 * 2 * 3 * 2, 0); |
| SimpleIREvaluator cg(stmt1, {c, extra}); |
| cg.call({res1, extra1}); |
| |
| std::vector<int> loopExtents = {2, 3, 2, 3, 2}; |
| |
| int expected_loops = 0; |
| if (prepend) { |
| expected_loops++; |
| } |
| if (append) { |
| expected_loops++; |
| } |
| for (int i = 0; i < 5; ++i) { |
| expected_loops *= loopExtents[i]; |
| ASSERT_EQ(extra1[i], expected_loops); |
| } |
| |
| loops = l.getLoopStmtsFor(c); |
| l.reorderAxis(loops[index1], loops[index2]); |
| Stmt* stmt2 = Stmt::clone(l.root_stmt()); |
| |
| std::ostringstream oss, oss2; |
| oss << *stmt1; |
| oss2 << *stmt2; |
| ASSERT_NE(oss.str(), oss2.str()); |
| |
| std::vector<int> extra2(5, 0); |
| std::vector<int> res2(2 * 3 * 2 * 3 * 2, 0); |
| SimpleIREvaluator cg2(stmt2, {c, extra}); |
| cg2.call({res2, extra2}); |
| |
| expected_loops = 0; |
| if (prepend) { |
| expected_loops++; |
| } |
| if (append) { |
| expected_loops++; |
| } |
| |
| for (int i = 0; i < 5; ++i) { |
| expected_loops *= loopExtents[i]; |
| ASSERT_EQ(extra2[i], expected_loops); |
| } |
| |
| for (int i = 0; i < 2 * 3 * 2 * 3 * 2; ++i) { |
| ASSERT_EQ(res2[i], res1[i]); |
| } |
| } |
| |
| void testLoopNestReorderLongStringOfPreOrphans() { |
| for (int i = 0; i < 5; ++i) { |
| for (int j = 0; j < 5; ++j) { |
| // skip noops, since we check the loop isn't the same after reordering. |
| if (i != j) { |
| LoopNestReorderTestHelper(true, false, i, j); |
| } |
| } |
| } |
| } |
| |
| void testLoopNestReorderLongStringOfPostOrphans() { |
| for (int i = 0; i < 5; ++i) { |
| for (int j = 0; j < 5; ++j) { |
| // skip noops, since we check the loop isn't the same after reordering. |
| if (i != j) { |
| LoopNestReorderTestHelper(false, true, i, j); |
| } |
| } |
| } |
| } |
| |
| void testLoopNestReorderLongStringFull() { |
| for (int i = 0; i < 5; ++i) { |
| for (int j = 0; j < 5; ++j) { |
| // skip noops, since we check the loop isn't the same after reordering. |
| if (i != j) { |
| LoopNestReorderTestHelper(true, true, i, j); |
| } |
| } |
| } |
| } |
| |
| void testLoopNestReorderInternalLoopNest() { |
| KernelScope kernel_scope; |
| const int M = 4; |
| const int N = 5; |
| const int K = 6; |
| Buffer a_buf("a", kFloat, {M, N}); |
| Buffer b_buf("b", kFloat, {N, K}); |
| Buffer c_buf("c", kFloat, {M, N}); |
| Buffer d_buf("d", kFloat, {M, K}); |
| |
| Tensor* x = Compute( |
| "x", |
| {{M, "m1"}, {N, "n1"}, {K, "k1"}}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return a_buf(m, n) * b_buf(n, k); |
| }); |
| Tensor* y = Compute( |
| "y", |
| {{M, "m2"}, {N, "n2"}, {K, "k2"}}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return c_buf(m, n) * d_buf(m, k) + x->call(m, n, k); |
| }); |
| Tensor* z = Compute( |
| "z", |
| {{M, "m3"}, {N, "n3"}, {K, "k3"}}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return x->call(m, n, k) + y->call(m, n, k); |
| }); |
| |
| LoopNest l({z}); |
| For* a = nullptr; |
| For* b = nullptr; |
| auto fors = NodeFinder<For>::find(l.root_stmt()); |
| for (auto* f : fors) { |
| if (f->var()->name_hint() == "m2") { |
| a = f; |
| } else if (f->var()->name_hint() == "k2") { |
| b = f; |
| } |
| } |
| l.reorderAxis(a, b); |
| |
| l.prepareForCodegen(); |
| Stmt* stmt = IRSimplifier::simplify(l.root_stmt()); |
| |
| std::ostringstream oss; |
| oss << *stmt; |
| |
| // Check the IR we produced has the 3 nests in the right order, but k and m |
| // swapped in the middle. |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: for (int m1 |
| # CHECK: for (int n1 |
| # CHECK: for (int k1 |
| # CHECK: for (int k2 |
| # CHECK: for (int n2 |
| # CHECK: for (int m2 |
| # CHECK: for (int m3 |
| # CHECK: for (int n3 |
| # CHECK: for (int k3)IR"; |
| torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| |
| { |
| PaddedBuffer<float> a_v(M, N); |
| PaddedBuffer<float> b_v(N, K); |
| PaddedBuffer<float> c_v(M, N); |
| PaddedBuffer<float> d_v(M, K); |
| |
| for (int i = 0; i < M; i++) { |
| for (int j = 0; j < N; j++) { |
| a_v(i, j) = i * i; |
| } |
| } |
| for (int i = 0; i < N; i++) { |
| for (int j = 0; j < K; j++) { |
| b_v(i, j) = j * j; |
| } |
| } |
| for (int i = 0; i < M; i++) { |
| for (int j = 0; j < N; j++) { |
| c_v(i, j) = i + j; |
| } |
| } |
| for (int i = 0; i < M; i++) { |
| for (int j = 0; j < K; j++) { |
| d_v(i, j) = i * j; |
| } |
| } |
| |
| PaddedBuffer<float> z_v(M, N, K); |
| PaddedBuffer<float> z_ref(M, N, K); |
| for (int m = 0; m < M; m++) { |
| for (int n = 0; n < N; n++) { |
| for (int k = 0; k < K; k++) { |
| z_ref(m, n, k) = a_v(m, n) * b_v(n, k) * 2 + c_v(m, n) * d_v(m, k); |
| } |
| } |
| } |
| |
| SimpleIREvaluator eval(stmt, a_buf, b_buf, c_buf, d_buf, z); |
| eval(a_v, b_v, c_v, d_v, z_v); |
| ExpectAllNear(z_v, z_ref, 1e-5); |
| } |
| } |
| |
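| // After vectorizing the outer loop of an 8x8 computation, the x loop should be |
| // gone: a single For remains and its body contains no nested loops. |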
| void testOuterLoopVectorization() { |
| KernelScope kernel_scope; |
| Tensor* tensor = Compute( |
| "f", {{8, "X"}, {8, "y"}}, [](const VarHandle& x, const VarHandle& y) { |
| return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y; |
| }); |
| LoopNest l({tensor}); |
| |
| l.vectorize(l.getLoopStmtsFor(tensor)[0]); |
| |
| Stmt* root_stmt = l.root_stmt(); |
| Block* outer_block = dynamic_cast<Block*>(root_stmt); |
| ASSERT_NE(outer_block, nullptr); |
| while (Block* inner_block = dynamic_cast<Block*>(outer_block->front())) { |
| outer_block = inner_block; |
| } |
| |
| // Verify that we have only a single loop level remaining after |
| // vectorization. |
| ASSERT_EQ(outer_block->nstmts(), 1); |
| For* for_loop = dynamic_cast<For*>(outer_block->front()); |
| ASSERT_NE(for_loop, nullptr); |
| Block* for_body = for_loop->body(); |
| ASSERT_EQ(for_body->nstmts(), 1); |
| ASSERT_EQ(dynamic_cast<For*>(for_body->front()), nullptr); |
| } |
| |
| namespace { |
| |
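| // Build a 1-D loop with the given constant extent, fully unroll it, and return the |
| // printed IR. |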
| std::string constantUpperBoundLoopIR(int upper_bound_val) { |
| KernelScope kernel_scope; |
| ExprHandle upper_bound(upper_bound_val); |
| Tensor* A = Compute( |
| "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); |
| LoopNest l({A}); |
| std::vector<For*> loops = l.getLoopStmtsFor(A); |
| Stmt* unrolled = nullptr; |
| LoopNest::unroll(loops[0], &unrolled); |
| std::ostringstream oss; |
| oss << *unrolled; |
| return oss.str(); |
| } |
| |
| } // namespace |
| |
| void testUnroll() { |
| const std::string actual = constantUpperBoundLoopIR(3); |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: A[0] = 0; |
| # CHECK: A[1] = 2; |
| # CHECK: A[2] = 4)IR"; |
| |
| torch::jit::testing::FileCheck().run(verification_pattern, actual); |
| } |
| |
| void testUnrollOuter() { |
| KernelScope kernel_scope; |
| ExprHandle outer_bound(3); |
| ExprHandle inner_bound(4); |
| Tensor* A = Compute( |
| "A", |
| {{outer_bound, "x"}, {inner_bound, "y"}}, |
| [&](const VarHandle& x, const VarHandle& y) { return x + y; }); |
| LoopNest l({A}); |
| std::vector<For*> loops = l.getLoopStmtsFor(A); |
| Stmt* unrolled = nullptr; |
| LoopNest::unroll(loops[0], &unrolled); |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: for (int y = 0; y < 4; y++) { |
| # CHECK: A[0, y] = y; |
| # CHECK: } |
| # CHECK: for (int y = 0; y < 4; y++) { |
| # CHECK: A[1, y] = y + 1; |
| # CHECK: } |
| # CHECK: for (int y = 0; y < 4; y++) { |
| # CHECK: A[2, y] = y + 2; |
| # CHECK: })IR"; |
| |
| std::ostringstream oss; |
| oss << *unrolled; |
| torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| } |
| |
| void testUnrollInner() { |
| KernelScope kernel_scope; |
| ExprHandle outer_bound(3); |
| ExprHandle inner_bound(4); |
| Tensor* A = Compute( |
| "A", |
| {{outer_bound, "x"}, {inner_bound, "y"}}, |
| [&](const VarHandle& x, const VarHandle& y) { return x + y; }); |
| LoopNest l({A}); |
| std::vector<For*> loops = l.getLoopStmtsFor(A); |
| Stmt* unrolled = nullptr; |
| LoopNest::unroll( |
| static_cast<For*>(loops[0]->body()->stmts().front()), &unrolled); |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: for (int x = 0; x < 3; x++) { |
| # CHECK: A[x, 0] = x; |
| # CHECK: A[x, 1] = x + 1; |
| # CHECK: A[x, 2] = x + 2; |
| # CHECK: A[x, 3] = x + 3; |
| # CHECK: })IR"; |
| |
| std::ostringstream oss; |
| oss << *loops[0]; |
| torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| } |
| |
| void testUnrollMultipleStatements() { |
| KernelScope kernel_scope; |
| const int kTotalSize = 3; |
| BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt); |
| BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kInt); |
| |
| VarHandle x("x", kInt); |
| auto f = For::make( |
| x, |
| 0, |
| kTotalSize, |
| Block::make({Store::make(a_buf, {x}, x * 2), |
| Store::make(b_buf, {x}, Load::make(a_buf, {x}, 1))})); |
| Block::make({f}); |
| Stmt* unrolled = nullptr; |
| LoopNest::unroll(f, &unrolled); |
| std::ostringstream oss; |
| oss << *unrolled; |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: A[0] = 0; |
| # CHECK: B[0] = A[0]; |
| # CHECK: A[1] = 2; |
| # CHECK: B[1] = A[1]; |
| # CHECK: A[2] = 4; |
| # CHECK: B[2] = A[2];)IR"; |
| |
| torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| } |
| |
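| // Unrolling a loop with zero iterations should produce no stores at all. |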
| void testUnrollEmpty() { |
| const std::string actual = constantUpperBoundLoopIR(0); |
| const std::string& verification_pattern = R"IR( |
| # CHECK-NOT: A[ |
| )IR"; |
| |
| torch::jit::testing::FileCheck().run(verification_pattern, actual); |
| } |
| |
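| // A loop with a non-constant (symbolic) trip count cannot be unrolled, so |
| // unroll is expected to throw. |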
| void testNoUnroll() { |
| KernelScope kernel_scope; |
| VarHandle upper_bound("N", kInt); |
| Tensor* A = Compute( |
| "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); |
| LoopNest l({A}); |
| std::vector<For*> loops = l.getLoopStmtsFor(A); |
| Stmt* unrolled = nullptr; |
| ASSERT_THROWS_WITH( |
| LoopNest::unroll(loops[0], &unrolled), "non-constant loop"); |
| } |
| |
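| // Unrolling a loop whose body declares a local variable with Let. After |
| // unrolling, evaluating the result should still store 7 into A and 8 into B |
| // for every element. |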
| void testUnrollWithLet() { |
| KernelScope kernel_scope; |
| const int kTotalSize = 3; |
| BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt); |
| BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kInt); |
| |
| VarHandle e("e", kInt); |
| VarHandle x("x", kInt); |
| auto f = For::make( |
| x, |
| 0, |
| kTotalSize, |
| Block::make({Let::make(e, 7), |
| Store::make(a_buf, {x}, e), |
| Store::make(b_buf, {x}, e + 1)})); |
| Block::make({f}); |
| Stmt* unrolled = nullptr; |
| LoopNest::unroll(f, &unrolled); |
| std::ostringstream oss; |
| oss << *unrolled; |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: int e = 7; |
| # CHECK: A[0] = e; |
| # CHECK: B[0] = e + 1; |
| # CHECK: A[1] = e; |
| # CHECK: B[1] = e + 1; |
| # CHECK: A[2] = e; |
| # CHECK: B[2] = e + 1;)IR"; |
| |
| torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| |
| std::vector<int> a_v(kTotalSize, 0); |
| std::vector<int> b_v(kTotalSize, 0); |
| SimpleIREvaluator eval(unrolled, a_buf, b_buf); |
| eval(a_v, b_v); |
| for (int i = 0; i < kTotalSize; ++i) { |
| ASSERT_EQ(a_v[i], 7); |
| ASSERT_EQ(b_v[i], 8); |
| } |
| } |
| |
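| // LoopNest::normalize rewrites a loop so that its index starts at 0 and |
| // adjusts every use of the loop variable in the body, e.g. |
| // for (int x = 50; x < 100; x++) { A[x] = ...; } |
| // becomes |
| // for (int x = 0; x < 50; x++) { A[x + 50] = ...; } |
| // The tests below cover positive, negative, zero, and symbolic start values |
| // as well as nested loops. |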
| void testNormalizeStartPositive() { |
| KernelScope kernel_scope; |
| |
| // Input IR: |
| // for (int x = 50; x < 100; x++) { |
| // A[x] = B[x]; |
| // B[x] = x * 2; |
| // } |
| const int kTotalSize = 50; |
| BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt); |
| BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kInt); |
| VarHandle x("x", kInt); |
| auto for_body = |
| Block::make({Store::make(a_buf, {x}, Load::make(kInt, b_buf, {x}, 1), 1), |
| Store::make(b_buf, {x}, x * 2)}); |
| auto for_stmt = For::make(x, 50, 100, for_body); |
| Block::make({for_stmt}); |
| |
| For* normalized = nullptr; |
| LoopNest::normalize(for_stmt, &normalized); |
| |
| auto result = IRSimplifier::simplify(normalized); |
| std::ostringstream oss; |
| oss << *result; |
| const std::string& expected_ir = |
| R"IR( |
| # CHECK: for (int x = 0; x < 50; x++) { |
| # CHECK: A[x + 50] = B[x + 50]; |
| # CHECK: B[x + 50] = 2 * (x + 50); |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| |
| void testNormalizeStartNegative() { |
| KernelScope kernel_scope; |
| |
| // Input IR: |
| // for (int x = -50; x < 100; x++) { |
| // A[x + 50] = B[x + 50]; |
| // B[x + 50] = x * 2; |
| // } |
| const int kTotalSize = 150; |
| BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt); |
| BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kInt); |
| VarHandle x("x", kInt); |
| auto for_body = Block::make( |
| {Store::make(a_buf, {x + 50}, Load::make(kInt, b_buf, {x + 50}, 1), 1), |
| Store::make(b_buf, {x + 50}, x * 2)}); |
| auto for_stmt = For::make(x, -50, 100, for_body); |
| Block::make({for_stmt}); |
| |
| For* normalized = nullptr; |
| LoopNest::normalize(for_stmt, &normalized); |
| |
| auto result = IRSimplifier::simplify(normalized); |
| std::ostringstream oss; |
| oss << *result; |
| const std::string& expected_ir = |
| R"IR( |
| # CHECK: for (int x = 0; x < 150; x++) { |
| # CHECK: A[x] = B[x]; |
| # CHECK: B[x] = 2 * (x - 50); |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| |
| void testNormalizeStartZero() { |
| KernelScope kernel_scope; |
| |
| // Input IR: |
| // for (int x = 0; x < 100; x++) { |
| // A[x] = B[x]; |
| // B[x] = x * 2; |
| // } |
| // Should not be modified. |
| |
| const int kTotalSize = 100; |
| BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt); |
| BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kInt); |
| VarHandle x("x", kInt); |
| auto for_body = |
| Block::make({Store::make(a_buf, {x}, Load::make(kInt, b_buf, {x}, 1), 1), |
| Store::make(b_buf, {x}, x * 2)}); |
| auto for_stmt = For::make(x, 0, 100, for_body); |
| Block::make({for_stmt}); |
| |
| For* normalized = nullptr; |
| LoopNest::normalize(for_stmt, &normalized); |
| |
| auto result = IRSimplifier::simplify(normalized); |
| std::ostringstream oss; |
| oss << *result; |
| const std::string& expected_ir = |
| R"IR( |
| # CHECK: for (int x = 0; x < 100; x++) { |
| # CHECK: A[x] = B[x]; |
| # CHECK: B[x] = 2 * x; |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| |
| void testNormalizeStartVariable() { |
| KernelScope kernel_scope; |
| |
| // Input IR: |
| // for (int x = y; x < 100; x++) { |
| // A[x] = B[x]; |
| // B[x] = x * 2; |
| // } |
| |
| const int kTotalSize = 100; |
| BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt); |
| BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kInt); |
| VarHandle x("x", kInt); |
| VarHandle y("y", kInt); |
| auto for_body = |
| Block::make({Store::make(a_buf, {x}, Load::make(kInt, b_buf, {x}, 1), 1), |
| Store::make(b_buf, {x}, x * 2)}); |
| auto for_stmt = For::make(x, y, 100, for_body); |
| Block::make({for_stmt}); |
| |
| For* normalized = nullptr; |
| LoopNest::normalize(for_stmt, &normalized); |
| |
| auto result = IRSimplifier::simplify(normalized); |
| std::ostringstream oss; |
| oss << *result; |
| const std::string& expected_ir = |
| R"IR( |
| # CHECK: for (int x = 0; x < 100 - y; x++) { |
| # CHECK: A[y + x] = B[y + x]; |
| # CHECK: B[y + x] = 2 * (y + x); |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| |
| void testNormalizeOnNestedOuterLoop() { |
| KernelScope kernel_scope; |
| |
| // Input IR: |
| // for (int x = 50; x < 100; x++) { |
| // for (int y = 10; y < 100; y++) { |
| // A[x] = A[x] + B[y] + y * 2; |
| // } |
| // } |
| |
| BufHandle a_buf("A", {ExprHandle(50)}, kInt); |
| BufHandle b_buf("B", {ExprHandle(100)}, kInt); |
| VarHandle x("x", kInt); |
| VarHandle y("y", kInt); |
| auto inner_for_body = Store::make( |
| a_buf, |
| {x}, |
| Load::make(a_buf, {x}, 1) + Load::make(b_buf, {y}, 1) + y * 2, |
| 1); |
| auto inner_for = For::make(y, 10, 100, inner_for_body); |
| auto for_stmt = For::make(x, 50, 100, inner_for); |
| Block::make({for_stmt}); |
| |
| For* normalized = nullptr; |
| LoopNest::normalize(for_stmt, &normalized); |
| |
| auto result = IRSimplifier::simplify(normalized); |
| std::ostringstream oss; |
| oss << *result; |
| const std::string& expected_ir = |
| R"IR( |
| # CHECK: for (int x = 0; x < 50; x++) { |
| # CHECK: for (int y = 10; y < 100; y++) { |
| # CHECK: A[x + 50] = ((B[y]) + (A[x + 50])) + 2 * y; |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| |
| void testNormalizeOnNestedInnerLoop() { |
| KernelScope kernel_scope; |
| |
| // Input IR: |
| // for (int x = 50; x < 100; x++) { |
| // for (int y = 10; y < 100; y++) { |
| // A[x] = A[x] + B[y] + y * 2; |
| // } |
| // } |
| |
| BufHandle a_buf("A", {ExprHandle(50)}, kInt); |
| BufHandle b_buf("B", {ExprHandle(100)}, kInt); |
| VarHandle x("x", kInt); |
| VarHandle y("y", kInt); |
| auto inner_for_body = Store::make( |
| a_buf, |
| {x}, |
| Load::make(a_buf, {x}, 1) + Load::make(b_buf, {y}, 1) + y * 2, |
| 1); |
| auto inner_for = For::make(y, 10, 100, inner_for_body); |
| auto for_stmt = For::make(x, 50, 100, inner_for); |
| Block::make({for_stmt}); |
| |
| For* normalized = nullptr; |
| LoopNest::normalize(inner_for, &normalized); |
| |
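| // Simplify and print the enclosing outer loop so the normalized inner loop |
| // is shown in context. |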
| auto result = IRSimplifier::simplify(for_stmt); |
| std::ostringstream oss; |
| oss << *result; |
| const std::string& expected_ir = |
| R"IR( |
| # CHECK: for (int x = 50; x < 100; x++) { |
| # CHECK: for (int y = 0; y < 90; y++) { |
| # CHECK: A[x] = (((B[y + 10]) + (A[x])) + 2 * y) + 20; |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| |
| void testNormalizeAndSplitWithTail() { |
| KernelScope kernel_scope; |
| |
| // Create a dummy tensor to construct LoopNest. |
| ExprHandle n(100); |
| Buffer a(BufHandle("a", {n}, kFloat)); |
| Tensor* b = |
| Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a(i); }); |
| LoopNest l({b}); |
| |
| // Input IR: |
| // for (int x = 5; x < 10; x++) { |
| // A[x] = x * 2; |
| // } |
| const int kTotalSize = 5; |
| BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt); |
| VarHandle x("x", kInt); |
| auto for_stmt = For::make(x, 5, 10, Store::make(a_buf, {x}, x * 2)); |
| Block::make({for_stmt}); |
| |
| For* normalized = nullptr; |
| LoopNest::normalize(for_stmt, &normalized); |
| |
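| // After normalization the loop runs for 5 iterations (x = 0..4). Splitting |
| // with a factor of 10 therefore leaves an empty outer loop and puts all 5 |
| // iterations into the tail loop. |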
| For* x_outer; |
| For* x_inner; |
| For* x_tail; |
| l.splitWithTail(normalized, 10, &x_outer, &x_inner, &x_tail); |
| |
| auto x_outer_result = IRSimplifier::simplify(x_outer); |
| std::ostringstream oss_outer; |
| oss_outer << *x_outer_result; |
| const std::string& expected_outer_ir = |
| R"IR( |
| # CHECK: { |
| # CHECK: } |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_outer_ir, oss_outer.str()); |
| |
| auto x_tail_result = IRSimplifier::simplify(x_tail); |
| std::ostringstream oss_tail; |
| oss_tail << *x_tail_result; |
| const std::string& expected_tail_ir = |
| R"IR( |
| # CHECK: for (int x_tail = 0; x_tail < 5; x_tail++) { |
| # CHECK: A[x_tail + 5] = 2 * (x_tail + 5); |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_tail_ir, oss_tail.str()); |
| } |
| |
| } // namespace jit |
| } // namespace torch |