| #include <limits> |
| #include <memory> |
| #include <sstream> |
| #include <stdexcept> |
| #include <unordered_map> |
| #include "test/cpp/tensorexpr/test_base.h" |
| |
| #include "test/cpp/tensorexpr/padded_buffer.h" |
| #include "torch/csrc/jit/tensorexpr/analysis.h" |
| #include "torch/csrc/jit/tensorexpr/buffer.h" |
| #include "torch/csrc/jit/tensorexpr/eval.h" |
| #include "torch/csrc/jit/tensorexpr/function.h" |
| #include "torch/csrc/jit/tensorexpr/ir.h" |
| #include "torch/csrc/jit/tensorexpr/ir_printer.h" |
| #include "torch/csrc/jit/tensorexpr/ir_simplifier.h" |
| #include "torch/csrc/jit/tensorexpr/loopnest.h" |
| #include "torch/csrc/jit/tensorexpr/tensor.h" |
| |
| #include <torch/csrc/jit/testing/file_check.h> |
| |
| namespace torch { |
| namespace jit { |
| |
| using namespace torch::jit::tensorexpr; |
| |
| // Sum an array to a single value. |
| void testReduceSum1D() { |
| KernelScope kernel_scope; |
| |
| Buffer b(BufHandle("b", {10}, kFloat)); |
| std::vector<float> in(10); |
| for (int j = 0; j < 10; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
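  // Reduce with an empty output-dim list produces a single scalar output; the
  // only axis ("m", size 10) is the reduction axis.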
| Tensor* c = Reduce("sum", {}, Sum(), b, {{10, "m"}}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 45); |
| } |
| // Sum a 2D tensor to a 1D tensor with dynamic shapes. |
| void testReduceSum2D() { |
| KernelScope kernel_scope; |
| |
| const int M = 3; |
| const int N = 7; |
| |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| |
| Buffer b(BufHandle("b", {m, n}, kFloat)); |
| std::vector<float> in(M * N); |
| for (int i = 0; i < M; ++i) { |
| for (int j = 0; j < N; ++j) { |
| in[i * N + j] = j; |
| } |
| } |
| |
| std::vector<float> out(M, -1.f); |
| |
| Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, n, m}); |
| |
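  // n and m only parameterize b's shape and are bound at call time; the
  // reduction extents are the static M and N.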
| cg.call({in, out, 5, 7}); |
| |
| float expected = 0; |
| for (int i = 0; i < N; ++i) { |
| expected += i; |
| } |
| |
| for (int i = 0; i < M; ++i) { |
| ASSERT_EQ(out[i], expected); |
| } |
| } |
| |
// Sum a 3D tensor down to both a 2D and a 1D tensor, then reduce the 2D result
// again to check that the answers agree.
| void testReduceSum3D() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| VarHandle m("m", kInt); |
| |
| Buffer b(BufHandle("b", {2, 3, m}, kFloat)); |
| |
| Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m}); |
| |
| std::vector<float> bData(2 * 3 * M, 0); |
| std::vector<float> cData(2 * 3, 6.0f); |
| std::vector<float> dData(2, 1.0f); |
| std::vector<float> eData(2, 1.0f); |
| |
| for (int i = 0; i < 2 * 3; ++i) { |
| for (int j = 0; j < M; ++j) { |
| bData[i * M + j] = j; |
| } |
| } |
| |
| cg.call({bData, cData, M}); |
| float expected = 0; |
| for (int i = 0; i < M; ++i) { |
| expected += i; |
| } |
| |
| for (int i = 0; i < 2 * 3; ++i) { |
| ASSERT_EQ(cData[i], expected); |
| } |
| |
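  // Reduce over both "n" and the dynamic "m" axis at once, leaving only the
  // outer axis of size 2.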
| Tensor* d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}}); |
| LoopNest loop2({d}); |
| loop2.prepareForCodegen(); |
| Stmt* s2 = loop2.root_stmt(); |
| s2 = IRSimplifier::simplify(s2); |
| |
| SimpleIREvaluator cg2(s2, {b, d, m}); |
| cg2.call({bData, dData, M}); |
| |
| // We're combining an additional dimension of 3, so the sum is 3x. |
| expected = expected * 3; |
| |
| for (int i = 0; i < 2; ++i) { |
| ASSERT_EQ(dData[i], expected); |
| } |
| |
| // This is the same as just reducing the original result across that axis. |
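  // c_buf aliases c's output buffer, so the 2D result can feed a second
  // reduction.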
| Buffer c_buf(BufHandle(c->func_var())); |
| Tensor* e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}}); |
| LoopNest loop3({e}); |
| loop3.prepareForCodegen(); |
| Stmt* s3 = loop3.root_stmt(); |
| s3 = IRSimplifier::simplify(s3); |
| |
| SimpleIREvaluator cg3(s3, {c, e}); |
| cg3.call({cData, eData}); |
| |
| for (int i = 0; i < 2; ++i) { |
| ASSERT_EQ(eData[i], expected); |
| } |
| } |
| |
// Sum a large (10-D) tensor over its innermost 5 dimensions.
| void testReduceSum10D() { |
| KernelScope kernel_scope; |
| |
| Buffer in_(BufHandle("in_", {2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, kFloat)); |
| const int InputSize = 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3; |
| Buffer out_(BufHandle("out_", {2, 3, 2, 3, 2}, kFloat)); |
| const int OutputSize = 2 * 3 * 2 * 3 * 2; |
| |
| std::vector<float> in(InputSize, 1.f); |
| std::vector<float> out(OutputSize, -1.f); |
| |
| Tensor* c = Reduce( |
| "sum", |
| {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, |
| Sum(), |
| in_, |
| {{3, "f"}, {2, "g"}, {3, "h"}, {2, "i"}, {3, "j"}}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in_, c}); |
| |
| cg.call({in, out}); |
| |
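  // Every input element is 1.f, so each output equals the number of reduced
  // elements (InputSize / OutputSize).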
| float expected = InputSize / OutputSize; |
| for (int i = 0; i < OutputSize; ++i) { |
| ASSERT_EQ(out[i], expected); |
| } |
| } |
| |
| // Reduce via Mul rather than Add using a custom Reducer. |
| void testReduceProduct() { |
| KernelScope kernel_scope; |
| |
| const int M = 4; |
| const int N = 4; |
| |
| Buffer b(BufHandle("b", {M, N}, kFloat)); |
| std::vector<float> in(M * N); |
| for (int i = 0; i < M; ++i) { |
| for (int j = 0; j < N; ++j) { |
| in[i * N + j] = 2 + j; |
| } |
| } |
| |
| std::vector<float> out(M, -1.f); |
| |
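  // A Reducer pairs an initial value with a binary combiner; here the identity
  // is 1.f and the combiner multiplies.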
| Reducer product( |
| ExprHandle(1.f), [](ExprHandle a, ExprHandle b) { return a * b; }); |
| |
| Tensor* c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| |
| float expected = 1; |
| for (int i = 0; i < N; ++i) { |
| expected *= 2 + i; |
| } |
| |
| for (int i = 0; i < M; ++i) { |
| ASSERT_EQ(out[i], expected); |
| } |
| } |
| |
| // Maximum reductions. |
| void testReduceMax() { |
| KernelScope kernel_scope; |
| |
| Buffer in_(BufHandle("b", {10}, kFloat)); |
| |
| std::vector<float> in(10); |
| std::vector<float> out(1, -1.f); |
| for (int j = 0; j < 10; ++j) { |
| in[j] = j; |
| } |
| |
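  // Maximum takes the dtype so the reducer can supply its own initial value;
  // no explicit initializer is needed here.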
| Tensor* dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}}); |
| |
| LoopNest loop({dm1}); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| SimpleIREvaluator cg(s, {in_, dm1}); |
| |
| cg.call({in, out}); |
| |
| ASSERT_EQ(out[0], 9); |
| |
| Buffer in2_(BufHandle("b", {2, 5}, kFloat)); |
| std::vector<float> out2(2, -1.f); |
| |
| Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); |
| |
| loop = LoopNest({m2d}); |
| loop.prepareForCodegen(); |
| s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg2(s, {in2_, m2d}); |
| cg2.call({in, out2}); |
| |
| ASSERT_EQ(out2[0], 4); |
| ASSERT_EQ(out2[1], 9); |
| } |
| |
| // Minimum reduction, with custom initialization. |
| void testReduceMinCustomInitializer() { |
| KernelScope kernel_scope; |
| |
| VarHandle minInit("minInit", kFloat); |
| Buffer in_(BufHandle("b", {10}, kFloat)); |
| |
| std::vector<float> in(10); |
| std::vector<float> out(1, -1.f); |
| for (int j = 0; j < 10; ++j) { |
| in[j] = 10 + j; |
| } |
| |
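  // The reduction body is given as a lambda over the parameter list; it simply
  // reads the input buffer.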
| Tensor* min = Reduce( |
| "min", |
| {}, |
| Minimum(ExprHandle(minInit)), |
| [&](ParameterList& v) { return in_.call(v); }, |
| {{10, "m"}}); |
| |
| LoopNest loop({min}); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in_, min, minInit}); |
| |
  // Works normally: the reduction starts from the initializer (max float), so
  // the stale contents of out are simply overwritten.
| cg.call({in, out, std::numeric_limits<float>::max()}); |
| ASSERT_EQ(out[0], 10); |
| |
  // With an initializer lower than the true minimum, the initializer wins.
| cg.call({in, out, 5.f}); |
| ASSERT_EQ(out[0], 5); |
| } |
| |
| // Example implementation of Any/All. |
| // TODO: this is very awkward without logical And/Or operators. |
| void testReduceAnyAll() { |
| KernelScope kernel_scope; |
| |
| VarHandle searchValue("searchValue", kInt); |
| Buffer b(BufHandle("b", {4, 10}, kInt)); |
| |
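  // anyEqSV emulates logical OR over {0, 1}: if the accumulator a is already 1,
  // keep 1; otherwise take the new value b. The initial value is 0.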
| Reducer anyEqSV(ExprHandle(0), [](ExprHandle a, ExprHandle b) { |
| return CompareSelect::make(a, 1, 1, b, kEQ); |
| }); |
| |
| Tensor* any = Reduce( |
| "anyEqual", |
| {{4, "i"}}, |
| anyEqSV, |
| [&](const auto& i, const auto& j) { |
| return CompareSelect::make(b(i, j), searchValue, kEQ); |
| }, |
| {{10, "j"}}); |
| |
| LoopNest loop({any}); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, any, searchValue}); |
| |
| std::vector<int> in(40, 0); |
| std::vector<int> out(4, 0); |
| |
| // input has 0-39 in 4 rows. |
| for (int i = 0; i < 40; ++i) { |
| in[i] = i; |
| } |
| cg.call({in, out, 1}); |
| |
| // only the first row has 1 |
| ASSERT_EQ(out[0], 1); |
| ASSERT_EQ(out[1], 0); |
| ASSERT_EQ(out[2], 0); |
| ASSERT_EQ(out[3], 0); |
| |
| cg.call({in, out, 15}); |
| |
  // 15 is in the second row (index 1).
| ASSERT_EQ(out[0], 0); |
| ASSERT_EQ(out[1], 1); |
| ASSERT_EQ(out[2], 0); |
| ASSERT_EQ(out[3], 0); |
| |
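  // allGTSV emulates logical AND over {0, 1}: once the accumulator a is 0 it
  // stays 0; otherwise take the new value b. The initial value is 1.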
| Reducer allGTSV(ExprHandle(1), [](ExprHandle a, ExprHandle b) { |
| return CompareSelect::make(a, 0, 0, b, kEQ); |
| }); |
| |
| Tensor* allGreaterThan = Reduce( |
| "allGreaterThan", |
| {{4, "i"}}, |
| allGTSV, |
| [&](const auto& i, const auto& j) { |
| return CompareSelect::make(b(i, j), searchValue, kGT); |
| }, |
| {{10, "j"}}); |
| |
| loop = LoopNest({allGreaterThan}); |
| loop.prepareForCodegen(); |
| s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg2(s, {b, allGreaterThan, searchValue}); |
| |
| cg2.call({in, out, 11}); |
| |
  // Only rows 2 and 3 are entirely greater than 11.
| ASSERT_EQ(out[0], 0); |
| ASSERT_EQ(out[1], 0); |
| ASSERT_EQ(out[2], 1); |
| ASSERT_EQ(out[3], 1); |
| |
| cg2.call({in, out, -3}); |
| |
  // Every element is greater than -3.
| ASSERT_EQ(out[0], 1); |
| ASSERT_EQ(out[1], 1); |
| ASSERT_EQ(out[2], 1); |
| ASSERT_EQ(out[3], 1); |
| } |
| |
| void testReduceMatmul2D() { |
| KernelScope kernel_scope; |
| |
| Buffer tA(BufHandle("tA", {3, 2}, kFloat)); |
| Buffer tB(BufHandle("tB", {2, 3}, kFloat)); |
| |
| std::vector<float> tA_(6); |
| std::vector<float> tB_(6); |
| |
| std::vector<float> out(9, -1.f); |
| for (int i = 0; i < 3; ++i) { |
| for (int j = 0; j < 2; ++j) { |
| tA_[i * 2 + j] = i * 2 + j; |
| tB_[j * 3 + i] = i * 2 + j; |
| } |
| } |
| |
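  // 3x3 output; the body lambda computes tA(m, k) * tB(k, n) and Sum() reduces
  // over k (size 2).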
| Tensor* mm = Reduce( |
| "mm", |
| {{3, "m"}, {3, "n"}}, |
| Sum(), |
| [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { |
| return tA(m, k) * tB(k, n); |
| }, |
| {{2, "k"}}); |
| |
| LoopNest loop({mm}); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {tA, tB, mm}); |
| cg.call({tA_, tB_, out}); |
| |
| std::vector<float> expected( |
| {1.f, 3.f, 5.f, 3.f, 13.f, 23.f, 5.f, 23.f, 41.f}); |
| |
| for (int i = 0; i < 9; ++i) { |
| ASSERT_EQ(out[i], expected[i]); |
| } |
| } |
| |
| void testReduceRfactorLike() { |
| KernelScope kernel_scope; |
| |
| Buffer in(BufHandle("in", {10, 10}, kFloat)); |
| std::vector<float> in_(100); |
| for (int i = 0; i < 100; ++i) { |
| in_[i] = i; |
| } |
| std::vector<float> in_rf_(10, -2.f); |
| std::vector<float> out(1, -1.f); |
| |
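  // Two-stage reduction: l1 sums each row, then l2 sums the row sums, mirroring
  // what rfactor does automatically.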
| Tensor* l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}}); |
| Buffer in_rf(BufHandle(l1->func_var())); |
| |
| Tensor* l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}}); |
| |
| LoopNest loop({l1, l2}); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in, l1, l2}); |
| cg.call({in_, in_rf_, out}); |
| |
| ASSERT_EQ(out[0], 99 * 50); |
| } |
| |
| void testSplitReduceAxis() { |
| KernelScope kernel_scope; |
| |
| Buffer in(BufHandle("in", {16, 8}, kFloat)); |
| |
| std::vector<float> in_(16 * 8); |
| for (int i = 0; i < 16; ++i) { |
| for (int j = 0; j < 8; ++j) { |
| in_[i * 8 + j] = i; |
| } |
| } |
| std::vector<float> out(16, -1.f); |
| |
| Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); |
| LoopNest l({tensor}); |
| For* x_outer; |
| For* x_inner; |
| For* x_tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
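  // loops[1] is the reduction axis "n"; split it by a factor of 2 with
  // splitWithTail.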
| l.splitWithTail(loops[1], 2, &x_outer, &x_inner, &x_tail); |
| |
| l.prepareForCodegen(); |
| |
| Stmt* s = l.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in, tensor}); |
| cg.call({in_, out}); |
| |
| for (int i = 0; i < 16; ++i) { |
| ASSERT_EQ(out[i], i * 8); |
| } |
| } |
| |
| void testSplitNonReduceAxis() { |
| KernelScope kernel_scope; |
| |
| Buffer in(BufHandle("in", {16, 8}, kFloat)); |
| |
| std::vector<float> in_(16 * 8); |
| for (int i = 0; i < 16; ++i) { |
| for (int j = 0; j < 8; ++j) { |
| in_[i * 8 + j] = i; |
| } |
| } |
| std::vector<float> out(16, -1.f); |
| Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); |
| LoopNest l({tensor}); |
| For* x_outer; |
| For* x_inner; |
| For* x_tail; |
| std::vector<For*> loops = l.getLoopStmtsFor(tensor); |
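  // loops[0] is the non-reduce output axis "m"; split it twice, leaving the
  // reduction axis "n" untouched.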
| l.splitWithTail(loops[0], 2, &x_outer, &x_inner, &x_tail); |
| |
| For* x_2; |
| For* x_1; |
| For* x_tail_2; |
| l.splitWithTail(x_outer, 2, &x_2, &x_1, &x_tail_2); |
| |
| l.prepareForCodegen(); |
| |
| Stmt* s = l.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in, tensor}); |
| cg.call({in_, out}); |
| |
| for (int i = 0; i < 16; ++i) { |
| ASSERT_EQ(out[i], i * 8); |
| } |
| } |
| |
| void testReorderedReductionInitializer() { |
| KernelScope kernel_scope; |
| /* From the quip: |
| for k in 0..1: // blockIdx |
| for m in 0..128: |
| for n in 0..64: // threadIdx |
| SumOp(c(k, n), 0, a(k, m, n), {m}) |
| */ |
| |
| Buffer in(BufHandle("in", {1, 12, 6}, kFloat)); |
| std::vector<float> in_(12 * 6, 1.f); |
| |
| Tensor* tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); |
| LoopNest l_({tensor_}); |
| |
| l_.prepareForCodegen(); |
| Stmt* s_ = Stmt::clone(l_.root_stmt()); |
| s_ = IRSimplifier::simplify(s_); |
| |
| Tensor* tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); |
| LoopNest l({tensor}); |
| |
| auto loops = l.getLoopStmtsFor(tensor); |
| l.setGPUBlockIndex(loops[0], 0); |
| l.setGPUThreadIndex(loops[1], 0); |
| |
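  // Swap the output axis "n" with the reduction axis "m", then compare against
  // the unreordered reference below.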
| l.reorderAxis(loops[1], loops[2]); |
| |
| Stmt* s = l.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| l.prepareForCodegen(); |
| |
| s = l.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| std::vector<float> out1(16, -1.f); |
| SimpleIREvaluator cg(s_, {in, tensor_}); |
| cg.call({in_, out1}); |
| |
| std::vector<float> out2(16, -1.f); |
| SimpleIREvaluator cg2(s, {in, tensor}); |
| cg2.call({in_, out2}); |
| |
| for (int i = 0; i < 16; ++i) { |
| ASSERT_EQ(out1[i], out2[i]); |
| } |
| } |
| |
| void testReduceRfactor() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| |
| Buffer b(BufHandle("b", {m, n}, kFloat)); |
| std::vector<float> in(M * N); |
| for (int j = 0; j < M * N; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
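  // rfactor over the inner reduction var "n"; this introduces a second ReduceOp
  // writing into a temporary buffer.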
| auto v = loops.at(1)->var(); |
| loop.rfactor(c->body(), v); |
| auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(rc.size(), 2); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m, n}); |
| |
| cg.call({in, out, M, N}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| |
| void testReduce3DRfactorInternal() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| VarHandle k("k", kInt); |
| |
| Buffer b(BufHandle("b", {m, n, k}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| auto v = loops.at(1)->var(); |
| loop.rfactor(c->body(), v); |
| auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(rc.size(), 2); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m, n, k}); |
| |
| cg.call({in, out, M, N, K}); |
| ASSERT_EQ(out[0], 499500); |
| } |
| |
| void testReduce3DRfactorInner() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| VarHandle k("k", kInt); |
| |
| Buffer b(BufHandle("b", {m, n, k}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| auto v = loops.at(2)->var(); |
| loop.rfactor(c->body(), v); |
| auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(rc.size(), 2); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m, n, k}); |
| |
| cg.call({in, out, M, N, K}); |
| ASSERT_EQ(out[0], 499500); |
| } |
| |
| void testReduce3DRfactorOuter() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| VarHandle k("k", kInt); |
| |
| Buffer b(BufHandle("b", {m, n, k}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| auto v = loops.at(0)->var(); |
| loop.rfactor(c->body(), v); |
| auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(rc.size(), 2); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m, n, k}); |
| cg.call({in, out, M, N, K}); |
| ASSERT_EQ(out[0], 499500); |
| } |
| |
| void testReduce3DRfactorWithOuter() { |
| KernelScope kernel_scope; |
| |
| const int L = 5; |
| const int M = 5; |
| const int N = 5; |
| const int K = 5; |
| VarHandle l("l", kInt); |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| VarHandle k("k", kInt); |
| |
| Buffer b(BufHandle("b", {l, m, n, k}, kFloat)); |
| std::vector<float> in(L * M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(L, -1.f); |
| |
| Tensor* c = |
| Reduce("sum", {{l, "l"}}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| auto v = loops.at(3)->var(); |
| loop.rfactor(c->body(), v); |
| auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(rc.size(), 2); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, l, m, n, k}); |
| cg.call({in, out, L, M, N, K}); |
| ASSERT_EQ(out[0], 7750); |
| } |
| |
| void testReduce3DRfactorRepeated() { |
| KernelScope kernel_scope; |
| |
| const int M = 5; |
| const int N = 5; |
| const int K = 5; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| VarHandle k("k", kInt); |
| |
| Buffer b(BufHandle("b", {m, n, k}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); |
| |
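  // Try each choice of first rfactor axis (rVar1), then rfactor the first
  // ReduceOp of the result again (rVar2).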
| for (int rVar1 = 0; rVar1 < 3; ++rVar1) { |
| for (int rVar2 = 0; rVar2 < 2; ++rVar2) { |
| std::vector<float> out(1, -1.f); |
| |
| LoopNest loop({c}); |
| auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(reduces.size(), 1); |
| auto v1 = reduces[0]->reduce_args()[rVar1]; |
| loop.rfactor(reduces[0], v1); |
| |
| reduces = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(reduces.size(), 2); |
| auto v2 = reduces[0]->reduce_args()[rVar2]; |
| loop.rfactor(reduces[0], v2); |
| |
| reduces = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(reduces.size(), 3); |
| |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m, n, k}); |
| |
| cg.call({in, out, M, N, K}); |
| ASSERT_EQ(out[0], 7750); |
| } |
| } |
| } |
| |
| void testReduceRfactorInsertionPoint() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| |
| Buffer b(BufHandle("b", {m, n}, kFloat)); |
| std::vector<float> in(M * N); |
| for (int j = 0; j < M * N; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
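  // rfactor over the outer var "m", explicitly placing the rfactored statement
  // inside loops[0]'s body.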
| auto v = loops.at(0)->var(); |
| loop.rfactor(c->body(), v, loops.at(0)->body()); |
| auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(rc.size(), 2); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m, n}); |
| |
| cg.call({in, out, M, N}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| |
| void testReduce3DRfactorInsertionPoint() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| VarHandle k("k", kInt); |
| |
| Buffer b(BufHandle("b", {m, n, k}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(M, -1.f); |
| |
| Tensor* c = Reduce("sum", {{m, "m"}}, Sum(), b, {{n, "n"}, {k, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| auto v = loops.at(1)->var(); |
| loop.rfactor(c->body(), v, loops.at(1)->body()); |
| auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(rc.size(), 2); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m, n, k}); |
| cg.call({in, out, M, N, K}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| |
| void testReduceRepeatedInternalRfactor() { |
| KernelScope kernel_scope; |
| |
| Buffer in_(BufHandle("in_", {2, 3, 4, 5, 6}, kFloat)); |
| const int InputSize = 2 * 3 * 4 * 5 * 6; |
| |
| std::vector<float> in(InputSize, 1.f); |
| std::vector<float> out(1, -1.f); |
| std::vector<float> ref(1, -1.f); |
| |
| Tensor* c = Reduce( |
| "sum", |
| {}, |
| Sum(), |
| in_, |
| {{2, "a"}, {3, "b"}, {4, "c"}, {5, "d"}, {6, "e"}}); |
| LoopNest refloop({c}); |
| refloop.prepareForCodegen(); |
| SimpleIREvaluator ref_cg( |
| IRSimplifier::simplify(refloop.root_stmt()), {in_, c}); |
| ref_cg.call({in, ref}); |
| |
| LoopNest loop({c}); |
| |
  // rfactor the first ReduceOp over its reduce arg at index 3.
| auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| loop.rfactor(reduces[0], reduces[0]->reduce_args()[3]); |
| |
  // rfactor the updated first ReduceOp over its reduce arg at index 1.
| reduces = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| loop.rfactor(reduces[0], reduces[0]->reduce_args()[1]); |
| |
  // rfactor once more over the reduce arg at index 1.
| reduces = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| loop.rfactor(reduces[0], reduces[0]->reduce_args()[1]); |
| |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in_, c}); |
| cg.call({in, out}); |
| |
| ASSERT_EQ(ref[0], out[0]); |
| } |
| |
| // Split a reduction axis with a tail loop. |
| void testReduceSplitTail() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| |
| Buffer b(BufHandle("b", {M, N, K}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (int i = 0; i < 3; ++i) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| For *outer, *inner, *tail; |
| loop.splitWithTail(loops[i], 8, &outer, &inner, &tail); |
| |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
| // Split a reduction axis cleanly so there is no tail loop. |
| void testReduceSplitNoTail() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| Buffer b(BufHandle("b", {M, N, K}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (int i = 0; i < 3; ++i) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| For *outer, *inner, *tail; |
| loop.splitWithTail(loops[i], 5, &outer, &inner, &tail); |
| |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
// Split a reduction axis with only a tail loop (the split loop will have size 0
// and be eliminated).
| void testReduceOverSplitTail() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| |
| Buffer b(BufHandle("b", {M, N, K}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (int i = 0; i < 3; ++i) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| For *outer, *inner, *tail; |
| loop.splitWithTail(loops[i], 16, &outer, &inner, &tail); |
| |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
| // Split a reduction axis with a mask. |
| void testReduceSplitMask() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| |
| Buffer b(BufHandle("b", {M, N, K}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (int i = 0; i < 3; ++i) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| For *outer, *inner; |
| loop.splitWithMask(loops[i], 8, &outer, &inner); |
| |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
// Split a reduction axis cleanly so that no mask is required.
| void testReduceSplitNoMask() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| Buffer b(BufHandle("b", {M, N, K}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (int i = 0; i < 3; ++i) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| For *outer, *inner; |
| loop.splitWithMask(loops[i], 5, &outer, &inner); |
| |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
| // Split a reduction axis with all logic in the mask. |
| void testReduceOverSplitMask() { |
| KernelScope kernel_scope; |
| |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| |
| Buffer b(BufHandle("b", {M, N, K}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (int i = 0; i < 3; ++i) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| For *outer, *inner; |
| loop.splitWithMask(loops[i], 16, &outer, &inner); |
| |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
| // Test an rfactor when there are two ReduceOps in the graph due to a |
| // splitWithTail. |
| void testReduceSplitRfactor() { |
| KernelScope kernel_scope; |
| |
| const int M = 2; |
| const int N = 10; |
| const int K = 10; |
| const int SPLIT_FACTOR = 4; |
| |
| Buffer b(BufHandle("b", {M, N, K}, kFloat)); |
| std::vector<float> in(M * N * K); |
| for (int m = 0; m < M; ++m) { |
| for (int j = 0; j < N * K; ++j) { |
| in[m * N * K + j] = j; |
| } |
| } |
| |
| std::vector<float> out(M, -1.f); |
| |
| Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| For *o, *i, *t; |
| loop.splitWithTail(loops[2], SPLIT_FACTOR, &o, &i, &t); |
| |
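  // The split leaves main and tail ReduceOps in the graph; rfactor the first
  // one over its last remaining reduce arg.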
| auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| loop.rfactor(reduces[0], reduces[0]->reduce_args().back()); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
  for (int idx = 0; idx < M; ++idx) {
    ASSERT_EQ(out[idx], 4950);
  }
| } |
| |
| // Test an rfactor which ends up being eliminated since the total loop size is |
| // smaller than the split factor. |
| void testReduceOverSplitRfactor() { |
| KernelScope kernel_scope; |
| |
| const int N = 10; |
| const int K = 10; |
| const int SPLIT_FACTOR = 16; |
| |
| Buffer b(BufHandle("b", {N, K}, kFloat)); |
| std::vector<float> in(N * K); |
| for (int j = 0; j < N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor* c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}}); |
| LoopNest loop({c}); |
| std::vector<For*> loops = loop.getLoopStmtsFor(c); |
| For *o, *i, *t; |
| loop.splitWithTail(loops[1], SPLIT_FACTOR, &o, &i, &t); |
| |
| auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| loop.rfactor(reduces[0], reduces[0]->reduce_args().back()); |
| loop.prepareForCodegen(); |
| Stmt* s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| |
| std::ostringstream oss; |
| oss << *s; |
| |
  // Check the IR to verify the rfactored reduce is eliminated.
  // TODO: the Allocate/Free pair should also be eliminated since the buffer has
  // size 0.
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: Allocate(tmp_buf, float, {0}); |
| # CHECK: sum[0] = 0.f; |
| # CHECK: for (int n = 0; n < 10; n++) { |
| # CHECK: for (int k_tail = 0; k_tail < 10; k_tail++) { |
| # CHECK: sum[0] = (sum[0]) + (b[k_tail + 10 * n]); |
| # CHECK: } |
| # CHECK: } |
| # CHECK: Free(tmp_buf);)IR"; |
| // TODO: rfactor output is not consistent yet, will fix (@nickg). |
| // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| } |
| |
| } // namespace jit |
| } // namespace torch |