| #if defined(USE_CUDA) |
| #include <test/cpp/jit/test_base.h> |
| |
| #include <torch/csrc/jit/codegen/cuda/arith.h> |
| #include <torch/csrc/jit/codegen/cuda/expr_evaluator.h> |
| #include <torch/csrc/jit/codegen/cuda/fusion.h> |
| #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h> |
| #include <torch/csrc/jit/codegen/cuda/ir_graphviz.h> |
| #include <torch/csrc/jit/codegen/cuda/ir_iostream.h> |
| #include <torch/csrc/jit/codegen/cuda/kernel.h> |
| #include <torch/csrc/jit/codegen/cuda/lower2device.h> |
| #include <torch/csrc/jit/codegen/cuda/mutator.h> |
| #include <torch/csrc/jit/codegen/cuda/tensor_meta.h> |
| #include <torch/csrc/jit/codegen/cuda/transform_replay.h> |
| #include <torch/csrc/jit/codegen/cuda/transform_rfactor.h> |
| |
| // fuser and IR parser |
| #include <torch/csrc/jit/codegen/cuda/parser.h> |
| #include <torch/csrc/jit/ir/irparser.h> |
| |
| #include <iostream> |
| |
| // Tests go in torch::jit |
| namespace torch { |
| namespace jit { |
| |
| using namespace torch::jit::fuser; |
| |
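| // Helper: builds an nDims-dimensional TensorView whose axes all start at 0 |
| // and have symbolic (unbound) extents, i.e. IterDomain(new Int(0), new Int()) |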
| static TensorView* makeDummyTensor( |
| int nDims, |
| DataType dtype = DataType::Float) { |
| std::vector<IterDomain*> dom; |
| for (int i = 0; i < nDims; i++) |
| dom.push_back(new IterDomain(new Int(0), new Int())); |
| |
| return new TensorView(new TensorDomain(dom), dtype); |
| } |
| |
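| // Helper: evaluates an Int expression in the given context and checks that |
| // it resolves to the expected constant value |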
| static void checkIntValue( |
| const EvaluationContext* eval_context, |
| const Val* val, |
| Int::ScalarType expected_value) { |
| TORCH_CHECK(val->isAnInt()); |
| const auto actual_value = ExpressionEvaluator::evaluate(val, eval_context); |
| TORCH_CHECK(actual_value.has_value()); |
| TORCH_CHECK(actual_value.value() == expected_value); |
| } |
| |
| // 1. Test cases are void() functions. |
| // 2. They start with the prefix `test` |
| |
| // A few smoke tests for IrGraphGenerator |
| // (These tests exercise IrGraphGenerator through a non-trivial IR |
| // to make sure it runs without crashing. The actual output is not |
| // validated.) |
| void testGPU_IrGraphGenerator() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| // Make sure we can handle empty IRs |
| TORCH_CHECK(!IrGraphGenerator::toGraphviz( |
| &fusion, IrGraphGenerator::DetailLevel::Basic) |
| .empty()); |
| |
| // Construct an interesting IR |
| TensorView* tv0 = makeDummyTensor(2); |
| fusion.addInput(tv0); |
| |
| TensorView* tv1 = mul(tv0, new Float(-1.0)); |
| TensorView* tv2 = add(tv0, new Float(3.0)); |
| TensorView* tv3 = mul(tv0, new Float(2.5)); |
| TensorView* tv4 = add(tv2, tv1); |
| TensorView* tv5 = add(tv4, tv3); |
| TensorView* tv6 = add(tv0, tv3); |
| |
| // Another checkpoint before adding outputs |
| TORCH_CHECK(!IrGraphGenerator::toGraphviz( |
| &fusion, IrGraphGenerator::DetailLevel::Explicit) |
| .empty()); |
| |
| fusion.addOutput(tv5); |
| fusion.addOutput(tv6); |
| |
| tv6->merge(0); |
| tv6->split(0, 4); |
| tv6->axis(0)->parallelize(ParallelType::BIDx); |
| tv5->reorder({{-1, 0}}); |
| tv0->computeAt(tv3, 1); |
| tv0->computeAt(tv6, 1); |
| |
| // Another checkpoint with more node types |
| TORCH_CHECK(!IrGraphGenerator::toGraphviz( |
| &fusion, IrGraphGenerator::DetailLevel::ComputeOnly) |
| .empty()); |
| |
| for (Val* val : fusion.vals()) { |
| if (!fusion.hasInput(val) && |
| val->getValType().value() == ValType::TensorView) { |
| TensorView* tv = static_cast<TensorView*>(val); |
| tv->axis(-1)->parallelize(ParallelType::TIDx); |
| } |
| } |
| |
| // Final IR graph |
| TORCH_CHECK(!IrGraphGenerator::toGraphviz( |
| &fusion, IrGraphGenerator::DetailLevel::Verbose) |
| .empty()); |
| } |
| |
| void testGPU_FusionDispatch() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f = new Float{2.f}; |
| std::stringstream ss1, ss2, ss3; |
| ss1 << f; |
| ss2 << static_cast<Val*>(f); |
| ss3 << static_cast<Statement*>(f); |
| TORCH_CHECK( |
| ss1.str().compare(ss2.str()) == 0 && ss1.str().compare(ss3.str()) == 0, |
| "Error with dispatch system where results differ by passing Float* vs Val* vs Statement*."); |
| } |
| |
| // Evaluate basic scalar operations with constant values |
| void testGPU_FusionExprEvalConstants() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| EvaluationContext eval_context(&fusion); |
| |
| auto* a = new Int(7); |
| auto* b = new Int(3); |
| |
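| // Worked out by hand with a = 7, b = 3 (integer arithmetic): |
| // -((7 - 3) * (7 / 3)) = -(4 * 2) = -8, 7 % 3 = 1, ceilDiv(7, 3) = 3 |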
| checkIntValue(&eval_context, neg(a), -7); |
| checkIntValue(&eval_context, add(a, b), 10); |
| checkIntValue(&eval_context, neg(mul(sub(a, b), div(a, b))), -8); |
| checkIntValue(&eval_context, mod(a, b), 1); |
| checkIntValue(&eval_context, ceilDiv(a, b), 3); |
| } |
| |
| // Evaluate basic scalar operations with bound values |
| void testGPU_FusionExprEvalBindings() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| EvaluationContext eval_context(&fusion); |
| |
| auto* a = new Int(); |
| auto* b = new Int(); |
| auto* c = add(a, b); |
| auto* d = neg(ceilDiv(add(a, b), b)); |
| |
| eval_context.bind(a, 7); |
| eval_context.bind(b, 3); |
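| // With a = 7, b = 3: c = 7 + 3 = 10 and d = -ceilDiv(10, 3) = -4 |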
| |
| checkIntValue(&eval_context, c, 10); |
| checkIntValue(&eval_context, sub(a, b), 4); |
| checkIntValue(&eval_context, mod(a, b), 1); |
| checkIntValue(&eval_context, ceilDiv(a, b), 3); |
| checkIntValue(&eval_context, d, -4); |
| |
| eval_context.bind(a, 2); |
| eval_context.bind(b, 5); |
| |
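| // After rebinding to a = 2, b = 5: c = 2 + 5 = 7 and d = -ceilDiv(7, 5) = -2 |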
| checkIntValue(&eval_context, c, 7); |
| checkIntValue(&eval_context, sub(a, b), -3); |
| checkIntValue(&eval_context, mod(a, b), 2); |
| checkIntValue(&eval_context, ceilDiv(a, b), 1); |
| checkIntValue(&eval_context, d, -2); |
| } |
| |
| // Evaluate expressions in a simple IR |
| void testGPU_FusionExprEvalBasic() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| // Create a non-trivial IR |
| TensorView* tv0 = makeDummyTensor(2); |
| TensorView* tv1 = makeDummyTensor(2); |
| |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| |
| TensorView* tv2 = add(tv1, new Float(2.0)); |
| TensorView* tv3 = add(tv0, tv2); |
| |
| fusion.addOutput(tv3); |
| |
| tv3->split(0, 4); |
| |
| tv0->computeAt(tv3, 1); |
| tv1->computeAt(tv3, 1); |
| |
| tv3->axis(0)->parallelize(ParallelType::BIDx); |
| tv2->axis(1)->parallelize(ParallelType::Unroll); |
| tv3->axis(1)->parallelize(ParallelType::Unroll); |
| tv2->axis(-1)->parallelize(ParallelType::TIDx); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
| |
| // 1. Create an evaluation context |
| EvaluationContext eval_context(&fusion); |
| |
| // 2. Bind values |
| // |
| // IMPORTANT: |
| // a. The bindings are only as stable as the Vals are in the fusion graph |
| // b. You must use the original (rootDomain) extents |
| // (ex. `tv0->getRootDomain()[0]->extent()` |
| // instead of `tv0->axis(0)->extent()`) |
| // |
| eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); |
| eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); |
| eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); |
| eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); |
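| // With root extents {6, 128}, the split(0, 4) above leaves both tv2 and tv3 |
| // with leaf extents [ceilDiv(6, 4), 4, 128] = [2, 4, 128] |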
| |
| // 3. Evaluate and check result values |
| TORCH_CHECK(tv2->domain()->nDims() == 3); |
| checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); |
| checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); |
| checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); |
| |
| TORCH_CHECK(tv3->domain()->nDims() == 3); |
| checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); |
| checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); |
| checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); |
| } |
| |
| // Evaluate expressions in a more complex IR |
| void testGPU_FusionExprEvalComplex() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(2); |
| fusion.addInput(tv0); |
| |
| TensorView* tv1 = mul(tv0, new Float(-1.0)); |
| TensorView* tv2 = add(tv0, new Float(3.0)); |
| TensorView* tv3 = mul(tv0, new Float(2.0)); |
| TensorView* tv4 = add(tv2, tv1); |
| TensorView* tv5 = add(tv4, tv3); |
| TensorView* tv6 = add(tv0, tv3); |
| |
| fusion.addOutput(tv5); |
| fusion.addOutput(tv6); |
| |
| tv5->reorder({{-1, 0}}); |
| |
| tv6->split(0, 5); |
| tv5->merge(0); |
| |
| // 1. Create an evaluation context |
| EvaluationContext eval_context(&fusion); |
| |
| // 2. Bind values |
| eval_context.bind(tv0->getRootDomain()[0]->extent(), 129); |
| eval_context.bind(tv0->getRootDomain()[1]->extent(), 127); |
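| // tv5's merge(0) gives a single axis of extent 129 * 127 = 16383, while |
| // tv6's split(0, 5) gives extents [ceilDiv(129, 5), 5, 127] = [26, 5, 127] |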
| |
| // Evaluate and check extent values |
| TORCH_CHECK(tv0->domain()->nDims() == 2); |
| checkIntValue(&eval_context, tv0->axis(0)->rawExtent(), 129); |
| checkIntValue(&eval_context, tv0->axis(1)->rawExtent(), 127); |
| |
| TORCH_CHECK(tv3->domain()->nDims() == 2); |
| checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 129); |
| checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 127); |
| |
| TORCH_CHECK(tv4->domain()->nDims() == 2); |
| checkIntValue(&eval_context, tv4->axis(0)->rawExtent(), 129); |
| checkIntValue(&eval_context, tv4->axis(1)->rawExtent(), 127); |
| |
| TORCH_CHECK(tv5->domain()->nDims() == 1); |
| checkIntValue(&eval_context, tv5->axis(0)->rawExtent(), 16383); |
| |
| TORCH_CHECK(tv6->domain()->nDims() == 3); |
| checkIntValue(&eval_context, tv6->axis(0)->rawExtent(), 26); |
| checkIntValue(&eval_context, tv6->axis(1)->rawExtent(), 5); |
| checkIntValue(&eval_context, tv6->axis(2)->rawExtent(), 127); |
| } |
| |
| void testGPU_FusionSimpleArith() { |
| std::stringstream ss1, ss2; |
| |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f1 = new Float(1.f); |
| Float* f2 = new Float{2.f}; |
| Float* f3 = new Float(); |
| |
| // Disrupt the fusion to make sure the guard works properly |
| { |
| Fusion fusion2; |
| FusionGuard fg(&fusion2); |
| |
| Float* f1 = new Float(1.f); |
| Float* f2 = new Float(2.f); |
| add(f1, f2); |
| ss2 << fusion2; |
| } |
| |
| new BinaryOp(BinaryOpType::Add, f3, f1, f2); |
| ss1 << fusion; |
| |
| TORCH_CHECK( |
| ss1.str().compare(ss2.str()) == 0, |
| "Error where explicit add nodes don't match implicit add nodes."); |
| } |
| |
| void testGPU_FusionSimpleTypePromote() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f4 = new Float{4.f}; |
| Int* i1 = new Int{3}; |
| auto f5 = add(f4, i1); |
| |
| TORCH_CHECK(f5->getDataType() == DataType::Float); |
| } |
| |
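| // Example OptOutMutator: rewrites every constant Float of value 1.0 to 0.0 |
| // and leaves all other statements untouched |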
| class ZeroMutator : public OptOutMutator { |
| public: |
| Statement* mutate(Float* f) { |
| if (f->isConst() && *(f->value()) == 1.0) |
| return new Float(0.0); |
| return f; |
| } |
| void mutate(Fusion* f) { |
| OptOutMutator::mutate(f); |
| } |
| }; |
| |
| void testGPU_FusionMutator() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f4 = new Float{1.f}; |
| Int* i1 = new Int{3}; |
| Val* f5 = add(f4, i1); |
| ZeroMutator mutator; |
| mutator.mutate(&fusion); |
| Val* lhs = static_cast<BinaryOp*>(fusion.origin(f5))->lhs(); |
| TORCH_CHECK( |
| lhs->getValType().value() == ValType::Scalar && |
| lhs->getDataType().value() == DataType::Float); |
| Float* flhs = static_cast<Float*>(lhs); |
| |
| TORCH_CHECK(flhs->value().value() == 0.f); |
| } |
| |
| void testGPU_FusionRegister() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| Float* v1 = new Float{1.f}; |
| Float* v2 = new Float{2.f}; |
| Val* v3 = binaryOp(BinaryOpType::Add, v1, v2); |
| Val* v4 = binaryOp(BinaryOpType::Add, v1, v2); |
| TORCH_CHECK(v1->name() + 1 == v2->name()); |
| TORCH_CHECK(v2->name() + 1 == v3->name()); |
| TORCH_CHECK(v3->name() + 1 == v4->name()); |
| TORCH_CHECK(fusion.origin(v3)->name() + 1 == fusion.origin(v4)->name()); |
| } |
| |
| // Dummy expr with two outputs, used only by the toposort test. |
| struct DummyExpr : public Expr { |
| ~DummyExpr() = default; |
| DummyExpr(Val* _outlhs, Val* _outrhs, Val* _lhs, Val* _rhs) |
| : Expr(ExprType::UnaryOp) // Not terribly safe... |
| { |
| addOutput(_outlhs); |
| addOutput(_outrhs); |
| addInput(_lhs); |
| addInput(_rhs); |
| this->name_ = FusionGuard::getCurFusion()->registerExpr(this); |
| } |
| DummyExpr(const DummyExpr& other) = delete; |
| DummyExpr& operator=(const DummyExpr& other) = delete; |
| DummyExpr(DummyExpr&& other) = delete; |
| DummyExpr& operator=(DummyExpr&& other) = delete; |
| }; |
| |
| void testGPU_FusionTopoSort() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| // e0: v3, v2 = dummy(v1, v0) |
| // e1: v4 = add(v3, v2) |
| // e2: v5 = add(v2, v4) |
| // e3: v6 = add(v5, v5) |
| Float* v0 = new Float{1.f}; |
| Float* v1 = new Float{2.f}; |
| Float* v2 = new Float(); |
| Float* v3 = new Float(); |
| Float* v4 = new Float(); |
| Float* v5 = new Float(); |
| Float* v6 = new Float(); |
| |
| Expr* e0 = new DummyExpr(v3, v2, v1, v0); |
| Expr* e1 = new BinaryOp(BinaryOpType::Add, v4, v3, v2); |
| Expr* e2 = new BinaryOp(BinaryOpType::Add, v5, v2, v4); |
| Expr* e3 = new BinaryOp(BinaryOpType::Add, v6, v5, v5); |
| |
| std::vector<Expr*> exprs = fusion.exprs(); |
| |
| TORCH_CHECK(exprs.size() == 4); |
| TORCH_CHECK(exprs[0] == e0); |
| TORCH_CHECK(exprs[1] == e1); |
| TORCH_CHECK(exprs[2] == e2); |
| TORCH_CHECK(exprs[3] == e3); |
| |
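| // fusion.exprs(true) only returns expressions reachable from the registered |
| // outputs, so the visible set grows as more outputs are added below |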
| fusion.addOutput(v2); |
| exprs = fusion.exprs(true); |
| TORCH_CHECK(exprs.size() == 1); |
| TORCH_CHECK(exprs[0] == e0); |
| |
| fusion.addOutput(v5); |
| exprs = fusion.exprs(true); |
| TORCH_CHECK(exprs[0] == e0); |
| TORCH_CHECK(exprs[1] == e1); |
| TORCH_CHECK(exprs[2] == e2); |
| |
| fusion.addOutput(v4); |
| exprs = fusion.exprs(true); |
| TORCH_CHECK(exprs[0] == e0); |
| TORCH_CHECK(exprs[1] == e1); |
| TORCH_CHECK(exprs[2] == e2); |
| |
| fusion.addOutput(v3); |
| exprs = fusion.exprs(true); |
| TORCH_CHECK(exprs[0] == e0); |
| TORCH_CHECK(exprs[1] == e1); |
| TORCH_CHECK(exprs[2] == e2); |
| |
| fusion.addOutput(v6); |
| exprs = fusion.exprs(true); |
| TORCH_CHECK(exprs.size() == 4); |
| TORCH_CHECK(exprs[0] == e0); |
| TORCH_CHECK(exprs[1] == e1); |
| TORCH_CHECK(exprs[2] == e2); |
| TORCH_CHECK(exprs[3] == e3); |
| |
| TORCH_CHECK(fusion.origin(v2)->name() == 0); |
| TORCH_CHECK(fusion.origin(v3)->name() == 0); |
| TORCH_CHECK(fusion.origin(v4)->name() == 1); |
| TORCH_CHECK(fusion.origin(v5)->name() == 2); |
| TORCH_CHECK(fusion.origin(v6)->name() == 3); |
| } |
| |
| void testGPU_FusionTensor() { |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| auto tensor = at::randn({2, 3, 4, 5}, options); |
| auto sizes = tensor.sizes().vec(); |
| auto tensor_type = TensorType::create(tensor); |
| |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| auto fuser_tensor = new TensorView(tensor_type); |
| TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); |
| TORCH_CHECK(fuser_tensor->domain() != nullptr); |
| } |
| |
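| // As exercised by the checks below: canCollapseToHigher(i) reports whether |
| // dims i and i+1 are adjacent in memory (mergeable), getFCD() returns the |
| // fastest-changing dim, and merge() combines the contiguity info of two |
| // descriptors |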
| void testGPU_FusionTensorContiguity() { |
| { |
| // NCHW memory layout |
| auto tensor = at::randn({2, 3, 4, 5}); |
| auto sizes = tensor.sizes().vec(); |
| auto strides = tensor.strides().vec(); |
| TensorContiguity t_c(sizes, strides); |
| TORCH_CHECK(t_c.rank() == 4); |
| TORCH_CHECK(t_c.getBroadcastDims().size() == 0); |
| for (int i = 0; i < 4; i++) { |
| TORCH_CHECK(!t_c.isBroadcastDim(i)); |
| if (i < 3) { |
| TORCH_CHECK(t_c.canCollapseToHigher(i)); |
| } |
| } |
| } |
| |
| { |
| // NHWC memory layout |
| TensorContiguity t_c({2, 3, 4, 5}, {60, 1, 15, 3}); |
| TORCH_CHECK(t_c.rank() == 4); |
| TORCH_CHECK(t_c.getBroadcastDims().size() == 0); |
| for (int i = 0; i < 4; i++) { |
| TORCH_CHECK(!t_c.isBroadcastDim(i)); |
| if (i < 3) { |
| TORCH_CHECK((t_c.canCollapseToHigher(i) ^ (i != 2))); |
| } |
| } |
| } |
| |
| { |
| // NHWC memory layout with broadcast |
| TensorContiguity t_c({2, 3, 4, 5}, {120, 0, 30, 3}); |
| TORCH_CHECK(t_c.rank() == 4); |
| auto b_dims = t_c.getBroadcastDims(); |
| TORCH_CHECK(b_dims.size() == 1 && b_dims[0] == 1); |
| for (int i = 0; i < 4; i++) { |
| TORCH_CHECK(!(t_c.isBroadcastDim(i)) ^ (i == 1)); |
| if (i < 3) { |
| TORCH_CHECK(!(t_c.canCollapseToHigher(i))); |
| } |
| } |
| } |
| |
| { |
| // contiguity across size-1 dimension |
| auto tensor = at::randn({4, 1, 4}); |
| auto sizes = tensor.sizes().vec(); |
| auto strides = tensor.strides().vec(); |
| auto dim = sizes.size(); |
| TensorContiguity t_c(sizes, strides); |
| TORCH_CHECK(t_c.rank() == (int)sizes.size()); |
| auto b_dims = t_c.getBroadcastDims(); |
| TORCH_CHECK(b_dims.size() == 0); |
| TORCH_CHECK(t_c.getFCD() == 2); |
| TORCH_CHECK(t_c.hasContiguousFCD()); |
| for (decltype(dim) i = 0; i < dim; i++) { |
| TORCH_CHECK(!t_c.isBroadcastDim(i)); |
| if (i < dim - 1) { |
| TORCH_CHECK(t_c.canCollapseToHigher(i)); |
| } |
| } |
| } |
| |
| { |
| // no contiguity across size-1 dimension |
| auto tensor = at::randn({4, 4, 4}).split(1, 1)[0]; |
| auto sizes = tensor.sizes().vec(); |
| auto strides = tensor.strides().vec(); |
| TensorContiguity t_c(sizes, strides); |
| TORCH_CHECK(!(t_c.canCollapseToHigher(0))); |
| TORCH_CHECK((t_c.canCollapseToHigher(1))); |
| } |
| |
| { |
| // no contiguity across size-1 dimension |
| auto tensor = at::randn({4, 1, 8}).split(4, 2)[0]; |
| auto sizes = tensor.sizes().vec(); |
| auto strides = tensor.strides().vec(); |
| TensorContiguity t_c(sizes, strides); |
| TORCH_CHECK((t_c.canCollapseToHigher(0))); |
| TORCH_CHECK((!t_c.canCollapseToHigher(1))); |
| } |
| |
| { |
| // no contiguity across size-1 dimension |
| auto tensor = at::randn({8, 1, 4}).split(4, 0)[0]; |
| auto sizes = tensor.sizes().vec(); |
| auto strides = tensor.strides().vec(); |
| TensorContiguity t_c(sizes, strides); |
| TORCH_CHECK((t_c.canCollapseToHigher(0))); |
| TORCH_CHECK((t_c.canCollapseToHigher(1))); |
| } |
| |
| { |
| // test merge |
| TensorContiguity t_c_l({4, 4, 4}, {16, 4, 1}); |
| TensorContiguity t_c_r({4, 4, 4}, {16, 4, 1}); |
| t_c_l.merge(t_c_r); |
| TORCH_CHECK((t_c_l.isIdentical(t_c_r))); |
| } |
| |
| { |
| TensorContiguity t_c_l({4, 4, 4, 4}, {16, 0, 4, 1}); |
| TensorContiguity t_c_r({4, 4, 4, 4}, {64, 16, 4, 1}); |
| t_c_l.merge(t_c_r); |
| TORCH_CHECK(t_c_l.getFCD() == 3); |
| TORCH_CHECK(t_c_l.getAxisByStride(0) == 0); |
| } |
| |
| { |
| // NHWC + NCHW |
| TensorContiguity t_c_l({4, 4, 4, 4}, {64, 16, 4, 1}); |
| TensorContiguity t_c_r({4, 4, 4, 4}, {64, 1, 16, 4}); |
| t_c_l.merge(t_c_r); |
| TORCH_CHECK(!t_c_l.hasContiguousFCD()); |
| TORCH_CHECK(t_c_l.getFCD() == -1); |
| TORCH_CHECK(t_c_l.getAxisByStride(0) == 0); |
| TORCH_CHECK(t_c_l.getAxisByStride(1) == -1); |
| TORCH_CHECK(t_c_l.getAxisByStride(2) == -1); |
| TORCH_CHECK(t_c_l.getAxisByStride(3) == -1); |
| } |
| |
| { |
| // NCHW + NCHW with broadcasting |
| TensorContiguity t_c_l({4, 4, 4, 4}, {4, 1, 4, 0}); |
| TensorContiguity t_c_r({4, 4, 4, 4}, {64, 1, 16, 4}); |
| t_c_l.merge(t_c_r); |
| TORCH_CHECK(t_c_l.getFCD() == 1); |
| TORCH_CHECK(t_c_l.getAxisByStride(0) == 0); |
| } |
| } |
| |
| void testGPU_FusionTVSplit() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv = makeDummyTensor(3); |
| |
| tv = tv->split(2, 2); |
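| // split(2, 2) replaces axis 2 with an outer axis of extent ceilDiv(I2, 2) |
| // and an inner axis of constant extent 2, for 4 dims total |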
| TORCH_CHECK(tv->nDims() == 4); |
| Expr* outer = tv->axis(2)->extent()->getOrigin(); |
| |
| TORCH_CHECK( |
| outer->getExprType().value() == ExprType::BinaryOp && |
| static_cast<BinaryOp*>(outer)->getBinaryOpType() == |
| BinaryOpType::CeilDiv && |
| static_cast<BinaryOp*>(outer)->lhs()->sameAs( |
| tv->getRootDomain()[2]->extent()) && |
| static_cast<Int*>(static_cast<BinaryOp*>(outer)->rhs()) |
| ->sameAs(new Int(2))); |
| |
| IterDomain* inner = static_cast<IterDomain*>(tv->axis(3)); |
| TORCH_CHECK( |
| inner->extent()->isScalar() && |
| static_cast<Int*>(inner->extent())->isConst() && |
| static_cast<Int*>(inner->extent())->value().value() == 2); |
| } |
| |
| void testGPU_FusionTVMerge() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv = makeDummyTensor(3); |
| |
| tv = tv->merge(1); |
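| // merge(1) fuses axes 1 and 2 into a single axis of extent I1 * I2 |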
| Expr* axisOp = tv->axis(1)->extent()->getOrigin(); |
| |
| TORCH_CHECK( |
| tv->nDims() == 2 && axisOp->getExprType() == ExprType::BinaryOp && |
| static_cast<BinaryOp*>(axisOp)->getBinaryOpType() == BinaryOpType::Mul && |
| static_cast<BinaryOp*>(axisOp)->lhs() == |
| tv->getRootDomain()[1]->extent() && |
| static_cast<BinaryOp*>(axisOp)->rhs() == |
| tv->getRootDomain()[2]->extent()); |
| } |
| |
| void testGPU_FusionTVReorder() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| std::unordered_map<int, int> shift_right{{-1, 0}}; |
| |
| std::unordered_map<int, int> shift_left{{0, -1}}; |
| |
| std::unordered_map<int, int> shift_left_2{{0, -1}, {1, 0}, {2, 1}}; |
| |
| std::unordered_map<int, int> swap{{0, 2}, {2, 0}}; |
| |
| auto tv = makeDummyTensor(3); |
| std::vector<IterDomain*> ref; |
| ref = std::vector<IterDomain*>( |
| tv->domain()->domain().begin(), tv->domain()->domain().end()); |
| |
| tv->reorder(shift_left); |
| for (int i = 0; i < (int)tv->nDims(); i++) |
| TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1))); |
| |
| tv = makeDummyTensor(3); |
| ref = std::vector<IterDomain*>( |
| tv->domain()->domain().begin(), tv->domain()->domain().end()); |
| |
| tv->reorder(shift_left_2); // full spelling of the same left shift |
| for (int i = 0; i < (int)tv->nDims(); i++) |
| TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1))); |
| |
| tv = makeDummyTensor(3); |
| ref = std::vector<IterDomain*>( |
| tv->domain()->domain().begin(), tv->domain()->domain().end()); |
| |
| tv->reorder(shift_right); |
| TORCH_CHECK(ref[ref.size() - 1]->sameAs(tv->axis(0))); |
| for (int i = 1; i < (int)tv->nDims(); i++) |
| TORCH_CHECK(ref[i - 1]->sameAs(tv->axis(i))); |
| |
| tv = makeDummyTensor(3); |
| ref = std::vector<IterDomain*>( |
| tv->domain()->domain().begin(), tv->domain()->domain().end()); |
| tv->reorder(swap); |
| TORCH_CHECK(ref[0]->sameAs(tv->axis(2))); |
| TORCH_CHECK(ref[2]->sameAs(tv->axis(0))); |
| TORCH_CHECK(ref[1]->sameAs(tv->axis(1))); |
| } |
| |
| void testGPU_FusionEquality() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* fval1 = new Float(); |
| Float* fval1_copy = fval1; |
| Float* fval2 = new Float(); |
| Float* fone = new Float(1.0); |
| |
| TORCH_CHECK(fval1->sameAs(fval1_copy)); |
| TORCH_CHECK(!fval1->sameAs(fval2)); |
| TORCH_CHECK(!fone->sameAs(fval1)); |
| TORCH_CHECK(fone->sameAs(new Float(1.0))); |
| |
| Int* ival1 = new Int(); |
| Int* ival1_copy = ival1; |
| Int* ival2 = new Int(); |
| Int* ione = new Int(1); |
| |
| TORCH_CHECK(ival1->sameAs(ival1_copy)); |
| TORCH_CHECK(!ival1->sameAs(ival2)); |
| TORCH_CHECK(!ione->sameAs(ival1)); |
| TORCH_CHECK(ione->sameAs(new Int(1))); |
| |
| BinaryOp* add1 = new BinaryOp(BinaryOpType::Add, new Float(), fval1, ival1); |
| BinaryOp* add1_copy = |
| new BinaryOp(BinaryOpType::Add, new Float(), fval1, ival1); |
| BinaryOp* sub1 = new BinaryOp(BinaryOpType::Sub, new Float(), fval1, ival1); |
| |
| UnaryOp* neg1 = new UnaryOp(UnaryOpType::Neg, new Float(), fval1); |
| UnaryOp* neg2 = new UnaryOp(UnaryOpType::Neg, new Float(), fval2); |
| UnaryOp* neg1_copy = new UnaryOp(UnaryOpType::Neg, new Float(), fval1); |
| |
| TORCH_CHECK(add1->sameAs(add1_copy)); |
| TORCH_CHECK(!add1->sameAs(sub1)); |
| |
| TORCH_CHECK(neg1->sameAs(neg1_copy)); |
| TORCH_CHECK(!static_cast<Expr*>(neg1)->sameAs(add1)); |
| TORCH_CHECK(!neg1->sameAs(neg2)); |
| } |
| |
| void testGPU_FusionReplaceAll() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f0 = new Float(); |
| Float* f1 = new Float{1.f}; |
| Float* f2 = new Float{2.f}; |
| Float* f3 = new Float(); |
| Float* f4 = static_cast<Float*>(add(f1, f0)); |
| |
| // replace the output f4 with f3 |
| ReplaceAll::instancesOf(f4, f3); |
| // f3 should now have an origin function |
| TORCH_CHECK(fusion.origin(f3) != nullptr); |
| |
| // Should have removed f4 completely so we shouldn't have any other expr than |
| // f3 construction |
| TORCH_CHECK(fusion.exprs().size() == 1); |
| |
| // Replace constant Float's of value 1.f with 2.f |
| ReplaceAll::instancesOf(f1, f2); |
| BinaryOp* bop = static_cast<BinaryOp*>(fusion.origin(f3)); |
| // make sure the binary op (origin of f3) actually changed to 2.f |
| TORCH_CHECK(static_cast<Float*>(bop->lhs())->sameAs(new Float{2.f})); |
| } |
| |
| void testGPU_FusionDependency() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f0 = new Float(0.f); |
| Float* f1 = new Float(1.f); |
| auto f2 = add(f0, f1); |
| |
| auto f3 = add(f2, f2); |
| |
| Float* f4 = new Float(4.f); |
| Float* f5 = new Float(5.f); |
| auto f6 = add(f4, f5); |
| |
| Float* f7 = new Float(7.f); |
| Float* f8 = new Float(8.f); |
| auto f9 = add(f7, f8); |
| |
| auto f10 = add(f6, f9); |
| |
| auto f11 = add(f3, f10); |
| |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f0, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f1, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f2, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f3, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f6, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f9, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f0, f2)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f2, f3)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f4, f6)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f8, f10)); |
| |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f0)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f1)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f2)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f3)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f4)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f5)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f2, f0)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f3, f2)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f6, f4)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f10, f8)); |
| |
| auto dep_chain = DependencyCheck::getSingleDependencyChain(f0, f11); |
| TORCH_CHECK(dep_chain.back() == f11); |
| dep_chain.pop_back(); |
| TORCH_CHECK(dep_chain.back() == f3); |
| dep_chain.pop_back(); |
| TORCH_CHECK(dep_chain.back() == f2); |
| dep_chain.pop_back(); |
| |
| dep_chain = DependencyCheck::getSingleDependencyChain(f6, f11); |
| TORCH_CHECK(dep_chain.back() == f11); |
| dep_chain.pop_back(); |
| TORCH_CHECK(dep_chain.back() == f10); |
| dep_chain.pop_back(); |
| |
| dep_chain = DependencyCheck::getSingleDependencyChain(f4, f11); |
| TORCH_CHECK(dep_chain.back() == f11); |
| dep_chain.pop_back(); |
| TORCH_CHECK(dep_chain.back() == f10); |
| dep_chain.pop_back(); |
| TORCH_CHECK(dep_chain.back() == f6); |
| dep_chain.pop_back(); |
| |
| dep_chain = DependencyCheck::getSingleDependencyChain(f11, f2); |
| TORCH_CHECK(dep_chain.empty()); |
| } |
| |
| void testGPU_FusionParser() { |
| auto g = std::make_shared<Graph>(); |
| const auto graph0_string = R"IR( |
| graph(%0 : Float(2:1), |
| %1 : Float(2:1)): |
| %c0 : Float(2:1) = aten::mul(%0, %1) |
| %d0 : Float(2:1) = aten::mul(%c0, %0) |
| return (%d0))IR"; |
| torch::jit::parseIR(graph0_string, g.get()); |
| |
| // strides are not yet supported in the irparser. |
| for (auto val : g->block()->inputs()) { |
| if (val->isCompleteTensor()) |
| val->setType(val->type()->cast<TensorType>()->contiguous()); |
| } |
| for (auto node : g->block()->nodes()) { |
| for (auto val : node->outputs()) { |
| if (val->isCompleteTensor()) |
| val->setType(val->type()->cast<TensorType>()->contiguous()); |
| } |
| } |
| |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| // These can be set to anything as there are no bindings! |
| // All CTAs and threads execute the same thing. |
| prog.grid(4); |
| prog.block(32); |
| prog.device_ = 0; |
| fuser::cuda::parseJitIR(g, &prog); |
| |
| std::stringstream ref; |
| ref << "__global__ void CUDAGeneratedKernel(Tensor<float, 1> T0, Tensor<float, 1> T1, Tensor<float, 1> T3){\n" |
| << " float T2[4];\n" |
| << " if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T3.size[0] ) ) { \n" |
| << " for(size_t i60 = 0; i60 < 4; ++i60 ) {\n" |
| << " T2[ i60 ]\n" |
| << " = T0[ ( ( ( ( ( blockIdx.x * 4 ) + i60 ) * 128 ) + threadIdx.x ) * T0.stride[0] ) ]\n" |
| << " * T1[ ( ( ( ( ( blockIdx.x * 4 ) + i60 ) * 128 ) + threadIdx.x ) * T1.stride[0] ) ];\n" |
| << " }\n" |
| << " } else { \n" |
| << " for(size_t i60 = 0; i60 < 4; ++i60 ) {\n" |
| << " if ( ( ( ( ( ( blockIdx.x * 4 ) + i60 ) * 128 ) + threadIdx.x ) < T3.size[0] ) ) { \n" |
| << " T2[ i60 ]\n" |
| << " = T0[ ( ( ( ( ( blockIdx.x * 4 ) + i60 ) * 128 ) + threadIdx.x ) * T0.stride[0] ) ]\n" |
| << " * T1[ ( ( ( ( ( blockIdx.x * 4 ) + i60 ) * 128 ) + threadIdx.x ) * T1.stride[0] ) ];\n" |
| << " }\n" |
| << " }\n" |
| << " }\n" |
| << " if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T3.size[0] ) ) { \n" |
| << " for(size_t i61 = 0; i61 < 4; ++i61 ) {\n" |
| << " T3[ ( ( ( ( ( blockIdx.x * 4 ) + i61 ) * 128 ) + threadIdx.x ) * T3.stride[0] ) ]\n" |
| << " = T2[ i61 ]\n" |
| << " * T0[ ( ( ( ( ( blockIdx.x * 4 ) + i61 ) * 128 ) + threadIdx.x ) * T0.stride[0] ) ];\n" |
| << " }\n" |
| << " } else { \n" |
| << " for(size_t i61 = 0; i61 < 4; ++i61 ) {\n" |
| << " if ( ( ( ( ( ( blockIdx.x * 4 ) + i61 ) * 128 ) + threadIdx.x ) < T3.size[0] ) ) { \n" |
| << " T3[ ( ( ( ( ( blockIdx.x * 4 ) + i61 ) * 128 ) + threadIdx.x ) * T3.stride[0] ) ]\n" |
| << " = T2[ i61 ]\n" |
| << " * T0[ ( ( ( ( ( blockIdx.x * 4 ) + i61 ) * 128 ) + threadIdx.x ) * T0.stride[0] ) ];\n" |
| << " }\n" |
| << " }\n" |
| << " }\n" |
| << "}\n"; |
| |
| GPULower gpulw(&fusion); |
| std::stringstream cdg; |
| gpulw.printKernel(cdg); |
| if (ref.str().size() != cdg.str().size() || |
| ref.str().compare(cdg.str()) != 0) { |
| std::cerr |
| << " Codegen mismatch, codegen possibly changed, or is incorrect. " |
| << " \n ========= REF ========= \n" |
| << ref.str() << "\n========= RESULT ========== \n" |
| << cdg.str() << "\n=================" << std::endl; |
| TORCH_CHECK(false); |
| } |
| } |
| |
| void testGPU_FusionForLoop() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| const auto TV0 = new TensorView( |
| new TensorDomain({new IterDomain(new Int(0), new Int(16))}), |
| DataType::Float); |
| const auto TV1 = new TensorView( |
| new TensorDomain({new IterDomain(new Int(0), new Int(16))}), |
| DataType::Float); |
| |
| fusion.addInput(TV0); |
| fusion.addInput(TV1); |
| |
| auto ID0 = new IterDomain(new Int(0), new Int(8)); |
| |
| TensorView* TV2 = add(TV0, TV1); |
| BinaryOp* op = static_cast<BinaryOp*>(TV2->getOrigin()); |
| fusion.addOutput(TV2); |
| |
| ForLoop* fl = new ForLoop(new Int(), ID0, {op}); |
| std::stringstream result; |
| std::stringstream ref; |
| result << fl; |
| ref << "for(size_t i3{0}; i3 < iS{8}; ++i3 ) {\nT2[ iS{16} ] = T0[ iS{16} ] + T1[ iS{16} ]\n}"; |
| |
| if (result.str().compare(ref.str()) != 0) { |
| std::stringstream err_msg; |
| err_msg << "ForLoop printing has changed or something has gone wrong. " |
| << result.str() << "\n does not match reference: " << ref.str() |
| << std::endl; |
| TORCH_CHECK(false, err_msg.str()); |
| } |
| } |
| |
| void testGPU_FusionCodeGen() { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(3); |
| |
| new BinaryOp(BinaryOpType::Add, tv0, new Float(0.0), new Float(1.0)); |
| TensorView* tv1 = add(tv0, new Float(2.0)); |
| TensorView* tv2 = add(tv1, new Float(3.0)); |
| fusion.addOutput(tv2); |
| |
| //[I0, I1, I2] |
| tv2 = tv2->split(0, 4); |
| //[I0o, I0i{4}, I1, I2] |
| tv2 = tv2->merge(1); |
| //[I0o, I0i{4}*I1, I2] |
| tv2 = tv2->split(-1, 2); |
| //[I0o, I0i{4}*I1, I2o, I2i{2}] |
| tv2 = tv2->reorder({{0, 1}, {1, 0}, {3, 2}}); |
| //[I0i{4}*I1, I0o, I2i{2}, I2o] |
| |
| tv0->computeAt(tv2, -1); |
| |
| prog.device_ = 0; |
| // These can be set to anything as there are no bindings! |
| // All CTAs and threads execute the same thing. |
| prog.grid(4); |
| prog.block(32); |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor output = at::empty({16, 8, 8}, options); |
| |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, {}, {output}); |
| |
| at::Tensor output_ref = at::zeros_like(output, options); |
| output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0; |
| |
| TORCH_CHECK(output_ref.equal(output)); |
| } |
| |
| void testGPU_FusionCodeGen2() { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(3); |
| TensorView* tv1 = makeDummyTensor(3); |
| TensorView* tv2 = add(tv1, new Float(2.0)); |
| TensorView* tv3 = add(tv0, tv2); |
| |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| fusion.addOutput(tv3); |
| |
| //[I0, I1, I2] |
| tv3->reorder({{0, 2}, {2, 0}}); |
| //[I2, I1, I0] |
| tv3->split(-1, 4); |
| //[I2, I1, I0o, I0i{4}] |
| tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); |
| //[I0o, I0i{4}, I1, I2] |
| |
| tv0->computeAt(tv3, -1); |
| tv1->computeAt(tv3, -1); |
| |
| tv3->axis(0)->parallelize(ParallelType::BIDx); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
| |
| prog.device_ = 0; |
| prog.grid(4); |
| prog.block(8); |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor input1 = at::randn({16, 8, 8}, options); |
| at::Tensor input2 = at::randn_like(input1); |
| at::Tensor output = at::empty_like(input1); |
| |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, {input1, input2}, {output}); |
| |
| at::Tensor tv2_ref = input2 + 2.0; |
| at::Tensor output_ref = input1 + tv2_ref; |
| |
| TORCH_CHECK(output_ref.equal(output)); |
| } |
| |
| void testGPU_FusionSimplePWise() { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| // dimensionality of the problem |
| int nDims = 3; |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(nDims); |
| TensorView* tv1 = makeDummyTensor(nDims); |
| |
| // Register your inputs |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| |
| // Do math with it; the result is a `Val*` that can be static_cast back to |
| // a TensorView |
| TensorView* tv2 = add(tv1, new Float(2.0)); |
| TensorView* tv3 = add(tv0, tv2); |
| |
| // Register your outputs |
| fusion.addOutput(tv3); |
| |
| // Do transformations; remember, transformations are applied from outputs |
| // to inputs. They don't have to be in this order. |
| tv3->merge(1); |
| tv3->merge(0); |
| |
| // Split by n_threads |
| tv3->split(-1, 128 * 2); |
| tv3->split(-1, 128); |
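| // The merges flatten the domain to [N] with N = I0 * I1 * I2; the two splits |
| // then produce [ceilDiv(N, 256), 2, 128] to match the launch config below |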
| |
| // For all inputs, computeAt the output inline; temporaries should be |
| // squeezed between them |
| tv0->computeAt(tv3, -1); |
| tv1->computeAt(tv3, -1); |
| |
| // Parallelize TV3 |
| tv3->axis(0)->parallelize(ParallelType::BIDx); |
| tv3->axis(-2)->parallelize(ParallelType::TIDy); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
| |
| prog.device_ = 0; |
| prog.grid(64); // 64 CTAs |
| prog.block(128, 2); // 256 Threads |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor input1 = at::randn({64, 2, 128}, options); |
| at::Tensor input2 = at::rand_like(input1); |
| at::Tensor output = at::empty_like(input1); |
| |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, {input1, input2}, {output}); |
| |
| at::Tensor tv2_ref = input2 + 2.0; |
| at::Tensor output_ref = input1 + tv2_ref; |
| |
| TORCH_CHECK(output_ref.equal(output)); |
| } |
| |
| void testGPU_FusionExecKernel() { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(2); |
| TensorView* tv1 = makeDummyTensor(2); |
| |
| // Register your inputs |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| |
| // Do math with it; the result is a `Val*` that can be static_cast back to |
| // a TensorView |
| TensorView* tv2 = add(tv1, new Float(2.0)); |
| TensorView* tv3 = add(tv0, tv2); |
| |
| // Register your outputs |
| fusion.addOutput(tv3); |
| |
| tv3->merge(0); |
| tv3->split(0, 128); |
| tv3->split(0, 4); |
| |
| // For all inputs, computeAt the output inline; temporaries should be |
| // squeezed between them |
| tv0->computeAt(tv3, 1); |
| tv1->computeAt(tv3, 1); |
| |
| // Parallelize TV3 |
| tv3->axis(0)->parallelize(ParallelType::BIDx); |
| tv2->axis(1)->parallelize(ParallelType::Unroll); |
| tv3->axis(1)->parallelize(ParallelType::Unroll); |
| tv2->axis(-1)->parallelize(ParallelType::TIDx); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
| |
| prog.device_ = 0; |
| prog.grid(1); // 1 CTA |
| prog.block(128); // 128 Threads |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor input1 = at::ones({1, 128}, options); |
| at::Tensor input2 = at::ones_like(input1); |
| |
| at::Tensor output = at::empty_like(input1); |
| |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, {input1, input2}, {output}); |
| |
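| // Both inputs are ones, so every element should be 1 + (1 + 2) = 4 |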
| at::Tensor check = at::full({1, 128}, 4, options); |
| TORCH_CHECK(output.equal(check)); |
| } |
| |
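| // Integer ceiling division used below to size launch grids, |
| // e.g. ceilDiv_(7, 2) == 4 |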
| int ceilDiv_(int a, int b) { |
| return (a + b - 1) / b; |
| } |
| |
| void testGPU_FusionAdvancedComputeAt() { |
| // Case 1 |
| /* |
| * tv1 = tv0 * -1 |
| * tv2 = tv0 + 3 |
| * tv3 = tv0 * 2 |
| * tv4 = tv2 + tv1 |
| * tv5 = tv4 + tv3 |
| * tv6 = tv0 + tv3 |
| */ |
| { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(2); |
| fusion.addInput(tv0); |
| |
| TensorView* tv1 = mul(tv0, new Float(-1.0)); |
| TensorView* tv2 = add(tv0, new Float(3.0)); |
| TensorView* tv3 = mul(tv0, new Float(2.0)); |
| TensorView* tv4 = add(tv2, tv1); |
| |
| TensorView* tv5 = add(tv4, tv3); |
| TensorView* tv6 = add(tv0, tv3); |
| |
| fusion.addOutput(tv5); |
| fusion.addOutput(tv6); |
| |
| tv0->computeAt(tv3, 1); |
| |
| // Check propagation of this computeAt. |
| TORCH_CHECK(tv0->getComputeAtView() == tv3); |
| TORCH_CHECK(tv1->getComputeAtView() == tv4); |
| TORCH_CHECK(tv2->getComputeAtView() == tv4); |
| TORCH_CHECK(tv3->getComputeAtView() == tv6); |
| TORCH_CHECK(tv4->getComputeAtView() == tv5); |
| TORCH_CHECK(tv5->getComputeAtView() == tv6); |
| TORCH_CHECK(!tv6->hasComputeAt()); |
| |
| // Lets setup to actually run |
| tv6->merge(0); |
| tv6->split(0, 128); |
| tv6->split(0, 4); |
| |
| tv6->axis(0)->parallelize(ParallelType::BIDx); |
| |
| tv0->computeAt(tv6, 1); |
| |
| TORCH_CHECK(tv0->getComputeAtView() == tv3 && tv0->nDims() == 3); |
| TORCH_CHECK(tv1->getComputeAtView() == tv4 && tv1->nDims() == 3); |
| TORCH_CHECK(tv2->getComputeAtView() == tv4 && tv2->nDims() == 3); |
| TORCH_CHECK(tv3->getComputeAtView() == tv6 && tv3->nDims() == 3); |
| TORCH_CHECK(tv4->getComputeAtView() == tv5 && tv4->nDims() == 3); |
| TORCH_CHECK(tv5->getComputeAtView() == tv6 && tv5->nDims() == 3); |
| TORCH_CHECK(!tv6->hasComputeAt()); |
| |
| for (Val* val : fusion.vals()) { |
| if (!fusion.hasInput(val) && |
| val->getValType().value() == ValType::TensorView) { |
| TensorView* tv = static_cast<TensorView*>(val); |
| tv->axis(1)->parallelize(ParallelType::Unroll); |
| tv->axis(-1)->parallelize(ParallelType::TIDx); |
| } |
| } |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor t0 = at::randn({129, 127}, options); |
| |
| auto t1 = t0.mul({-1.0}); |
| auto t2 = t0.add({3.0}); |
| auto t3 = t0.mul({2.0}); |
| auto t4 = t2.add(t1); |
| auto t5 = t4.add(t3); |
| auto t6 = t0.add(t3); |
| |
| at::Tensor kernel_tv5 = at::empty_like(t0, options); |
| at::Tensor kernel_tv6 = at::empty_like(t0, options); |
| |
| prog.device_ = 0; |
| |
| int blocks = ceilDiv_( |
| ceilDiv_(t0.numel(), 128), 4); // numel / unroll factor / threads |
| prog.grid(blocks); |
| prog.block(128); |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel( |
| &prog, {t0}, {kernel_tv5, kernel_tv6}); |
| |
| TORCH_CHECK(at::allclose(kernel_tv5, t5)); |
| TORCH_CHECK(at::allclose(kernel_tv6, t6)); |
| } |
| |
| // Case 2 |
| /* |
| * tv1 = tv0 * -1 |
| * tv2 = tv0 + 3 |
| * tv3 = tv0 * 2 |
| * tv4 = tv2 + tv1 |
| * tv5 = tv4 + tv3 |
| * tv6 = tv5 + tv3 |
| */ |
| { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(2); |
| fusion.addInput(tv0); |
| |
| TensorView* tv1 = mul(tv0, new Float(-1.0)); |
| TensorView* tv2 = add(tv0, new Float(3.0)); |
| TensorView* tv3 = mul(tv0, new Float(2.0)); |
| TensorView* tv4 = add(tv2, tv1); |
| |
| TensorView* tv5 = add(tv4, tv3); |
| TensorView* tv6 = add(tv5, tv3); |
| |
| fusion.addOutput(tv5); |
| fusion.addOutput(tv6); |
| |
| tv2->computeAt(tv4, 1); |
| TORCH_CHECK(!tv0->hasComputeAt()); |
| TORCH_CHECK(!tv1->hasComputeAt()); |
| TORCH_CHECK(tv2->getComputeAtView() == tv4); |
| TORCH_CHECK(!tv3->hasComputeAt()); |
| TORCH_CHECK(!tv4->hasComputeAt()); |
| TORCH_CHECK(!tv5->hasComputeAt()); |
| TORCH_CHECK(!tv6->hasComputeAt()); |
| |
| // Lets setup to actually run |
| tv6->merge(0); |
| tv6->split(0, 128); |
| tv6->split(0, 4); |
| |
| tv6->axis(0)->parallelize(ParallelType::BIDx); |
| |
| tv0->computeAt(tv6, 1); |
| |
| for (Val* val : fusion.vals()) { |
| if (!fusion.hasInput(val) && |
| val->getValType().value() == ValType::TensorView) { |
| TensorView* tv = static_cast<TensorView*>(val); |
| |
| tv->axis(1)->parallelize(ParallelType::Unroll); |
| tv->axis(-1)->parallelize(ParallelType::TIDx); |
| } |
| } |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| at::Tensor t0 = at::randn({129, 127}, options); |
| |
| auto t1 = t0.mul({-1.0}); |
| auto t2 = t0.add({3.0}); |
| auto t3 = t0.mul({2.0}); |
| auto t4 = t2.add(t1); |
| auto t5 = t4.add(t3); |
| auto t6 = t5.add(t3); |
| |
| at::Tensor kernel_tv5 = at::empty_like(t0, options); |
| at::Tensor kernel_tv6 = at::empty_like(t0, options); |
| |
| prog.device_ = 0; |
| |
| int blocks = ceilDiv_( |
| ceilDiv_(t0.numel(), 128), 4); // numel / unroll factor / threads |
| prog.grid(blocks); |
| prog.block(128); |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel( |
| &prog, {t0}, {kernel_tv5, kernel_tv6}); |
| |
| GPULower gpulw(&fusion); |
| std::stringstream cdg; |
| gpulw.printKernel(cdg); |
| |
| TORCH_CHECK(at::allclose(kernel_tv5, t5), cdg.str()); |
| TORCH_CHECK(at::allclose(kernel_tv6, t6)); |
| } |
| |
| // Case 3 |
| // T2 = T1 * 0.979361 |
| // T3 = T2 * T0 |
| { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(4); |
| fusion.addInput(tv0); |
| |
| TensorView* tv1 = makeDummyTensor(4); |
| fusion.addInput(tv1); |
| |
| TensorView* tv2 = mul(tv1, new Float(.979361)); |
| TensorView* tv3 = mul(tv2, tv0); |
| |
| fusion.addOutput(tv3); |
| |
| // Lets setup to actually run |
| while (tv3->nDims() > 1) |
| tv3->merge(0); |
| tv3->split(0, 128); |
| tv3->split(0, 4); |
| |
| tv0->computeAt(tv3, 1); |
| tv1->computeAt(tv3, 1); |
| |
| tv3->axis(0)->parallelize(ParallelType::BIDx); |
| |
| for (Val* val : fusion.vals()) { |
| if (!fusion.hasInput(val) && |
| val->getValType().value() == ValType::TensorView) { |
| TensorView* tv = static_cast<TensorView*>(val); |
| |
| tv->axis(1)->parallelize(ParallelType::Unroll); |
| tv->axis(-1)->parallelize(ParallelType::TIDx); |
| } |
| } |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| at::Tensor t0 = at::randn({129, 127, 63, 65}, options); |
| at::Tensor t1 = at::rand_like(t0, options); |
| |
| auto t2 = t1.mul({0.979361}); |
| auto t3 = t2.mul(t0); |
| |
| at::Tensor kernel_tv3 = at::empty_like(t0, options); |
| |
| prog.device_ = 0; |
| |
| int blocks = ceilDiv_( |
| ceilDiv_(t0.numel(), 128), 4); // numel / unroll factor / threads |
| |
| prog.grid(blocks); |
| prog.block(128); |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, {t0, t1}, {kernel_tv3}); |
| |
| GPULower gpulw(&fusion); |
| std::stringstream cdg; |
| gpulw.printKernel(cdg); |
| |
| TORCH_CHECK(at::allclose(kernel_tv3, t3), cdg.str()); |
| } |
| |
| // Case 4 |
| // T4 = T2 - T3 |
| // T5 = T1 + T4 |
| // T6 = T5 - T0 |
| { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(4); |
| fusion.addInput(tv0); |
| |
| TensorView* tv1 = makeDummyTensor(4); |
| fusion.addInput(tv1); |
| |
| TensorView* tv2 = makeDummyTensor(4); |
| fusion.addInput(tv2); |
| |
| TensorView* tv3 = makeDummyTensor(4); |
| fusion.addInput(tv3); |
| |
| TensorView* tv4 = sub(tv2, tv3); |
| TensorView* tv5 = add(tv1, tv4); |
| TensorView* tv6 = sub(tv5, tv0); |
| |
| fusion.addOutput(tv6); |
| |
| // Lets setup to actually run |
| while (tv6->nDims() > 1) |
| tv6->merge(0); |
| tv6->split(0, 128); |
| tv6->split(0, 4); |
| |
| tv0->computeAt(tv6, 1); |
| tv1->computeAt(tv6, 1); |
| tv2->computeAt(tv6, 1); |
| tv3->computeAt(tv6, 1); |
| |
| tv6->axis(0)->parallelize(ParallelType::BIDx); |
| |
| for (Val* val : fusion.vals()) { |
| if (!fusion.hasInput(val) && |
| val->getValType().value() == ValType::TensorView) { |
| TensorView* tv = static_cast<TensorView*>(val); |
| |
| tv->axis(1)->parallelize(ParallelType::Unroll); |
| tv->axis(-1)->parallelize(ParallelType::TIDx); |
| } |
| } |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| at::Tensor t0 = at::randn({129, 127, 63, 65}, options); |
| at::Tensor t1 = at::rand_like(t0, options); |
| at::Tensor t2 = at::rand_like(t0, options); |
| at::Tensor t3 = at::rand_like(t0, options); |
| |
| auto t4 = t2.sub(t3); |
| auto t5 = t1.add(t4); |
| auto t6 = t5.sub(t0); |
| |
| at::Tensor kernel_tv6 = at::empty_like(t0, options); |
| |
| prog.device_ = 0; |
| |
| int blocks = ceilDiv_( |
| ceilDiv_(t0.numel(), 128), 4); // numel / unroll factor / threads |
| |
| prog.grid(blocks); |
| prog.block(128); |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel( |
| &prog, {t0, t1, t2, t3}, {kernel_tv6}); |
| |
| GPULower gpulw(&fusion); |
| std::stringstream cdg; |
| gpulw.printKernel(cdg); |
| |
| TORCH_CHECK(at::allclose(kernel_tv6, t6), cdg.str()); |
| } |
| } |
| |
| void testGPU_FusionScalarInputs() { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(2); |
| fusion.addInput(tv0); |
| TensorView* tv1 = makeDummyTensor(2); |
| fusion.addInput(tv1); |
| |
| Float* f0 = new Float(); |
| fusion.addInput(f0); |
| Float* f1 = new Float(); |
| fusion.addInput(f1); |
| Float* f2 = new Float(); |
| fusion.addInput(f2); |
| Float* f3 = new Float(); |
| fusion.addInput(f3); |
| Val* f4 = mul(f0, f1); |
| Val* f5 = sub(f2, f3); |
| |
| TensorView* tv2 = sub(tv1, f4); |
| TensorView* tv3 = add(tv0, f5); |
| TensorView* tv4 = mul(tv3, tv2); |
| |
| fusion.addOutput(tv4); |
| |
| // Lets setup to actually run |
| while (tv4->nDims() > 1) |
| tv4->merge(0); |
| tv4->split(0, 128); |
| tv4->split(0, 4); |
| |
| tv0->computeAt(tv4, 1); |
| tv1->computeAt(tv4, 1); |
| |
| tv4->axis(0)->parallelize(ParallelType::BIDx); |
| |
| for (Val* val : fusion.vals()) { |
| if (!fusion.hasInput(val) && |
| val->getValType().value() == ValType::TensorView) { |
| TensorView* tv = static_cast<TensorView*>(val); |
| |
| tv->axis(1)->parallelize(ParallelType::Unroll); |
| tv->axis(-1)->parallelize(ParallelType::TIDx); |
| } |
| } |
| |
| // f4 = f0 * f1 |
| // f5 = f2 - f3 |
| // t2 = t1 - f4 |
| // t3 = t0 + f5 |
| // t4 = t3 * t2 |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| float fl0 = 0.1; |
| float fl1 = -0.2; |
| float fl2 = 0.3; |
| float fl3 = -0.4; |
| float fl4 = fl0 * fl1; |
| float fl5 = fl2 - fl3; |
| |
| at::Tensor t0 = at::randn({129, 127}, options); |
| at::Tensor t1 = at::rand_like(t0, options); |
| |
| auto t2 = t1.sub(fl4); |
| auto t3 = t0.add(fl5); |
| auto t4 = t3.mul(t2); |
| |
| at::Tensor kernel_tv4 = at::empty_like(t0, options); |
| |
| prog.device_ = 0; |
| |
| int blocks = |
| ceilDiv_(ceilDiv_(t0.numel(), 128), 4); // numel / unroll factor / threads |
| |
| prog.grid(blocks); |
| prog.block(128); |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| |
| torch::jit::fuser::cuda::runTestKernel( |
| &prog, |
| {t0, |
| t1, |
| at::Scalar(fl0), |
| at::Scalar(fl1), |
| at::Scalar(fl2), |
| at::Scalar(fl3)}, |
| {kernel_tv4}); |
| |
| GPULower gpulw(&fusion); |
| std::stringstream cdg; |
| gpulw.printKernel(cdg); |
| |
| TORCH_CHECK(at::allclose(kernel_tv4, t4), cdg.str()); |
| } |
| |
| void testGPU_FusionLoopUnroll() { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(3); |
| TensorView* tv1 = makeDummyTensor(3); |
| |
| // Register your inputs |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| |
| // Do math with it; the result is a `Val*` that can be static_cast back to |
| // a TensorView |
| TensorView* tv2 = add(tv1, new Float(2.0)); |
| TensorView* tv3 = add(tv0, tv2); |
| |
| // Register your outputs |
| fusion.addOutput(tv3); |
| |
| int block_size = 16; |
| |
| tv3->merge(0, 1); |
| tv3->merge(0, 1); |
| |
| tv3->split(0, block_size); |
| tv3->split(0, 4); |
| |
| // For all inputs, computeAt the output inline; temporaries should be |
| // squeezed between them |
| tv0->computeAt(tv3, 1); |
| tv1->computeAt(tv3, 1); |
| |
| // Parallelize |
| tv2->axis(1)->parallelize(ParallelType::Unroll); |
| tv3->axis(1)->parallelize(ParallelType::Unroll); |
| tv2->axis(-1)->parallelize(ParallelType::TIDx); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
| tv3->axis(0)->parallelize(ParallelType::BIDx); |
| |
| int inp_size = 129 * 13 * 3; |
| |
| // GPULower lower(&fusion); |
| // lower.printKernel(std::cout); |
| |
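| // Each CTA covers unroll(4) * block_size(16) = 64 elements, so the grid |
| // below launches ceilDiv(inp_size, 64) CTAs |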
| prog.device_ = 0; |
| prog.grid((inp_size + 63) / 64); |
| prog.block(block_size); |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor input0 = at::rand({129, 13, 3}, options); |
| at::Tensor input1 = at::rand({129, 13, 3}, options); |
| |
| at::Tensor output = at::empty_like(input1); |
| |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, {input0, input1}, {output}); |
| |
| TORCH_CHECK(output.equal(input0.add(input1.add(2.0)))); |
| } |
| |
| /* |
| * Helper function for single op testing that generates a codegen operand |
| */ |
| |
| Val* gen_jit_operand(std::pair<ValType, DataType> desc) { |
| if (desc.first == ValType::TensorView) { |
| return makeDummyTensor(2, desc.second); |
| } else if (desc.first == ValType::Scalar) { |
| if (desc.second == DataType::Float) |
| return new Float(); |
| else if (desc.second == DataType::Int) |
| return new Int(); |
| else |
| TORCH_CHECK(false, "Not currently supported type", desc.first); |
| } else { |
| TORCH_CHECK(false, "Not currently supported type", desc.first); |
| } |
| return nullptr; |
| } |
| |
| /* |
| * Helper function for single op testing that generates an ATen operand |
| */ |
| |
| IValue gen_aten_operand( |
| std::pair<ValType, DataType> desc, |
| int blocks, |
| int threads, |
| bool rand) { |
| if (desc.first == ValType::TensorView) { |
| if (desc.second == DataType::Float) { |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| if (rand) |
| return IValue(at::rand({blocks, threads}, options)); |
| else |
| return IValue(at::empty({blocks, threads}, options)); |
| } else if (desc.second == DataType::Half) { |
| auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); |
| if (rand) |
| return IValue(at::rand({blocks, threads}, options)); |
| else |
| return IValue(at::empty({blocks, threads}, options)); |
| } else if (desc.second == DataType::Bool) { |
| if (rand) { |
| auto options = |
| at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| return IValue(at::rand({blocks, threads}, options).to(at::kBool)); |
| } else { |
| auto options = |
| at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0); |
| return IValue(at::empty({blocks, threads}, options)); |
| } |
| } else { |
| TORCH_CHECK(false, "Not currently supported type", desc.second); |
| } |
| } else if (desc.first == ValType::Scalar) { |
| if (desc.second == DataType::Float) |
| return IValue(at::Scalar(1.f)); |
| else if (desc.second == DataType::Int) |
| return IValue(at::Scalar(1)); |
| else |
| TORCH_CHECK(false, "Not currently supported type", desc.first); |
| } else { |
| TORCH_CHECK(false, "Not currently supported type", desc.first); |
| } |
| return nullptr; |
| } |
| |
| /* |
| * Templatized helper function to generate a single-op comparison between |
| * the JIT CUDA codegen and the ATen library. |
| */ |
| |
| using OutputPair = std::pair<ValType, DataType>; |
| template < |
| typename AtenFunc, |
| typename JitFunc, |
| typename InputTuple, |
| size_t... NumInputs> |
| void test_op( |
| int blocks, |
| int threads, |
| std::string op_str, |
| AtenFunc af, |
| JitFunc jf, |
| OutputPair op, |
| InputTuple it, |
| std::index_sequence<NumInputs...>) { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| // Generate Input JIT function Inputs and add them as Inputs to the Fusion |
| // Graph |
| std::array<Val*, sizeof...(NumInputs)> jit_inputs = { |
| gen_jit_operand(std::get<NumInputs>(it))...}; |
| std::for_each(jit_inputs.begin(), jit_inputs.end(), [&fusion](Val* v) { |
| fusion.addInput(v); |
| }); |
| TensorView* out = |
| static_cast<TensorView*>(jf(std::get<NumInputs>(jit_inputs)...)); |
| fusion.addOutput(out); |
| |
| std::for_each(jit_inputs.begin(), jit_inputs.end(), [out](Val* v) { |
| if (v->getValType() == ValType::TensorView) |
| static_cast<TensorView*>(v)->computeAt(out, -1); |
| }); |
| out->axis(0)->parallelize(ParallelType::BIDx); |
| out->axis(-1)->parallelize(ParallelType::TIDx); |
| |
| prog.device_ = 0; |
| prog.grid(blocks); |
| prog.block(threads); |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| |
| std::array<IValue, sizeof...(NumInputs)> aten_inputs = {gen_aten_operand( |
| std::get<NumInputs>(it), blocks, threads, /*rand*/ true)...}; |
| const at::ArrayRef<IValue> aten_inputs_ivalues(aten_inputs); |
| |
| at::Tensor output = |
| gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor(); |
| std::vector<at::Tensor> output_vect = {output}; |
| cudaDeviceSynchronize(); |
| if (fusion.hasRNG()) |
| at::manual_seed(0); |
| torch::jit::fuser::cuda::runTestKernel( |
| &prog, aten_inputs_ivalues, output_vect); |
| cudaDeviceSynchronize(); |
| |
| if (fusion.hasRNG()) |
| at::manual_seed(0); |
| at::Tensor ref_output = af(aten_inputs); |
| cudaDeviceSynchronize(); // This sync shouldn't be necessary. |
| |
| std::function<std::string()> aten_inputs_to_str = |
| [&aten_inputs]() -> std::string { |
| int input_cnt = 1; |
| std::stringstream ss; |
| std::for_each( |
| aten_inputs.begin(), aten_inputs.end(), [&input_cnt, &ss](IValue& iv) { |
| ss << "\nINPUT" << input_cnt++ << ": " << iv.toTensor(); |
| }); |
| return ss.str(); |
| }; |
| |
| at::Tensor diff; |
| if (output.scalar_type() == at::kBool) { |
| diff = at::eq(output, ref_output); |
| } else { |
| diff = at::sub(output, ref_output); |
| } |
| |
| TORCH_CHECK( |
| (output.scalar_type() == at::kBool |
| ? output.equal(ref_output) |
| : |
| // The absolute tolerance was raised from 1e-08 to 1e-07 to |
| // allow the remainder function to pass. |
| output.allclose(ref_output, /*rtol*/ 1e-05, /*atol*/ 1e-07)), |
| "\nOp Type: -- ", |
| op_str, |
| " -- had a mismatch.", |
| aten_inputs_to_str(), |
| "\nJIT: ", |
| output, |
| "\nREF: ", |
| ref_output, |
| "\nDIFF: ", |
| diff, |
| "\n"); |
| } |
| |
| /* |
| * Templatized helper function that uses variadic templates to process a |
| * variable-length input tuple of different operand types. |
| */ |
| template <typename AtenFunc, typename JitFunc, typename InputTuple> |
| void test_op( |
| int blocks, |
| int threads, |
| std::string op_str, |
| AtenFunc af, |
| JitFunc jf, |
| OutputPair op, |
| InputTuple it) { |
| static constexpr auto size = std::tuple_size<InputTuple>::value; |
| test_op( |
| blocks, |
| threads, |
| op_str, |
| af, |
| jf, |
| op, |
| it, |
| std::make_index_sequence<size>{}); |
| } |
| |
| void testGPU_FusionUnaryOps() { |
| using OpTuple = |
| std::tuple<at::Tensor (*)(const at::Tensor&), UnaryOpType, std::string>; |
| |
| // [Note: explicit tuple type for uniform initialization list] |
| // Tuple type must be explicitly specified for each uniform initialization |
| // list within the vector to make this code compatible with some old env |
| // which we still need to support. eg. gcc 5.4 + cuda 9.2. |
| std::vector<OpTuple> ops{ |
| OpTuple{at::abs, UnaryOpType::Abs, "abs"}, |
| OpTuple{at::acos, UnaryOpType::Acos, "acos"}, |
| OpTuple{at::asin, UnaryOpType::Asin, "asin"}, |
| OpTuple{at::atan, UnaryOpType::Atan, "atan"}, |
| // There does not appear to be an appropriate ATen function for atanh |
| // OpTuple{at::atanh, UnaryOpType::Atanh, "atanh" }, |
| OpTuple{at::ceil, UnaryOpType::Ceil, "ceil"}, |
| OpTuple{at::cos, UnaryOpType::Cos, "cos"}, |
| OpTuple{at::cosh, UnaryOpType::Cosh, "cosh"}, |
| OpTuple{at::erf, UnaryOpType::Erf, "erf"}, |
| OpTuple{at::erfc, UnaryOpType::Erfc, "erfc"}, |
| OpTuple{at::exp, UnaryOpType::Exp, "exp"}, |
| OpTuple{at::expm1, UnaryOpType::Expm1, "expm1"}, |
| OpTuple{at::floor, UnaryOpType::Floor, "floor"}, |
| OpTuple{at::frac, UnaryOpType::Frac, "frac"}, |
| OpTuple{at::gelu, UnaryOpType::Gelu, "gelu"}, |
| OpTuple{at::lgamma, UnaryOpType::Lgamma, "lgamma"}, |
| OpTuple{at::log, UnaryOpType::Log, "log"}, |
| OpTuple{at::log10, UnaryOpType::Log10, "log10"}, |
| OpTuple{at::log1p, UnaryOpType::Log1p, "log1p"}, |
| OpTuple{at::log2, UnaryOpType::Log2, "log2"}, |
| OpTuple{at::neg, UnaryOpType::Neg, "neg"}, |
| OpTuple{at::reciprocal, UnaryOpType::Reciprocal, "reciprocal"}, |
| OpTuple{at::relu, UnaryOpType::Relu, "relu"}, |
| OpTuple{at::round, UnaryOpType::Round, "round"}, |
| OpTuple{at::rsqrt, UnaryOpType::Rsqrt, "rsqrt"}, |
| OpTuple{at::sigmoid, UnaryOpType::Sigmoid, "sigmoid"}, |
| OpTuple{at::sin, UnaryOpType::Sin, "sin"}, |
| OpTuple{at::sinh, UnaryOpType::Sinh, "sinh"}, |
| OpTuple{at::sqrt, UnaryOpType::Sqrt, "sqrt"}, |
| OpTuple{at::tan, UnaryOpType::Tan, "tan"}, |
| OpTuple{at::tanh, UnaryOpType::Tanh, "tanh"}, |
| OpTuple{at::trunc, UnaryOpType::Trunc, "trunc"}}; |
| |
| std::for_each(ops.begin(), ops.end(), [](OpTuple& op) { |
| test_op( |
| /*blocks*/ 640, |
| /*threads*/ 64, |
| /*name*/ std::get<2>(op), |
| /*Aten Func */ |
| [&op](std::array<IValue, 1>& vals) { |
| return std::get<0>(op)(vals[0].toTensor()); |
| }, |
| /*JIT Func */ |
| [&op](Val* in1) -> Val* { return unaryOp(std::get<1>(op), in1); }, |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Float), |
| /*Inputs Tuple*/ |
| std::make_tuple(std::make_pair(ValType::TensorView, DataType::Float))); |
| }); |
| |
| test_op( |
| /*blocks*/ 128, |
| /*threads*/ 64, |
| /*name*/ "rand_like", |
| /*Aten Func */ |
| [](std::array<IValue, 1>& vals) { |
| return at::rand_like(vals[0].toTensor()); |
| }, |
| /*JIT Func */ |
| [](Val* in1) -> Val* { return unaryOp(UnaryOpType::RandLike, in1); }, |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Float), |
| /*Inputs Tuple*/ |
| std::make_tuple(std::make_pair(ValType::TensorView, DataType::Float))); |
| } |
| |
| void testGPU_FusionBinaryOps() { |
| using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&); |
| using OpTuple = std::tuple<AtenFuncSig, BinaryOpType, std::string>; |
| |
| // see [Note: explicit tuple type for uniform initialization list] |
| std::vector<OpTuple> logic_ops{OpTuple{at::eq, BinaryOpType::Eq, "eq"}, |
| OpTuple{at::ge, BinaryOpType::GE, "ge"}, |
| OpTuple{at::gt, BinaryOpType::GT, "gt"}, |
| OpTuple{at::le, BinaryOpType::LE, "le"}, |
| OpTuple{at::lt, BinaryOpType::LT, "lt"}, |
| OpTuple{at::ne, BinaryOpType::NE, "ne"}}; |
| |
| std::for_each(logic_ops.begin(), logic_ops.end(), [](OpTuple& op) { |
| test_op( |
| /*blocks*/ 640, |
| /*threads*/ 64, |
| /*name*/ std::get<2>(op), |
| /*Aten Func */ |
| [&op](std::array<IValue, 2>& vals) { |
| return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor()); |
| }, |
| /*JIT Func */ |
| [&op](Val* in1, Val* in2) -> Val* { |
| return binaryOp(std::get<1>(op), in1, in2); |
| }, |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Bool), |
| /*Inputs Tuple*/ |
| std::make_tuple( |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::TensorView, DataType::Float))); |
| }); |
| |
| // see [Note: explicit tuple type for uniform initialization list] |
| std::vector<OpTuple> math_ops{ |
| OpTuple{at::atan2, BinaryOpType::Atan2, "atan2"}, |
| OpTuple{at::div, BinaryOpType::Div, "div"}, |
| OpTuple{at::fmod, BinaryOpType::Fmod, "fmod"}, |
| OpTuple{at::max, BinaryOpType::Max, "max"}, |
| OpTuple{at::min, BinaryOpType::Min, "min"}, |
| OpTuple{at::mul, BinaryOpType::Mul, "mul"}, |
| OpTuple{at::pow, BinaryOpType::Pow, "pow"}, |
| // NOTE: Remainder does not match the ATen impl exactly |
| // despite using an identical function. |
| OpTuple{at::remainder, BinaryOpType::Remainder, "remainder"}, |
| }; |
| |
| std::for_each(math_ops.begin(), math_ops.end(), [](OpTuple& op) { |
| test_op( |
| /*blocks*/ 640, |
| /*threads*/ 64, |
| /*name*/ std::get<2>(op), |
| /*Aten Func */ |
| [&op](std::array<IValue, 2>& vals) { |
| return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor()); |
| }, |
| /*JIT Func */ |
| [&op](Val* in1, Val* in2) -> Val* { |
| return binaryOp(std::get<1>(op), in1, in2); |
| }, |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Float), |
| /*Inputs Tuple*/ |
| std::make_tuple( |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::TensorView, DataType::Float))); |
| }); |
| |
| test_op( |
| /*blocks*/ 640, |
| /*threads*/ 64, |
| /*name*/ "add_alpha", |
| /*Aten Func */ |
| [](std::array<IValue, 3>& vals) { |
| return at::add( |
| vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar()); |
| }, |
| /*JIT Func */ static_cast<Val* (*)(Val*, Val*, Val*)>(&add_alpha), |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Float), |
| /*Inputs Tuple*/ |
| std::make_tuple( |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::Scalar, DataType::Float))); |
| test_op( |
| /*blocks*/ 640, |
| /*threads*/ 64, |
| /*name*/ "sub_alpha", |
| /*Aten Func */ |
| [](std::array<IValue, 3>& vals) { |
| return at::sub( |
| vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar()); |
| }, |
| /*JIT Func */ static_cast<Val* (*)(Val*, Val*, Val*)>(&sub_alpha), |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Float), |
| /*Inputs Tuple*/ |
| std::make_tuple( |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::Scalar, DataType::Float))); |
| } |
| |
| void testGPU_FusionTernaryOps() { |
| test_op( |
| /*blocks*/ 640, |
| /*threads*/ 64, |
| /*name*/ "clamp", |
| /*Aten Func */ |
| [](std::array<IValue, 1>& vals) { |
| return at::clamp(vals[0].toTensor(), 0.f, 1.f); |
| }, |
| /*JIT Func */ |
| [](Val* in1) -> Val* { |
| return clamp(in1, new Float(0.f), new Float(1.f)); |
| }, |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Float), |
| /*Inputs Tuple*/ |
| std::make_tuple(std::make_pair(ValType::TensorView, DataType::Float))); |
| test_op( |
| /*blocks*/ 640, |
| /*threads*/ 64, |
| /*name*/ "threshold", |
| /*Aten Func */ |
| [](std::array<IValue, 1>& vals) { |
| return at::threshold(vals[0].toTensor(), 0.f, 1.f); |
| }, |
| /*JIT Func */ |
| [](Val* in1) -> Val* { |
| return threshold(in1, new Float(0.f), new Float(1.f)); |
| }, |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Float), |
| /*Inputs Tuple*/ |
| std::make_tuple(std::make_pair(ValType::TensorView, DataType::Float))); |
| test_op( |
| /*blocks*/ 640, |
| /*threads*/ 64, |
| /*name*/ "where", |
| /*Aten Func */ |
| [](std::array<IValue, 3>& vals) { |
| return at::where( |
| vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor()); |
| }, |
| /*JIT Func */ static_cast<Val* (*)(Val*, Val*, Val*)>(&where), |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Float), |
| /*Inputs Tuple*/ |
| std::make_tuple( |
| std::make_pair(ValType::TensorView, DataType::Bool), |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::TensorView, DataType::Float))); |
| } |
| |
| void testGPU_FusionCompoundOps() { |
| test_op( |
| /*blocks*/ 640, |
| /*threads*/ 64, |
| /*name*/ "lerp", |
| /*Aten Func */ |
| [](std::array<IValue, 3>& vals) { |
| return at::lerp( |
| vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor()); |
| }, |
| /*JIT Func */ static_cast<Val* (*)(Val*, Val*, Val*)>(&lerp), |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Float), |
| /*Inputs Tuple*/ |
| std::make_tuple( |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::TensorView, DataType::Float))); |
| test_op( |
| /*blocks*/ 640, |
| /*threads*/ 64, |
| /*name*/ "addcmul", |
| /*Aten Func */ |
| [](std::array<IValue, 4>& vals) { |
| return at::addcmul( |
| vals[0].toTensor(), |
| vals[1].toTensor(), |
| vals[2].toTensor(), |
| vals[3].toScalar()); |
| }, |
| /*JIT Func */ static_cast<Val* (*)(Val*, Val*, Val*, Val*)>(&addcmul), |
| /*Output */ std::make_pair(ValType::TensorView, DataType::Float), |
| /*Inputs Tuple*/ |
| std::make_tuple( |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::TensorView, DataType::Float), |
| std::make_pair(ValType::Scalar, DataType::Float))); |
| } |
| |
| void testGPU_FusionCastOps() { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(2, DataType::Half); |
| |
| TensorView* intrm1 = castOp(DataType::Float, tv0); |
| TensorView* out = castOp(DataType::Half, intrm1); |
| |
| fusion.addInput(tv0); |
| fusion.addOutput(out); |
| tv0->computeAt(out, -1); |
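| // computeAt(out, -1) inlines the cast chain at out's innermost axis, so the |
| // FP16->FP32->FP16 round trip happens per element. |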
| |
| out->axis(0)->parallelize(ParallelType::BIDx); |
| out->axis(-1)->parallelize(ParallelType::TIDx); |
| |
| prog.device_ = 0; |
| prog.grid(1); |
| prog.block(4); |
| |
| auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); |
| |
| at::Tensor input1 = at::rand({1, 4}, options); |
| at::Tensor output = at::empty_like(input1); |
| at::Tensor ref_output = at::empty_like(input1); |
| |
| std::array<IValue, 1> inputs = {input1}; |
| const at::ArrayRef<IValue> input_ivalues(inputs); |
| std::vector<at::Tensor> outputs{output}; |
| |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, input_ivalues, outputs); |
| |
| ref_output = at::_cast_Half(at::_cast_Float(input1)); |
| |
| TORCH_CHECK( |
| output.equal(ref_output), |
| "\nOp Type: -- ", |
| "cast FP16->FP32->FP16", |
| " -- had a mismatch.\n", |
| "IN1 : ", |
| input1, |
| "\n", |
| "JIT: ", |
| output, |
| "\n", |
| "REF: ", |
| ref_output, |
| "\n"); |
| } |
| |
| // We want split/merge/reorder all tested both on and off rfactor domains. We |
| // also want compute-at into the rfactor domain, and into its consumer. |
| void testGPU_FusionRFactorReplay() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(2); |
| |
| // Register your inputs |
| fusion.addInput(tv0); |
| |
| // Do math with it; it returns a `Val*` but can be static_cast back to a |
| // TensorView |
| TensorView* tv1 = sum(tv0, {1}); |
| // tv1[I0, R1] |
| tv1->split(0, 32); |
| // tv1[I0o, I0i{32}, R1] |
| tv1->split(0, 16); |
| // tv1[I0oo, I0oi{16}, I0i{32}, R1] |
| tv1->split(-1, 8); |
| // tv1[I0oo, I0oi{16}, I0i{32}, R1o, R1i{8}] |
| tv1->split(-2, 4); |
| // tv1[I0oo, I0oi{16}, I0i{32}, R1oo, R1oi{4}, R1i{8}] |
| tv1->reorder({{0, -2}, {2, -1}, {-3, 0}, {-1, 1}}); |
| // tv1[R1oo, R1i{8}, I0oi{16}, R1oi{4}, I0oo, I0i{32}] |
| |
| tv1->merge(0); |
| tv1->merge(-2); |
| |
| // tv1[R1oo*R1i{8}, I0oi{16}, R1oi{4}, I0oo*I0i{32}] |
| TensorDomain* new_domain = TransformRFactor::runReplay(tv1->domain(), {0}); |
| // new_domain[r(R1oo*R1i{8})rf, I0oi{16}, ir1oi{4}rf, I0oo*I0i{32}] |
| |
| TensorDomain* new_domain2 = TransformRFactor::runReplay2(tv1->domain(), {0}); |
| // new_domain2[I0oi{16}, I0oo*I0i{32}, R1oi{4}] |
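| |
| // As a reading of the two comments above: runReplay builds the first-stage |
| // (rfactor) domain, where axis 0 stays a reduction and R1oi{4} becomes an |
| // iteration rfactor axis, while runReplay2 builds the second-stage domain, |
| // which drops that axis and still reduces R1oi{4}. |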
| |
| // Move rfactor axis to end, keep iter rfactor axis |
| new_domain->reorder({{0, -1}, {2, 2}}); |
| |
| // Replay CasP (consumer as producer): replay new_domain2 based on new_domain |
| // reordered_new_domain[I0oi{16}, I0oo*I0i{32}, ir1oi{4}rf, R(R1oo*R1i{8})rf] |
| TensorDomain* casp = TransformReplay::replayCasP(new_domain2, new_domain, 2); |
| // new_domain[I0oi{16}, I0oo*I0i{32}, ir1oi{4}rf, R(R1oo*R1i{8})rf] |
| // casp[I0oi{16}, I0oo*I0i{32}, R1oi{4}] |
| |
| casp->split(1, 2); |
| // casp [I0oi{16}, (I0oo*I0i{32})o, I(Ioo*I0i)i{2}, ir1oi{4} ] |
| // new_domain[I0oi{16}, I0oo*I0i{32} , ir1oi{4}rf, |
| // R(R1oo*R1i{8})rf] |
| |
| TensorDomain* pasc = TransformReplay::replayPasC(new_domain, casp, 2); |
| // pasc [I0oi{16}, (I0oo*I0i{32})o, I(Ioo*I0i)i{2}, ir1oi{4}rf, |
| // R(R1oo*R1i{8})rf] |
| |
| TORCH_CHECK( |
| new_domain->nDims() - 1 == new_domain2->nDims() && |
| casp->nDims() == new_domain2->nDims() + 1 && |
| pasc->nDims() == new_domain->nDims() + 1, |
| "Error in rfactor, number of dimensions is not correct."); |
| |
| TORCH_CHECK( |
| !casp->sameAs(new_domain2) && !pasc->sameAs(new_domain) && |
| !new_domain->sameAs(new_domain2) && |
| !tv1->domain()->sameAs(new_domain) && |
| !tv1->domain()->sameAs(new_domain2), |
| "Error in rfactor, number of dimensions is not correct."); |
| |
| auto dom = new_domain->rootDomain(); |
| TORCH_CHECK( |
| !dom[0]->isReduction() && |
| std::any_of( |
| dom.begin(), |
| dom.end(), |
| [](IterDomain* id) { return id->isReduction(); }) && |
| std::any_of( |
| dom.begin(), |
| dom.end(), |
| [](IterDomain* id) { return id->isRFactorProduct(); }), |
| "Error in rFactor, there seems to be something wrong in root domain."); |
| |
| auto dom2 = new_domain2->rootDomain(); |
| TORCH_CHECK( |
| !dom2[0]->isReduction() && |
| std::any_of( |
| dom2.begin(), |
| dom2.end(), |
| [](IterDomain* id) { return id->isReduction(); }), |
| "Error in rFactor, there seems to be something wrong in root domain."); |
| } |
| |
| // Start off simple, block on the outer dim |
| // block stride + thread all reduce + unrolling on inner dim |
| void testGPU_FusionReduction() { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(2); |
| fusion.addInput(tv0); |
| |
| // tv1[I0, R1] = tv0[I0, I1] |
| TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); |
| fusion.addOutput(tv1); |
| |
| TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); |
| |
| tv1->split(1, 128); |
| // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] |
| tv1->split(1, 4); |
| // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1] |
| |
| TensorView* tv2 = tv1->rFactor({1}); |
| // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1] |
| // tv1[I0, R1oi{4}, R1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] |
| |
| TensorView* tv3 = tv1->rFactor({1}); |
| // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1] |
| // tv3[I0, R1oi{4}, Ir1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] |
| // tv1[I0, R1i{128}] = tv3[I0, R1oi{4}, Ir1i{128}] |
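| |
| // Net effect: a three-stage reduction tree. tv2 does the partial reduction |
| // over R1oo, tv3 then reduces R1oi{4}, and tv1 performs the final R1i{128} |
| // reduction. |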
| |
| // Incrementally, can print in between for debugging |
| tv0->computeAt(tv2, 1); |
| tv2->computeAt(tv3, 1); |
| tv3->computeAt(tv1, 1); |
| |
| // Redo it all at once, because why not. |
| tv0->computeAt(tv1, 1); |
| |
| tv2->axis(2)->parallelize(ParallelType::Unroll); |
| tv1->axis(0)->parallelize(ParallelType::BIDx); |
| |
| tv1->axis(-1)->parallelize(ParallelType::TIDx); |
| tv2->axis(-1)->parallelize(ParallelType::TIDx); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
| |
| // for(auto expr : fusion.exprs(true)) |
| // std::cout<<expr<<std::endl; |
| // GPULower lower(&fusion); |
| // lower.printKernel(std::cout); |
| |
| int numel_x = 65000; |
| int numel_y = 1025; |
| |
| prog.device_ = 0; |
| prog.grid(numel_x); |
| prog.block(128); |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| at::Tensor input = at::rand({numel_x, numel_y}, options); |
| at::Tensor cg_output = at::empty({numel_x}, options); |
| |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, {input}, {cg_output}); |
| |
| auto aten_output = input.sum({1}); |
| TORCH_CHECK(aten_output.allclose(cg_output)); |
| } |
| |
| void testGPU_FusionReduction2() { |
| { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(2); |
| fusion.addInput(tv0); |
| |
| // tv1[I0, R1] = tv0[I0, I1] |
| TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); |
| |
| fusion.addOutput(tv1); |
| |
| // Switches to try some different scenarios; maybe we should iterate over all |
| // permutations. |
| bool bind_bidx = true; |
| bool bind_tidx = true; |
| bool bind_tidy = true; |
| bool bind_unroll = true; |
| |
| int numel_x = 1025; // Cannot exceed block dim max size / tidy |
| int numel_y = 129; |
| int tidx = 16; |
| int tidy = 8; |
| int unroll_factor = 4; |
| |
| int bidx = bind_tidy ? ceilDiv_(numel_x, tidy) : numel_x; |
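| // ceilDiv_ is assumed to round up, i.e. (a + b - 1) / b, so bidx blocks |
| // cover all numel_x rows when each block handles tidy of them. |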
| |
| tv1->split(1, tidx); |
| // tv1[I0, R1o, R1i{tidx}] = tv0[I0, I1] |
| |
| tv1->split(1, unroll_factor); |
| // tv1[I0, R1oo, R1oi{unroll}, R1i{tidx}] = tv0[I0, I1] |
| |
| tv1->split(0, tidy); |
| |
| TensorView* tv2 = tv1->rFactor({-3}); |
| // tv2[I0, >R1oo<, Ir1oi{unroll}, Ir1i{tidx}] |
| // tv1[I0o, I0i{tidy}, R1oi{unroll}, R1i{tidx}] |
| |
| TensorView* tv3 = tv1->rFactor({-2}); |
| // tv2[I0, >R1oo<, Ir1oi{unroll}, Ir1i{tidx}] |
| // tv3[I0, R1oi{unroll}, Ir1i{tidx}] |
| // tv1[I0o, I0i{tidy}, R1i{tidx}] |
| |
| tv0->computeAt(tv1, -2); |
| |
| if (bind_unroll) |
| tv2->axis(-2)->parallelize(ParallelType::Unroll); |
| if (bind_bidx) |
| tv1->axis(0)->parallelize(ParallelType::BIDx); |
| if (bind_tidy) |
| tv1->axis(1)->parallelize(ParallelType::TIDy); |
| |
| if (bind_tidx) { |
| tv2->axis(-1)->parallelize(ParallelType::TIDx); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
| tv1->axis(-1)->parallelize(ParallelType::TIDx); |
| } |
| |
| prog.device_ = 0; |
| prog.grid(bind_bidx ? bidx : 1); |
| prog.block(bind_tidx ? tidx : 1, bind_tidy ? tidy : 1); |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| at::Tensor input = at::rand({numel_x, numel_y}, options); |
| at::Tensor cg_output = at::empty({numel_x}, options); |
| |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, {input}, {cg_output}); |
| |
| c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(); |
| AT_CUDA_CHECK(cudaStreamSynchronize(stream)); |
| |
| auto aten_output = input.sum({1}); |
| TORCH_CHECK(aten_output.allclose(cg_output)); |
| } |
| |
| { |
| // What if Z participates in the reduction with X? |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(2); |
| fusion.addInput(tv0); |
| |
| // tv1[I0, R1] = tv0[I0, I1] |
| TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); |
| |
| fusion.addOutput(tv1); |
| |
| int numel_x = 1025; // Cannot exceed block dim max size |
| int numel_y = 129; |
| int tidx = 16; |
| int tidz = 8; |
| |
| tv1->split(1, tidz); |
| // tv1[I0, R1o, R1i{tidz}] = tv0[I0, I1] |
| |
| tv1->split(1, tidx); |
| // tv1[I0, R1oo, R1oi{tidx}, R1i{tidz}] = tv0[I0, I1] |
| |
| TensorView* tv2 = tv1->rFactor({-3}); |
| // tv2[I0, >R1oo<, Ir1oi{tidx}, Ir1i{tidz}] |
| // tv1[I0, R1oi{tidx}, R1i{tidz}] |
| |
| tv0->computeAt(tv1, -3); |
| |
| tv1->axis(0)->parallelize(ParallelType::BIDx); |
| tv1->axis(-2)->parallelize(ParallelType::TIDx); |
| tv1->axis(-1)->parallelize(ParallelType::TIDz); |
| |
| tv2->axis(-2)->parallelize(ParallelType::TIDx); |
| tv2->axis(-1)->parallelize(ParallelType::TIDz); |
| |
| prog.device_ = 0; |
| prog.grid(numel_x); |
| prog.block(tidx, 1, tidz); |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| at::Tensor input = at::rand({numel_x, numel_y}, options); |
| at::Tensor cg_output = at::empty({numel_x}, options); |
| |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, {input}, {cg_output}); |
| |
| c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(); |
| AT_CUDA_CHECK(cudaStreamSynchronize(stream)); |
| |
| auto aten_output = input.sum({1}); |
| TORCH_CHECK(aten_output.allclose(cg_output)); |
| } |
| } |
| |
| // TODO: Fix and reenable this test. |
| void testGPU_FusionReduction3() { |
| { |
| torch::jit::fuser::cuda::CudaKernel prog; |
| Fusion& fusion = *prog.fusion_; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(2); |
| TensorView* tv1 = makeDummyTensor(2); |
| |
| TensorView* tv2 = add(tv0, tv1); |
| // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1] |
| |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| |
| TensorView* tv3 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv2); |
| // tv3[I0, R1] = tv2[I0, I1] |
| |
| TensorView* tv4 = makeDummyTensor(1); |
| fusion.addInput(tv4); |
| |
| // tv5[I0] = tv3[I0, R1] * tv4[I0] |
| TensorView* tv5 = mul(tv3, tv4); |
| fusion.addOutput(tv5); |
| |
| int tidx = 16; |
| |
| // RFactor the reduction |
| tv3->split(1, tidx); |
| // tv3[I0, R1o, R1i{tidx}] = tv2[I0, I1] |
| |
| TensorView* tv6 = tv3->rFactor({-2}); |
| // tv6[I0, R1o, iR1i{tidx}] = tv2[I0, I1] |
| // tv3[I0, R1i{tidx}] = tv6[I0, R1o, iR1i{tidx}] |
| tv2->computeAt(tv6, 2); |
| |
| // Compute at inline with tv5 (only 1D) |
| tv6->computeAt(tv3, 1); |
| tv3->computeAt(tv5, 1); |
| |
| tv5->axis(0)->parallelize(ParallelType::BIDx); |
| |
| // Only the intermediate tensors need this, but it doesn't hurt to do it on |
| // the inputs (tv0, tv1, tv4) as well |
| tv2->axis(-1)->parallelize(ParallelType::TIDx); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
| tv6->axis(-1)->parallelize(ParallelType::TIDx); |
| |
| int numel_x = 1025; |
| int numel_y = 129; |
| int bidx = numel_x; |
| |
| prog.device_ = 0; |
| prog.grid(bidx); |
| prog.block(tidx); |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| at::Tensor t0 = at::rand({numel_x, numel_y}, options); |
| at::Tensor t1 = at::rand({numel_x, numel_y}, options); |
| auto t2 = t0.add(t1); |
| auto t3 = t2.sum({1}); |
| at::Tensor t4 = at::rand({numel_x}, options); |
| auto t5 = t3.mul(t4); |
| |
| at::Tensor cg_output = at::empty({numel_x}, options); |
| |
| torch::jit::fuser::cuda::compileKernel(&prog); |
| torch::jit::fuser::cuda::runTestKernel(&prog, {t0, t1, t4}, {cg_output}); |
| |
| c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(); |
| AT_CUDA_CHECK(cudaStreamSynchronize(stream)); |
| |
| TORCH_CHECK( |
| t5.allclose(cg_output), "Error of: ", t5.sub(cg_output).abs().max()); |
| } |
| } |
| |
| void testGPU_FusionSimpleBCast() { |
| { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(2); |
| TensorView* tv1 = makeDummyTensor(2); |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| |
| TensorView* tv2 = add(tv0, tv1); |
| |
| // tv3[I0, B1, B2, I1] = tv2[I0, I1] |
| TensorView* tv3 = broadcast(tv2, {false, true, true, false}); |
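| // Each `true` in the flag vector inserts a broadcast axis at that position, |
| // turning the 2D tv2 into a 4D tv3. |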
| |
| Val* tv4 = mul(tv3, makeDummyTensor(4)); |
| fusion.addOutput(tv4); |
| } |
| |
| { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(2); |
| TensorView* tv1 = makeDummyTensor(2); |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| |
| TensorView* tv2 = broadcast(tv0, {true, false, false}); |
| TensorView* tv3 = broadcast(tv1, {false, false, true}); |
| |
| TensorView* tv4 = mul(tv3, tv2); |
| fusion.addOutput(tv4); |
| |
| tv0->computeAt(tv4, -1); |
| tv1->computeAt(tv4, -1); |
| |
| // GPULower lower(&fusion); |
| // lower.printKernel(std::cout); |
| } |
| } |
| |
| } // namespace jit |
| } // namespace torch |
| #endif // #if defined(USE_CUDA) |