| #if defined(USE_CUDA) |
| #include <test/cpp/jit/test_base.h> |
| |
| #include <torch/csrc/jit/codegen/cuda/arith.h> |
| #include <torch/csrc/jit/codegen/cuda/fusion.h> |
| #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h> |
| #include <torch/csrc/jit/codegen/cuda/ir_iostream.h> |
| #include <torch/csrc/jit/codegen/cuda/kernel.h> |
| #include <torch/csrc/jit/codegen/cuda/lower2device.h> |
| #include <torch/csrc/jit/codegen/cuda/mutator.h> |
| #include <torch/csrc/jit/codegen/cuda/tensor_meta.h> |
| #include <torch/csrc/jit/codegen/cuda/transform_replay.h> |
| |
| // fuser and IR parser |
| #include <torch/csrc/jit/codegen/cuda/parser.h> |
| #include <torch/csrc/jit/ir/irparser.h> |
| |
| #include <iostream> |
| |
| // Tests go in torch::jit |
| namespace torch { |
| namespace jit { |
| |
| using namespace torch::jit::fuser; |
| |
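| // Helper: builds a rank-nDims Float TensorView whose axes are all symbolic, |
| // i.e. each IterDomain starts at 0 with an unbound Int extent. |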
| TensorView* makeDummyTensor(int nDims) { |
| std::vector<IterDomain*> dom; |
| for (int i = 0; i < nDims; i++) |
| dom.push_back(new IterDomain(new Int(0), new Int())); |
| |
| return new TensorView(new TensorDomain(dom), DataType::Float); |
| } |
| |
| // 1. Test cases are void() functions. |
| // 2. They start with the prefix `test` |
| |
| void testGPU_FusionDispatch() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f = new Float{2.f}; |
| std::stringstream ss1, ss2, ss3; |
| ss1 << f; |
| ss2 << static_cast<Val*>(f); |
| ss3 << static_cast<Statement*>(f); |
| TORCH_CHECK( |
| ss1.str().compare(ss2.str()) == 0 && ss1.str().compare(ss3.str()) == 0, |
| "Error with dispatch system where results differ by passing Float* vs Val* vs Statement*."); |
| } |
| |
| void testGPU_FusionSimpleArith() { |
| std::stringstream ss1, ss2; |
| |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f1 = new Float(1.f); |
| Float* f2 = new Float{2.f}; |
| Float* f3 = new Float(); |
| |
| // Disrupt the fusion to make sure the guard works correctly |
| { |
| Fusion fusion2; |
| FusionGuard fg(&fusion2); |
| |
| Float* f1 = new Float(1.f); |
| Float* f2 = new Float(2.f); |
| auto f3 = add(f1, f2); |
| ss2 << fusion2; |
| } |
| |
| new BinaryOp(BinaryOpType::Add, f3, f1, f2); |
| ss1 << fusion; |
| |
| TORCH_CHECK( |
| ss1.str().compare(ss2.str()) == 0, |
| "Error where explicit add nodes don't match implicit add nodes."); |
| } |
| |
| void testGPU_FusionSimpleTypePromote() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f4 = new Float{4.f}; |
| Int* i1 = new Int{3}; |
| auto f5 = add(f4, i1); |
| |
| TORCH_CHECK(f5->getDataType() == DataType::Float); |
| } |
| |
| void testGPU_FusionCastOp() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f3_test = new Float{3.f}; |
| Int* i3 = new Int{3}; |
| auto f3 = castOp(DataType::Float, i3); |
| |
| TORCH_CHECK(f3->getDataType().value() == f3_test->getDataType().value()); |
| } |
| |
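| // A minimal OptOutMutator: walks a fusion and rewrites any constant Float |
| // with value 1.0 to a fresh Float(0.0), leaving everything else untouched. |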
| class ZeroMutator : public OptOutMutator { |
| public: |
| Statement* mutate(Float* f) { |
| if (f->isConst() && *(f->value()) == 1.0) |
| return new Float(0.0); |
| return f; |
| } |
| void mutate(Fusion* f) { |
| OptOutMutator::mutate(f); |
| } |
| }; |
| |
| void testGPU_FusionMutator() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f4 = new Float{1.f}; |
| Int* i1 = new Int{3}; |
| Val* f5 = add(f4, i1); |
| ZeroMutator mutator; |
| mutator.mutate(&fusion); |
| Val* lhs = static_cast<BinaryOp*>(fusion.origin(f5))->lhs(); |
| TORCH_CHECK( |
| lhs->getValType().value() == ValType::Scalar && |
| lhs->getDataType().value() == DataType::Float); |
| Float* flhs = static_cast<Float*>(lhs); |
| |
| TORCH_CHECK(flhs->value().value() == 0.f); |
| } |
| |
| void testGPU_FusionRegister() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| Float* v1 = new Float{1.f}; |
| Float* v2 = new Float{2.f}; |
| Val* v3 = binaryOp(BinaryOpType::Add, v1, v2); |
| Val* v4 = binaryOp(BinaryOpType::Add, v1, v2); |
| TORCH_CHECK(v1->name() + 1 == v2->name()); |
| TORCH_CHECK(v2->name() + 1 == v3->name()); |
| TORCH_CHECK(v3->name() + 1 == v4->name()); |
| TORCH_CHECK(fusion.origin(v3)->name() + 1 == fusion.origin(v4)->name()); |
| } |
| |
| // Dummy expr with two outputs, used only for the toposort test. |
| struct DummyExpr : public Expr { |
| ~DummyExpr() = default; |
| DummyExpr(Val* _outlhs, Val* _outrhs, Val* _lhs, Val* _rhs) |
| : Expr(ExprType::BinaryOp) // Not terribly safe... |
| { |
| addOutput(_outlhs); |
| addOutput(_outrhs); |
| addInput(_lhs); |
| addInput(_rhs); |
| this->name_ = FusionGuard::getCurFusion()->registerExpr(this); |
| } |
| DummyExpr(const DummyExpr& other) = delete; |
| DummyExpr& operator=(const DummyExpr& other) = delete; |
| DummyExpr(DummyExpr&& other) = delete; |
| DummyExpr& operator=(DummyExpr&& other) = delete; |
| }; |
| |
| void testGPU_FusionTopoSort() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| // e0: v3, v2 = dummy(v1, v0) |
| // e1: v4 = add(v3, v2) |
| // e2: v5 = add(v2, v4) |
| // e3: v6 = add(v5, v5) |
| Float* v0 = new Float{1.f}; |
| Float* v1 = new Float{2.f}; |
| Float* v2 = new Float(); |
| Float* v3 = new Float(); |
| Float* v4 = new Float(); |
| Float* v5 = new Float(); |
| Float* v6 = new Float(); |
| |
| Expr* e0 = new DummyExpr(v3, v2, v1, v0); |
| Expr* e1 = new BinaryOp(BinaryOpType::Add, v4, v3, v2); |
| Expr* e2 = new BinaryOp(BinaryOpType::Add, v5, v2, v4); |
| Expr* e3 = new BinaryOp(BinaryOpType::Add, v6, v5, v5); |
| |
| std::vector<Expr*> exprs = fusion.exprs(); |
| |
| TORCH_CHECK(exprs.size() == 4); |
| TORCH_CHECK(exprs[0] == e0); |
| TORCH_CHECK(exprs[1] == e1); |
| TORCH_CHECK(exprs[2] == e2); |
| TORCH_CHECK(exprs[3] == e3); |
| |
| fusion.addOutput(v2); |
| exprs = fusion.exprs(true); |
| TORCH_CHECK(exprs.size() == 1); |
| TORCH_CHECK(exprs[0] == e0); |
| |
| fusion.addOutput(v5); |
| exprs = fusion.exprs(true); |
| TORCH_CHECK(exprs[0] == e0); |
| TORCH_CHECK(exprs[1] == e1); |
| TORCH_CHECK(exprs[2] == e2); |
| |
| fusion.addOutput(v4); |
| exprs = fusion.exprs(true); |
| TORCH_CHECK(exprs[0] == e0); |
| TORCH_CHECK(exprs[1] == e1); |
| TORCH_CHECK(exprs[2] == e2); |
| |
| fusion.addOutput(v3); |
| exprs = fusion.exprs(true); |
| TORCH_CHECK(exprs[0] == e0); |
| TORCH_CHECK(exprs[1] == e1); |
| TORCH_CHECK(exprs[2] == e2); |
| |
| fusion.addOutput(v6); |
| exprs = fusion.exprs(true); |
| TORCH_CHECK(exprs.size() == 4); |
| TORCH_CHECK(exprs[0] == e0); |
| TORCH_CHECK(exprs[1] == e1); |
| TORCH_CHECK(exprs[2] == e2); |
| TORCH_CHECK(exprs[3] == e3); |
| |
| TORCH_CHECK(fusion.origin(v2)->name() == 0); |
| TORCH_CHECK(fusion.origin(v3)->name() == 0); |
| TORCH_CHECK(fusion.origin(v4)->name() == 1); |
| TORCH_CHECK(fusion.origin(v5)->name() == 2); |
| TORCH_CHECK(fusion.origin(v6)->name() == 3); |
| } |
| |
| void testGPU_FusionTensor() { |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| auto tensor = at::randn({2, 3, 4, 5}, options); |
| auto sizes = tensor.sizes().vec(); |
| auto tensor_type = TensorType::create(tensor); |
| |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| auto fuser_tensor = new TensorView(tensor_type); |
| TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); |
| TORCH_CHECK(fuser_tensor->domain() != nullptr); |
| } |
| |
| void testGPU_FusionTensorContiguity() { |
| { |
| // NCHW memory layout |
| auto tensor = at::randn({2, 3, 4, 5}); |
| auto sizes = tensor.sizes().vec(); |
| auto strides = tensor.strides().vec(); |
| TensorContiguity t_c(sizes, strides); |
| TORCH_CHECK(t_c.rank() == 4); |
| TORCH_CHECK(t_c.getBroadcastDims().size() == 0); |
| for (int i = 0; i < 4; i++) { |
| TORCH_CHECK(!t_c.isBroadcastDim(i)); |
| if (i < 3) { |
| TORCH_CHECK(t_c.canCollapseToHigher(i)); |
| } |
| } |
| } |
| |
| { |
| // NHWC memory layout |
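| // Logical shape is still NCHW (2, 3, 4, 5); the channels-last strides are |
| // N: H*W*C = 60, C: 1, H: W*C = 15, W: C = 3. Only the innermost pair of |
| // dims (2 and 3) remains collapsible. |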
| TensorContiguity t_c({2, 3, 4, 5}, {60, 1, 15, 3}); |
| TORCH_CHECK(t_c.rank() == 4); |
| TORCH_CHECK(t_c.getBroadcastDims().size() == 0); |
| for (int i = 0; i < 4; i++) { |
| TORCH_CHECK(!t_c.isBroadcastDim(i)); |
| if (i < 3) { |
| TORCH_CHECK((t_c.canCollapseToHigher(i) ^ (i != 2))); |
| } |
| } |
| } |
| |
| { |
| // NHWC memory layout with broadcast |
| TensorContiguity t_c({2, 3, 4, 5}, {120, 0, 30, 3}); |
| TORCH_CHECK(t_c.rank() == 4); |
| auto b_dims = t_c.getBroadcastDims(); |
| TORCH_CHECK(b_dims.size() == 1 && b_dims[0] == 1); |
| for (int i = 0; i < 4; i++) { |
| TORCH_CHECK(!(t_c.isBroadcastDim(i)) ^ (i == 1)); |
| if (i < 3) { |
| TORCH_CHECK(!(t_c.canCollapseToHigher(i))); |
| } |
| } |
| } |
| |
| { |
| // contiguity across size-1 dimension |
| auto tensor = at::randn({4, 1, 4}); |
| auto sizes = tensor.sizes().vec(); |
| auto strides = tensor.strides().vec(); |
| auto dim = sizes.size(); |
| TensorContiguity t_c(sizes, strides); |
| TORCH_CHECK(t_c.rank() == sizes.size()); |
| auto b_dims = t_c.getBroadcastDims(); |
| TORCH_CHECK(b_dims.size() == 0); |
| TORCH_CHECK(t_c.getFCD() == 2); |
| TORCH_CHECK(t_c.hasContiguousFCD()); |
| for (int i = 0; i < dim; i++) { |
| TORCH_CHECK(!t_c.isBroadcastDim(i)); |
| if (i < dim - 1) { |
| TORCH_CHECK(t_c.canCollapseToHigher(i)); |
| } |
| } |
| } |
| |
| { |
| // no contiguity across size-1 dimension |
| auto tensor = at::randn({4, 4, 4}).split(1, 1)[0]; |
| auto sizes = tensor.sizes().vec(); |
| auto strides = tensor.strides().vec(); |
| TensorContiguity t_c(sizes, strides); |
| TORCH_CHECK(!(t_c.canCollapseToHigher(0))); |
| TORCH_CHECK((t_c.canCollapseToHigher(1))); |
| } |
| |
| { |
| // no contiguity across size-1 dimension |
| auto tensor = at::randn({4, 1, 8}).split(4, 2)[0]; |
| auto sizes = tensor.sizes().vec(); |
| auto strides = tensor.strides().vec(); |
| TensorContiguity t_c(sizes, strides); |
| TORCH_CHECK((t_c.canCollapseToHigher(0))); |
| TORCH_CHECK((!t_c.canCollapseToHigher(1))); |
| } |
| |
| { |
| // contiguity preserved across size-1 dimension (split along the outer dim) |
| auto tensor = at::randn({8, 1, 4}).split(4, 0)[0]; |
| auto sizes = tensor.sizes().vec(); |
| auto strides = tensor.strides().vec(); |
| TensorContiguity t_c(sizes, strides); |
| TORCH_CHECK((t_c.canCollapseToHigher(0))); |
| TORCH_CHECK((t_c.canCollapseToHigher(1))); |
| } |
| |
| { |
| // test merge |
| TensorContiguity t_c_l({4, 4, 4}, {16, 4, 1}); |
| TensorContiguity t_c_r({4, 4, 4}, {16, 4, 1}); |
| t_c_l.merge(t_c_r); |
| TORCH_CHECK((t_c_l.isIdentical(t_c_r))); |
| } |
| |
| { |
| TensorContiguity t_c_l({4, 4, 4, 4}, {16, 0, 4, 1}); |
| TensorContiguity t_c_r({4, 4, 4, 4}, {64, 16, 4, 1}); |
| t_c_l.merge(t_c_r); |
| TORCH_CHECK(t_c_l.getFCD() == 3); |
| TORCH_CHECK(t_c_l.getAxisByStride(0) == 0); |
| } |
| |
| { |
| // NHWC + NCHW |
| TensorContiguity t_c_l({4, 4, 4, 4}, {64, 16, 4, 1}); |
| TensorContiguity t_c_r({4, 4, 4, 4}, {64, 1, 16, 4}); |
| t_c_l.merge(t_c_r); |
| TORCH_CHECK(!t_c_l.hasContiguousFCD()); |
| TORCH_CHECK(t_c_l.getFCD() == -1); |
| TORCH_CHECK(t_c_l.getAxisByStride(0) == 0); |
| TORCH_CHECK(t_c_l.getAxisByStride(1) == -1); |
| TORCH_CHECK(t_c_l.getAxisByStride(2) == -1); |
| TORCH_CHECK(t_c_l.getAxisByStride(3) == -1); |
| } |
| |
| { |
| // NCHW + NCHW with broadcasting |
| TensorContiguity t_c_l({4, 4, 4, 4}, {4, 1, 4, 0}); |
| TensorContiguity t_c_r({4, 4, 4, 4}, {64, 1, 16, 4}); |
| t_c_l.merge(t_c_r); |
| TORCH_CHECK(t_c_l.getFCD() == 1); |
| TORCH_CHECK(t_c_l.getAxisByStride(0) == 0); |
| } |
| } |
| |
| void testGPU_FusionTVSplit() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv = makeDummyTensor(3); |
| |
| tv = tv->split(2, 2); |
| TORCH_CHECK(tv->nDims() == 4); |
| Expr* outer = tv->axis(2)->extent()->getOrigin(); |
| |
| TORCH_CHECK( |
| outer->getExprType().value() == ExprType::BinaryOp && |
| static_cast<BinaryOp*>(outer)->getBinaryOpType() == |
| BinaryOpType::CeilDiv && |
| static_cast<BinaryOp*>(outer)->lhs()->sameAs( |
| tv->getRootDomain()->axis(2)->extent()) && |
| static_cast<Int*>(static_cast<BinaryOp*>(outer)->rhs()) |
| ->sameAs(new Int(2))); |
| |
| IterDomain* inner = static_cast<IterDomain*>(tv->axis(3)); |
| TORCH_CHECK( |
| inner->extent()->isScalar() && |
| static_cast<Int*>(inner->extent())->isConst() && |
| static_cast<Int*>(inner->extent())->value().value() == 2); |
| } |
| |
| void testGPU_FusionTVMerge() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv = makeDummyTensor(3); |
| |
| tv = tv->merge(1); |
| Expr* axisOp = tv->axis(1)->extent()->getOrigin(); |
| |
| TORCH_CHECK( |
| tv->nDims() == 2 && axisOp->getExprType() == ExprType::BinaryOp && |
| static_cast<BinaryOp*>(axisOp)->getBinaryOpType() == BinaryOpType::Mul && |
| static_cast<BinaryOp*>(axisOp)->lhs() == |
| tv->getRootDomain()->axis(1)->extent() && |
| static_cast<BinaryOp*>(axisOp)->rhs() == |
| tv->getRootDomain()->axis(2)->extent()); |
| } |
| |
| void testGPU_FusionTVReorder() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| TensorView* dummyTensor = makeDummyTensor(3); |
| |
| std::unordered_map<int, int> shift_right{{-1, 0}}; |
| |
| std::unordered_map<int, int> shift_left{{0, -1}}; |
| |
| std::unordered_map<int, int> shift_left_2{{0, -1}, {1, 0}, {2, 1}}; |
| |
| std::unordered_map<int, int> swap{{0, 2}, {2, 0}}; |
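| // reorder() takes a map of {old axis position -> new axis position}; |
| // negative positions count from the end, so shift_right moves the last |
| // axis to the front and shift_left moves the first axis to the back. |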
| TensorView* ref = dummyTensor->clone(); |
| TensorView* tv = dummyTensor->clone(); |
| |
| TensorView* s_leftl = tv->reorder(shift_left); |
| for (int i = 0; i < tv->nDims(); i++) |
| TORCH_CHECK(ref->axis(i) == s_leftl->axis(i - 1)); |
| |
| tv = dummyTensor->clone(); |
| TensorView* s_left2 = tv->reorder(shift_left_2); |
| for (int i = 0; i < tv->nDims(); i++) |
| TORCH_CHECK(ref->axis(i) == s_left2->axis(i - 1)); |
| |
| tv = dummyTensor->clone(); |
| TensorView* s_right = tv->reorder(shift_right); |
| for (int i = 0; i < tv->nDims(); i++) |
| TORCH_CHECK(ref->axis(i - 1) == s_right->axis(i)); |
| |
| tv = dummyTensor->clone(); |
| TensorView* rswap = tv->reorder(swap); |
| TORCH_CHECK(ref->axis(0) == rswap->axis(2)); |
| TORCH_CHECK(ref->axis(2) == rswap->axis(0)); |
| TORCH_CHECK(ref->axis(1) == rswap->axis(1)); |
| } |
| |
| void testGPU_FusionEquality() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* fval1 = new Float(); |
| Float* fval1_copy = fval1; |
| Float* fval2 = new Float(); |
| Float* fone = new Float(1.0); |
| |
| TORCH_CHECK(fval1->sameAs(fval1_copy)); |
| TORCH_CHECK(!fval1->sameAs(fval2)); |
| TORCH_CHECK(!fone->sameAs(fval1)); |
| TORCH_CHECK(fone->sameAs(new Float(1.0))); |
| |
| Int* ival1 = new Int(); |
| Int* ival1_copy = ival1; |
| Int* ival2 = new Int(); |
| Int* ione = new Int(1); |
| |
| TORCH_CHECK(ival1->sameAs(ival1_copy)); |
| TORCH_CHECK(!ival1->sameAs(ival2)); |
| TORCH_CHECK(!ione->sameAs(ival1)); |
| TORCH_CHECK(ione->sameAs(new Int(1))); |
| |
| BinaryOp* add1 = new BinaryOp(BinaryOpType::Add, new Float(), fval1, ival1); |
| BinaryOp* add1_copy = |
| new BinaryOp(BinaryOpType::Add, new Float(), fval1, ival1); |
| BinaryOp* sub1 = new BinaryOp(BinaryOpType::Sub, new Float(), fval1, ival1); |
| |
| UnaryOp* neg1 = new UnaryOp(UnaryOpType::Neg, new Float(), fval1); |
| UnaryOp* neg2 = new UnaryOp(UnaryOpType::Neg, new Float(), fval2); |
| UnaryOp* neg1_copy = new UnaryOp(UnaryOpType::Neg, new Float(), fval1); |
| |
| TORCH_CHECK(add1->sameAs(add1_copy)); |
| TORCH_CHECK(!add1->sameAs(sub1)); |
| |
| TORCH_CHECK(neg1->sameAs(neg1_copy)); |
| TORCH_CHECK(!static_cast<Expr*>(neg1)->sameAs(add1)); |
| TORCH_CHECK(!neg1->sameAs(neg2)); |
| } |
| |
| void testGPU_FusionReplaceAll() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f0 = new Float(); |
| Float* f1 = new Float{1.f}; |
| Float* f2 = new Float{2.f}; |
| Float* f3 = new Float(); |
| Float* f4 = static_cast<Float*>(add(f1, f0)); |
| |
| // replace the output f4 with f3 |
| ReplaceAll::instancesOf(f4, f3); |
| // f3 should now have an origin function |
| TORCH_CHECK(fusion.origin(f3) != nullptr); |
| |
| // Should have removed f4 completely so we shouldn't have any other expr than |
| // f3 construction |
| TORCH_CHECK(fusion.exprs().size() == 1); |
| |
| // Replace constant Float's of value 1.f with 2.f |
| ReplaceAll::instancesOf(f1, f2); |
| BinaryOp* bop = static_cast<BinaryOp*>(fusion.origin(f3)); |
| // make sure the binary op (origin of f3) actually changed to 2.f |
| TORCH_CHECK(static_cast<Float*>(bop->lhs())->sameAs(new Float{2.f})); |
| } |
| |
| void testGPU_FusionParser() { |
| auto g = std::make_shared<Graph>(); |
| const auto graph0_string = R"IR( |
| graph(%0 : Float(2:1), |
| %1 : Float(2:1)): |
| %c0 : Float(2:1) = aten::mul(%0, %1) |
| %d0 : Float(2:1) = aten::mul(%c0, %0) |
| return (%d0))IR"; |
| torch::jit::parseIR(graph0_string, g.get()); |
| |
| // strides are not yet supported in the irparser. |
| for (auto val : g->block()->inputs()) { |
| if (val->isCompleteTensor()) |
| val->setType(val->type()->cast<TensorType>()->contiguous()); |
| } |
| for (auto node : g->block()->nodes()) { |
| for (auto val : node->outputs()) { |
| if (val->isCompleteTensor()) |
| val->setType(val->type()->cast<TensorType>()->contiguous()); |
| } |
| } |
| |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| fuser::cuda::parseJitIR(g, fusion); |
| |
| std::stringstream ref; |
| ref << "__device__ int ceilDiv(const int a, const int b) {\n" |
| << " return (a + b - 1) / b;\n" |
| << "}\n" |
| << "\n" |
| << "__global__ void CUDAGeneratedKernel(Tensor<float, 1> T0, Tensor<float, 1> T1, Tensor<float, 1> T3){\n" |
| << " float T2[4];\n" |
| << " if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T1.size[0] ) ) { \n" |
| << " for(size_t i64 = 0; i64 < 4; ++i64 ) {\n" |
| << " T2[ i64 ]\n" |
| << " = T0[ ( ( ( ( ( blockIdx.x * 4 ) + i64 ) * 128 ) + threadIdx.x ) * T0.stride[0] ) ]\n" |
| << " * T1[ ( ( ( ( ( blockIdx.x * 4 ) + i64 ) * 128 ) + threadIdx.x ) * T1.stride[0] ) ];\n" |
| << " }\n" |
| << " } else { \n" |
| << " for(size_t i64 = 0; i64 < 4; ++i64 ) {\n" |
| << " if ( ( ( ( ( ( blockIdx.x * 4 ) + i64 ) * 128 ) + threadIdx.x ) < T1.size[0] ) ) { \n" |
| << " T2[ i64 ]\n" |
| << " = T0[ ( ( ( ( ( blockIdx.x * 4 ) + i64 ) * 128 ) + threadIdx.x ) * T0.stride[0] ) ]\n" |
| << " * T1[ ( ( ( ( ( blockIdx.x * 4 ) + i64 ) * 128 ) + threadIdx.x ) * T1.stride[0] ) ];\n" |
| << " }\n" |
| << " }\n" |
| << " }\n" |
| << " if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T3.size[0] ) ) { \n" |
| << " for(size_t i65 = 0; i65 < 4; ++i65 ) {\n" |
| << " T3[ ( ( ( ( ( blockIdx.x * 4 ) + i65 ) * 128 ) + threadIdx.x ) * T3.stride[0] ) ]\n" |
| << " = T2[ i65 ]\n" |
| << " * T0[ ( ( ( ( ( blockIdx.x * 4 ) + i65 ) * 128 ) + threadIdx.x ) * T0.stride[0] ) ];\n" |
| << " }\n" |
| << " } else { \n" |
| << " for(size_t i65 = 0; i65 < 4; ++i65 ) {\n" |
| << " if ( ( ( ( ( ( blockIdx.x * 4 ) + i65 ) * 128 ) + threadIdx.x ) < T3.size[0] ) ) { \n" |
| << " T3[ ( ( ( ( ( blockIdx.x * 4 ) + i65 ) * 128 ) + threadIdx.x ) * T3.stride[0] ) ]\n" |
| << " = T2[ i65 ]\n" |
| << " * T0[ ( ( ( ( ( blockIdx.x * 4 ) + i65 ) * 128 ) + threadIdx.x ) * T0.stride[0] ) ];\n" |
| << " }\n" |
| << " }\n" |
| << " }\n" |
| << "}\n"; |
| |
| GPULower gpulw(&fusion); |
| std::stringstream cdg; |
| gpulw.printKernel(cdg); |
| if (ref.str().size() != cdg.str().size() || |
| ref.str().compare(cdg.str()) != 0) { |
| std::cerr |
| << " Codegen mismatch, codegen possibly changed, or is incorrect. " |
| << " \n ========= REF ========= \n" |
| << ref.str() << "\n========= RESULT ========== \n" |
| << cdg.str() << "\n=================" << std::endl; |
| TORCH_CHECK(false); |
| } |
| } |
| |
| void testGPU_FusionDependency() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| Float* f0 = new Float(0.f); |
| Float* f1 = new Float(1.f); |
| auto f2 = add(f0, f1); |
| |
| auto f3 = add(f2, f2); |
| |
| Float* f4 = new Float(4.f); |
| Float* f5 = new Float(5.f); |
| auto f6 = add(f4, f5); |
| |
| Float* f7 = new Float(7.f); |
| Float* f8 = new Float(8.f); |
| auto f9 = add(f7, f8); |
| |
| auto f10 = add(f6, f9); |
| |
| auto f11 = add(f3, f10); |
| |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f0, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f1, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f2, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f3, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f6, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f9, f11)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f0, f2)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f2, f3)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f4, f6)); |
| TORCH_CHECK(DependencyCheck::isDependencyOf(f8, f10)); |
| |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f0)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f1)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f2)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f3)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f4)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f11, f5)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f2, f0)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f3, f2)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f6, f4)); |
| TORCH_CHECK(!DependencyCheck::isDependencyOf(f10, f8)); |
| |
| std::stack<Val*> dep_chain = DependencyCheck::getDependencyChain(f0, f11); |
| TORCH_CHECK(dep_chain.top() == f11); |
| dep_chain.pop(); |
| TORCH_CHECK(dep_chain.top() == f3); |
| dep_chain.pop(); |
| TORCH_CHECK(dep_chain.top() == f2); |
| dep_chain.pop(); |
| |
| dep_chain = DependencyCheck::getDependencyChain(f6, f11); |
| TORCH_CHECK(dep_chain.top() == f11); |
| dep_chain.pop(); |
| TORCH_CHECK(dep_chain.top() == f10); |
| dep_chain.pop(); |
| |
| dep_chain = DependencyCheck::getDependencyChain(f4, f11); |
| TORCH_CHECK(dep_chain.top() == f11); |
| dep_chain.pop(); |
| TORCH_CHECK(dep_chain.top() == f10); |
| dep_chain.pop(); |
| TORCH_CHECK(dep_chain.top() == f6); |
| dep_chain.pop(); |
| |
| dep_chain = DependencyCheck::getDependencyChain(f11, f2); |
| TORCH_CHECK(dep_chain.empty()); |
| } |
| |
| void testGPU_FusionCodeGen() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(3); |
| |
| new BinaryOp(BinaryOpType::Add, tv0, new Float(0.0), new Float(1.0)); |
| TensorView* tv1 = static_cast<TensorView*>(add(tv0, new Float(2.0))); |
| TensorView* tv2 = static_cast<TensorView*>(add(tv1, new Float(3.0))); |
| |
| //[I0, I1, I2] |
| tv2 = tv2->split(0, 4); |
| //[I0o, I0i{4}, I1, I2] |
| tv2 = tv2->merge(1); |
| //[I0o, I0i{4}*I1, I2] |
| tv2 = tv2->split(-1, 2); |
| //[I0o, I0i{4}*I1, I2o, I2i{2}] |
| tv2 = tv2->reorder({{0, 1}, {1, 0}, {3, 2}}); |
| //[I0i{4}*I1, I0o, I2i{2}, I2o] |
| fusion.addOutput(tv2); |
| |
| tv0->computeAt(tv2, -1); |
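| // computeAt at position -1 inlines the whole tv0 -> tv1 -> tv2 chain at |
| // tv2's innermost loop, so T0 and T1 become single-element scratch arrays |
| // in the reference kernel below. |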
| |
| std::stringstream ref; |
| ref << "__device__ int ceilDiv(const int a, const int b) {\n" |
| << " return (a + b - 1) / b;\n" |
| << "}\n" |
| << "\n" |
| << "__global__ void CUDAGeneratedKernel(Tensor<float, 3> T2){\n" |
| << " for(size_t i82 = 0; i82 < ( 4 * T2.size[1] ); ++i82 ) {\n" |
| << " for(size_t i83 = 0; i83 < ( ceilDiv(T2.size[0], 4) ); ++i83 ) {\n" |
| << " for(size_t i84 = 0; i84 < 2; ++i84 ) {\n" |
| << " for(size_t i85 = 0; i85 < ( ceilDiv(T2.size[2], 2) ); ++i85 ) {\n" |
| << " float T0[1];\n" |
| << " if ( ( ( ( ( ( i83 * 4 ) + ( i82 / T2.size[1] ) ) < T2.size[0] ) && ( ( i82 % T2.size[1] ) < T2.size[1] ) ) && ( ( ( i85 * 2 ) + i84 ) < T2.size[2] ) ) ) { \n" |
| << " T0[ 0 ]\n" |
| << " = float(0)\n" |
| << " + float(1);\n" |
| << " }\n" |
| << " float T1[1];\n" |
| << " if ( ( ( ( ( ( i83 * 4 ) + ( i82 / T2.size[1] ) ) < T2.size[0] ) && ( ( i82 % T2.size[1] ) < T2.size[1] ) ) && ( ( ( i85 * 2 ) + i84 ) < T2.size[2] ) ) ) { \n" |
| << " T1[ 0 ]\n" |
| << " = T0[ 0 ]\n" |
| << " + float(2);\n" |
| << " }\n" |
| << " if ( ( ( ( ( ( i83 * 4 ) + ( i82 / T2.size[1] ) ) < T2.size[0] ) && ( ( i82 % T2.size[1] ) < T2.size[1] ) ) && ( ( ( i85 * 2 ) + i84 ) < T2.size[2] ) ) ) { \n" |
| << " T2[ ( ( ( i83 * 4 ) + ( i82 / T2.size[1] ) ) * T2.stride[0] ) + ( ( i82 % T2.size[1] ) * T2.stride[1] ) + ( ( ( i85 * 2 ) + i84 ) * T2.stride[2] ) ]\n" |
| << " = T1[ 0 ]\n" |
| << " + float(3);\n" |
| << " }\n" |
| << " }\n" |
| << " }\n" |
| << " }\n" |
| << " }\n" |
| << "}\n"; |
| |
| GPULower gpulw(&fusion); |
| std::stringstream cdg; |
| gpulw.printKernel(cdg); |
| |
| if (ref.str().size() != cdg.str().size() || |
| ref.str().compare(cdg.str()) != 0) { |
| std::cerr |
| << " Codegen mismatch, codegen possibly changed, or is incorrect. " |
| << " \n ========= REF ========= \n" |
| << ref.str() << "\n========= RESULT ========== \n" |
| << cdg.str() << "\n=================" << std::endl; |
| TORCH_CHECK(false); |
| } |
| |
| torch::jit::fuser::cuda::CudaKernel prog; |
| prog.device_ = 0; |
| // These can be set to anything as there are no bindings! |
| // All CTAs and threads execute the same thing. |
| prog.grid(4); |
| prog.block(32); |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor output = at::empty({16, 8, 8}, options); |
| std::vector<at::Tensor> outputs{{output}}; |
| |
| torch::jit::fuser::cuda::compileKernel(fusion, &prog); |
| torch::jit::fuser::cuda::runTestKernel(prog, {}, outputs); |
| |
| at::Tensor output_ref = at::zeros_like(output, options); |
| output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0; |
| |
| TORCH_CHECK(output_ref.equal(output)); |
| } |
| |
| void testGPU_FusionCodeGen2() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| TensorView* tv0 = makeDummyTensor(3); |
| TensorView* tv1 = makeDummyTensor(3); |
| TensorView* tv2 = static_cast<TensorView*>(add(tv1, new Float(2.0))); |
| TensorView* tv3 = static_cast<TensorView*>(add(tv0, tv2)); |
| |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| fusion.addOutput(tv3); |
| |
| //[I0, I1, I2] |
| tv3->reorder({{0, 2}, {2, 0}}); |
| //[I2, I1, I0] |
| tv3->split(-1, 4); |
| //[I2, I1, I0o, I0i{4}] |
| tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); |
| //[I0o, I0i{4}, I1, I2] |
| |
| tv0->computeAt(tv3, -1); |
| tv1->computeAt(tv3, -1); |
| |
| tv3->axis(0)->parallelize(ParallelType::BIDx); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
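| // tv3's domain is now [I0o, I0i{4}, I1, I2]; binding I0o to blockIdx.x and |
| // I2 to threadIdx.x matches the grid(4)/block(8) launch used below for the |
| // 16x8x8 inputs. |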
| |
| std::stringstream ref; |
| ref << "__device__ int ceilDiv(const int a, const int b) {\n" |
| << " return (a + b - 1) / b;\n" |
| << "}\n" |
| << "\n" |
| << "__global__ void CUDAGeneratedKernel(Tensor<float, 3> T0, Tensor<float, 3> T1, Tensor<float, 3> T3){\n" |
| << " for(size_t i33 = 0; i33 < 4; ++i33 ) {\n" |
| << " for(size_t i34 = 0; i34 < T3.size[1]; ++i34 ) {\n" |
| << " float T2[1];\n" |
| << " if ( ( ( ( blockIdx.x * 4 ) + i33 ) < T3.size[0] ) ) { \n" |
| << " T2[ 0 ]\n" |
| << " = T1[ ( ( ( blockIdx.x * 4 ) + i33 ) * T1.stride[0] ) + ( i34 * T1.stride[1] ) + ( threadIdx.x * T1.stride[2] ) ]\n" |
| << " + float(2);\n" |
| << " }\n" |
| << " if ( ( ( ( blockIdx.x * 4 ) + i33 ) < T3.size[0] ) ) { \n" |
| << " T3[ ( ( ( blockIdx.x * 4 ) + i33 ) * T3.stride[0] ) + ( i34 * T3.stride[1] ) + ( threadIdx.x * T3.stride[2] ) ]\n" |
| << " = T0[ ( ( ( blockIdx.x * 4 ) + i33 ) * T0.stride[0] ) + ( i34 * T0.stride[1] ) + ( threadIdx.x * T0.stride[2] ) ]\n" |
| << " + T2[ 0 ];\n" |
| << " }\n" |
| << " }\n" |
| << " }\n" |
| << "}\n"; |
| |
| GPULower gpulw(&fusion); |
| std::stringstream cdg; |
| gpulw.printKernel(cdg); |
| |
| if (ref.str().size() != cdg.str().size() || |
| ref.str().compare(cdg.str()) != 0) { |
| std::cerr |
| << " Codegen mismatch, codegen possibly changed, or is incorrect. " |
| << " \n ========= REF ========= \n" |
| << ref.str() << "\n========= RESULT ========== \n" |
| << cdg.str() << "\n=================" << std::endl; |
| TORCH_CHECK(false); |
| } |
| |
| torch::jit::fuser::cuda::CudaKernel prog; |
| prog.device_ = 0; |
| prog.grid(4); |
| prog.block(8); |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor input1 = at::randn({16, 8, 8}, options); |
| at::Tensor input2 = at::randn_like(input1); |
| at::Tensor output = at::empty_like(input1); |
| std::vector<at::Tensor> inputs{{input1, input2}}; |
| std::vector<at::Tensor> outputs{{output}}; |
| |
| torch::jit::fuser::cuda::compileKernel(fusion, &prog); |
| torch::jit::fuser::cuda::runTestKernel(prog, inputs, outputs); |
| |
| at::Tensor tv2_ref = input2 + 2.0; |
| at::Tensor output_ref = input1 + tv2_ref; |
| |
| TORCH_CHECK(output_ref.equal(output)); |
| } |
| |
| void testGPU_FusionSimplePWise() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| // dimensionality of the problem |
| int nDims = 3; |
| |
| // Set up symbolic sizes for the axes; should match the dimensionality of the problem |
| std::vector<IterDomain*> dom; |
| for (int i = 0; i < nDims; i++) |
| dom.push_back(new IterDomain(new Int(0), new Int())); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = new TensorView(new TensorDomain(dom), DataType::Float); |
| TensorView* tv1 = new TensorView(new TensorDomain(dom), DataType::Float); |
| |
| // Register your inputs |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| |
| // Do math with it; it returns a `Val*` but can be static_cast back to a |
| // TensorView |
| TensorView* tv2 = static_cast<TensorView*>(add(tv1, new Float(2.0))); |
| TensorView* tv3 = static_cast<TensorView*>(add(tv0, tv2)); |
| |
| // Register your outputs |
| fusion.addOutput(tv3); |
| |
| // Do transformations; remember, transformations are applied from outputs to |
| // inputs. They don't have to be in this order. |
| tv3->merge(1); |
| tv3->merge(0); |
| |
| // Split by n_threads |
| tv3->split(-1, 128 * 2); |
| tv3->split(-1, 128); |
| |
| // For all inputs, computeAt the output inline; temporaries should be |
| // squeezed between them |
| tv0->computeAt(tv3, -1); |
| tv1->computeAt(tv3, -1); |
| |
| // Parallelize TV3 |
| tv3->axis(0)->parallelize(ParallelType::BIDx); |
| tv3->axis(-2)->parallelize(ParallelType::TIDy); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
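| // tv3 is now [ceilDiv(I0*I1*I2, 256), 2, 128]; for the 64x2x128 inputs |
| // below that is 64 blocks of 2x128 = 256 threads, matching the launch |
| // configuration. |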
| |
| torch::jit::fuser::cuda::CudaKernel prog; |
| prog.device_ = 0; |
| prog.grid(64); // 64 CTAs |
| prog.block(128, 2); // 256 Threads |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor input1 = at::randn({64, 2, 128}, options); |
| at::Tensor input2 = at::randn_like(input1); |
| at::Tensor output = at::empty_like(input1); |
| std::vector<at::Tensor> inputs{{input1, input2}}; |
| std::vector<at::Tensor> outputs{{output}}; |
| |
| torch::jit::fuser::cuda::compileKernel(fusion, &prog); |
| torch::jit::fuser::cuda::runTestKernel(prog, inputs, outputs); |
| |
| at::Tensor tv2_ref = input2 + 2.0; |
| at::Tensor output_ref = input1 + tv2_ref; |
| |
| TORCH_CHECK(output_ref.equal(output)); |
| } |
| |
| void testGPU_FusionExecKernel() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(2); |
| TensorView* tv1 = makeDummyTensor(2); |
| |
| // Register your inputs |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| |
| // Do math with it; it returns a `Val*` but can be static_cast back to a |
| // TensorView |
| TensorView* tv2 = static_cast<TensorView*>(add(tv1, new Float(2.0))); |
| TensorView* tv3 = static_cast<TensorView*>(add(tv0, tv2)); |
| |
| // Register your outputs |
| fusion.addOutput(tv3); |
| |
| tv3->split(0, 4); |
| |
| // For all inputs, computeAt the output inline; temporaries should be |
| // squeezed between them |
| tv0->computeAt(tv3, 1); |
| tv1->computeAt(tv3, 1); |
| |
| // Parallelize TV3 |
| tv3->axis(0)->parallelize(ParallelType::BIDx); |
| tv2->axis(1)->parallelize(ParallelType::Unroll); |
| tv3->axis(1)->parallelize(ParallelType::Unroll); |
| tv2->axis(-1)->parallelize(ParallelType::TIDx); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
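| // tv3 is [I0o, I0i{4}, I1] after the split; I0 is 1 for the inputs below, |
| // so a single block of 128 threads (grid(1), block(128)) covers the whole |
| // 1x128 problem. |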
| |
| torch::jit::fuser::cuda::CudaKernel prog; |
| prog.device_ = 0; |
| prog.grid(1); // 1 CTA |
| prog.block(128); // 128 Threads |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor input1 = at::ones({1, 128}, options); |
| at::Tensor input2 = at::ones_like(input1); |
| at::Tensor output = at::empty_like(input1); |
| std::vector<at::Tensor> inputs{{input1, input2}}; |
| std::vector<at::Tensor> outputs{{output}}; |
| |
| torch::jit::fuser::cuda::compileKernel(fusion, &prog); |
| torch::jit::fuser::cuda::runTestKernel(prog, inputs, outputs); |
| |
| at::Tensor check = at::full({1, 128}, 4, options); |
| TORCH_CHECK(output.equal(check)); |
| } |
| |
| void testGPU_FusionForLoop() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| const auto TV0 = new TensorView( |
| new TensorDomain({new IterDomain(new Int(0), new Int(16))}), |
| DataType::Float); |
| const auto TV1 = new TensorView( |
| new TensorDomain({new IterDomain(new Int(0), new Int(16))}), |
| DataType::Float); |
| |
| fusion.addInput(TV0); |
| fusion.addInput(TV1); |
| |
| auto ID0 = new IterDomain(new Int(0), new Int(8)); |
| |
| TensorView* TV2 = static_cast<TensorView*>(add(TV0, TV1)); |
| BinaryOp* op = static_cast<BinaryOp*>(TV2->getOrigin()); |
| fusion.addOutput(TV2); |
| |
| ForLoop* fl = new ForLoop(new Int(), ID0, {op}); |
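| // fl iterates ID0 (extent 8) with a fresh Int as the loop index and the |
| // add expression as its body. |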
| std::stringstream result; |
| std::stringstream ref; |
| result << fl; |
| ref << "for(size_t i3{0}; i3 < iS{8}; ++i3 ) {\nT2[ iS{16} ] = T0[ iS{16} ] + T1[ iS{16} ]\n}"; |
| |
| if (result.str().compare(ref.str()) != 0) { |
| std::stringstream err_msg; |
| err_msg << "ForLoop printing has changed or something has gone wrong. " |
| << result.str() << "\n does not match reference: " << ref.str() |
| << std::endl; |
| TORCH_CHECK(false, err_msg.str()); |
| } |
| } |
| |
| void testGPU_FusionLoopUnroll() { |
| Fusion fusion; |
| FusionGuard fg(&fusion); |
| |
| // Set up your input tensor views |
| TensorView* tv0 = makeDummyTensor(1); |
| TensorView* tv1 = makeDummyTensor(1); |
| |
| // Register your inputs |
| fusion.addInput(tv0); |
| fusion.addInput(tv1); |
| |
| // Do math with it; it returns a `Val*` but can be static_cast back to a |
| // TensorView |
| TensorView* tv2 = static_cast<TensorView*>(add(tv1, new Float(2.0))); |
| TensorView* tv3 = static_cast<TensorView*>(add(tv0, tv2)); |
| |
| // Register your outputs |
| fusion.addOutput(tv3); |
| |
| int block_size = 16; |
| |
| tv3->split(0, block_size); |
| tv3->split(0, 4); |
| |
| // For all inputs, computeAt the output inline; temporaries should be |
| // squeezed between them |
| tv0->computeAt(tv3, 1); |
| tv1->computeAt(tv3, 1); |
| |
| // Parallelize |
| tv2->axis(1)->parallelize(ParallelType::Unroll); |
| tv3->axis(1)->parallelize(ParallelType::Unroll); |
| tv2->axis(-1)->parallelize(ParallelType::TIDx); |
| tv3->axis(-1)->parallelize(ParallelType::TIDx); |
| tv3->axis(0)->parallelize(ParallelType::BIDx); |
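| // With inp_size = 129: split(0, 16) then split(0, 4) gives |
| // [ceilDiv(ceilDiv(129, 16), 4) = 3, 4, 16], i.e. 3 blocks, an unroll |
| // factor of 4, and 16 threads, matching the launch configuration below. |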
| |
| // GPULower lower(&fusion); |
| // lower.printKernel(std::cout); |
| |
| int inp_size = 129; |
| |
| torch::jit::fuser::cuda::CudaKernel prog; |
| prog.device_ = 0; |
| prog.grid((inp_size + 63) / 64); |
| prog.block(block_size); |
| |
| auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
| |
| at::Tensor input1 = at::ones({inp_size}, options); |
| at::Tensor input2 = at::ones_like(input1); |
| |
| at::Tensor output = at::empty_like(input1); |
| std::vector<at::Tensor> inputs{{input1, input2}}; |
| std::vector<at::Tensor> outputs{{output}}; |
| |
| torch::jit::fuser::cuda::compileKernel(fusion, &prog); |
| torch::jit::fuser::cuda::runTestKernel(prog, inputs, outputs); |
| |
| at::Tensor check = at::full({inp_size}, 4, options); |
| |
| TORCH_CHECK(output.equal(check)); |
| } |
| |
| } // namespace jit |
| } // namespace torch |
| #endif // #if defined(USE_CUDA) |