| // *** Tensor Expressions *** |
| // |
| // This tutorial covers basics of NNC's tensor expressions, shows basic APIs to |
| // work with them, and outlines how they are used in the overall TorchScript |
| // compilation pipeline. This doc is permanently a "work in progress" since NNC |
| // is under active development and things change fast. |
| // |
// This tutorial's code is compiled as part of the standard PyTorch build, and
// the executable can be found in `build/bin/tutorial_tensorexpr`.
| // |
| // *** What is NNC *** |
| // |
| // NNC stands for Neural Net Compiler. It is a component of TorchScript JIT |
| // and it performs on-the-fly code generation for kernels, which are often a |
| // combination of multiple aten (torch) operators. |
| // |
// When the JIT interpreter executes a TorchScript model, it automatically
// extracts subgraphs from the TorchScript IR graph for which specialized code
// can be JIT-generated. This usually improves performance, because the
// 'combined' kernel created from the subgraph avoids the memory traffic for
// intermediate results that is unavoidable when the subgraph is interpreted
// as-is, operator by operator. This optimization is often referred to as
// 'fusion'. Relatedly, the process of finding and extracting subgraphs
// suitable for NNC code generation is performed by a JIT pass called the
// 'fuser'.
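//
// For instance (an illustrative sketch, not actual compiler output), a graph
// fragment like
//   %c : Tensor = aten::mul(%a, %b)
//   %d : Tensor = aten::add(%c, %b, %one)
// could be folded by the fuser into a single prim::TensorExprGroup node whose
// subgraph is then compiled by NNC into one kernel.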
| // |
| // *** What is TE *** |
| // |
// TE stands for Tensor Expressions. TE is a commonly used approach for
// compiling kernels that perform tensor (~matrix) computations. The idea
// behind it is that operators are represented as mathematical formulas (the
// tensor expressions) describing the computation they perform. The TE engine
// can then apply mathematical simplifications and other optimizations to
// those formulas and eventually generate executable code that produces the
// same results as the original sequence of operators, but more efficiently.
| // |
// NNC's design and implementation of TE was heavily inspired by the Halide
// and TVM projects.
| #include <iostream> |
| #include <string> |
| |
| #include <c10/util/irange.h> |
| #include <torch/csrc/jit/ir/ir.h> |
| #include <torch/csrc/jit/ir/irparser.h> |
| #include <torch/csrc/jit/tensorexpr/eval.h> |
| #include <torch/csrc/jit/tensorexpr/expr.h> |
| #include <torch/csrc/jit/tensorexpr/ir.h> |
| #include <torch/csrc/jit/tensorexpr/ir_printer.h> |
| #include <torch/csrc/jit/tensorexpr/ir_simplifier.h> |
| #include <torch/csrc/jit/tensorexpr/kernel.h> |
| #include <torch/csrc/jit/tensorexpr/loopnest.h> |
| #include <torch/csrc/jit/tensorexpr/stmt.h> |
| #include <torch/csrc/jit/tensorexpr/tensor.h> |
| #include <torch/torch.h> |
| |
| using namespace torch::jit::tensorexpr; |
| |
| #ifdef TORCH_ENABLE_LLVM |
| |
| // Helper function to print a snippet from a big multi-line string |
| static void printLinesToFrom(const std::string& input_str, int from, int to); |
| |
| #endif |
| |
| int main(int argc, char* argv[]) { |
| std::cout << "*** Structure of tensor expressions and statements ***" |
| << std::endl; |
| { |
// A tensor expression is a tree of expressions. Each expression has a type,
// and that type defines what sub-expressions the current expression has.
// For instance, an expression of type 'Mul' has the node type 'kMul' and
// two sub-expressions: LHS and RHS. Each of these sub-expressions could in
// turn be a 'Mul' or some other expression.
| // |
| // Let's construct a simple TE: |
| ExprPtr lhs = alloc<IntImm>(5); |
| ExprPtr rhs = alloc<Var>("x", kInt); |
| ExprPtr mul = alloc<Mul>(lhs, rhs); |
| std::cout << "Tensor expression: " << *mul << std::endl; |
| // Prints: Tensor expression: 5 * x |
| |
| // Here we created an expression representing a 5*x computation, where x is |
| // an int variable. |
| |
// Another, probably more convenient, way to construct tensor expressions is
// to use so-called expression handles (as opposed to the raw expressions we
// used in the previous example). Expression handles overload common
// operations and allow us to express the same semantics in a more natural
// way:
| ExprHandle l = 5; |
| ExprHandle r = Var::make("x", kInt); |
| ExprHandle m = l * r; |
| std::cout << "Tensor expression: " << *m.node() << std::endl; |
| // Prints: Tensor expression: 5 * x |
| |
| // Converting from handles to raw expressions and back is easy: |
| ExprHandle handle = Var::make("x", kInt); |
| ExprPtr raw_expr_from_handle = handle.node(); |
| ExprPtr raw_expr = alloc<Var>("x", kInt); |
| ExprHandle handle_from_raw_expr = ExprHandle(raw_expr); |
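// Either form can be printed; for example, the raw expression we just
// obtained is an int variable named 'x':
std::cout << "Raw expression: " << *raw_expr << std::endl;
// Prints: Raw expression: x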
| |
// We can construct arbitrarily complex expressions using mathematical and
// logical operations, casts between various data types, and a number of
// intrinsics.
| ExprHandle a = Var::make("a", kInt); |
| ExprHandle b = Var::make("b", kFloat); |
| ExprHandle c = Var::make("c", kFloat); |
| ExprHandle x = ExprHandle(5) * a + b / (sigmoid(c) - 3.0f); |
| std::cout << "Tensor expression: " << *x.node() << std::endl; |
| // Prints: Tensor expression: float(5 * a) + b / ((sigmoid(c)) - 3.f) |
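// Casts can also be constructed explicitly. A minimal sketch, assuming the
// Cast::make helper declared in ir.h:
ExprHandle a_float = Cast::make(kFloat, a);
std::cout << "Tensor expression: " << *a_float.node() << std::endl;
// Should print something like: Tensor expression: float(a)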
| |
// The ultimate purpose of tensor expressions is to optimize tensor
// computations, and in order to represent accesses to tensor data there is
// a special kind of expression - a load.
// To construct a load we need two pieces: the base and the indices. The
// base of a load is a Buf expression, which can be thought of as a
// placeholder similar to Var, but with dimension info.
| // |
| // Let's construct a simple load: |
| BufHandle A("A", {64, 32}, kInt); |
| VarPtr i_var = alloc<Var>("i", kInt), j_var = alloc<Var>("j", kInt); |
| ExprHandle i(i_var), j(j_var); |
| ExprHandle load = Load::make(A.dtype(), A, {i, j}); |
| std::cout << "Tensor expression: " << *load.node() << std::endl; |
| // Prints: Tensor expression: A[i, j] |
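// The same expression can be built with BufHandle's convenience 'load'
// method, which is the form used later in this tutorial:
ExprHandle load2 = A.load(i, j);
std::cout << "Tensor expression: " << *load2.node() << std::endl;
// Prints: Tensor expression: A[i, j]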
| |
// Tensor Expressions are combined into Tensor Statements, which are used to
// represent the computation of a given operator or a group of operators from
// a fusion group.
| // |
| // There are three main kinds of tensor statements: |
| // - block |
| // - store |
| // - loop |
| // |
| // A Store represents a store to a single element of a tensor (or to a |
| // group of elements if it's a vectorized store). Store statements, |
| // similarly to Load expressions, have a base and indices, but on top of |
| // that they also include a value - an expression representing what needs |
| // to be stored at the given memory location. Let's create a Store stmt: |
| StmtPtr store_a = Store::make(A, {i, j}, i + j); |
| std::cout << "Store statement: " << *store_a << std::endl; |
| // Prints: Store statement: A[i, j] = i + j; |
| |
// An operator fills the entire tensor, not just a single element, and to
// represent this we need to use a For stmt: let's wrap our store stmt in two
// nested loops to express that the variables i and j need to iterate over
// some ranges.
| ForPtr loop_j_a = For::make(VarHandle(j_var), 0, 32, store_a); |
| ForPtr loop_i_a = For::make(VarHandle(i_var), 0, 64, loop_j_a); |
| |
| std::cout << "Nested for loops: " << std::endl << *loop_i_a << std::endl; |
| // Prints: |
| // Nested for loops: |
| // for (const auto i : c10::irange(64)) { |
| // for (const auto j : c10::irange(32)) { |
| // A[i, j] = i + j; |
| // } |
| // } |
| |
// A Block statement is used when we need a sequence of other statements.
// E.g. if a fusion group contains several operators, we initially build a
// separate loop nest for each of them and put them all into a common block:
| BufHandle B("B", {64, 32}, kInt); |
| StmtPtr store_b = Store::make(B, {i, j}, A.load(i, j)); |
| ForPtr loop_j_b = For::make(VarHandle(j_var), 0, 32, store_b); |
| ForPtr loop_i_b = For::make(VarHandle(i_var), 0, 64, loop_j_b); |
| |
| BlockPtr block = Block::make({loop_i_a, loop_i_b}); |
| std::cout << "Compound Block statement: " << std::endl |
| << *block << std::endl; |
| // Prints: |
| // Compound Block statement: |
| // { |
| // for (const auto i : c10::irange(64)) { |
| // for (const auto j : c10::irange(32)) { |
| // A[i, j] = i + j; |
| // } |
| // } |
| // for (const auto i : c10::irange(64)) { |
| // for (const auto j : c10::irange(32)) { |
| // B[i, j] = A[i, j]; |
| // } |
| // } |
| // } |
| |
// Manually constructing nested loops and blocks to represent a computation
// might be laborious, so instead we can use the 'Compute' API. This API
// requires us to specify the dimensions and a lambda computing a single
// element of the resulting tensor, and it returns a `Tensor` structure. This
// structure is simply a pair of the buffer that was created to represent the
// result of the computation (BufPtr) and the statement representing the
// computation itself (StmtPtr).
| Tensor C = |
| Compute("C", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { |
| return i * j; |
| }); |
| std::cout << "Stmt produced by 'Compute' API: " << std::endl |
| << *C.stmt() << std::endl; |
| // Prints: |
| // Stmt produced by 'Compute' API: |
| // for (const auto i : c10::irange(64)) { |
| // for (const auto j : c10::irange(32)) { |
| // C[i, j] = i * j; |
| // } |
| // } |
| |
// To construct statements representing computations with reductions, we can
// use the 'Reduce' API - it is similar to 'Compute' but takes a couple of
// extra arguments defining how to perform the reduction. Let's use it to
// define a simple 2D sum of C:
| Tensor D = Reduce( |
| "D", |
| {}, |
| Sum(), |
| [&](const VarHandle& i, const VarHandle& j) { return C.load(i, j); }, |
| {64, 32}); |
| std::cout << "Stmt produced by 'Reduce' API: " << std::endl |
| << *D.stmt() << std::endl; |
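// The printed statement (not reproduced here) roughly does two things: it
// initializes D to the sum's identity (0) and then accumulates C[i, j] into
// it inside the two reduction loops.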
| } |
| |
std::cout << "*** Loop nest transformations ***" << std::endl;
| { |
// When a statement for the computation is generated, we might want to
// apply some optimizations to it. Such transformations allow us to end up
// with a statement that produces the same results, but more efficiently.
//
// Let's look at a couple of transformations that are used in NNC. We will
// begin by constructing a Block statement like we did before.
| |
| Tensor C = |
| Compute("C", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { |
| return i * (j + 1); |
| }); |
| BufHandle c_buf(C.buf()); |
| Tensor D = |
| Compute("D", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { |
| return c_buf.load(i, j) - i; |
| }); |
| StmtPtr block = Block::make({C.stmt(), D.stmt()}); |
| std::cout << "Stmt produced by 'Compute' API: " << std::endl |
| << *block << std::endl; |
| // Prints: |
| // Stmt produced by 'Compute' API: |
| // { |
| // for (const auto i : c10::irange(64)) { |
| // for (const auto j : c10::irange(32)) { |
| // C[i, j] = i * (j + 1); |
| // } |
| // } |
| // for (const auto i_1 : c10::irange(64)) { |
| // for (const auto j_1 : c10::irange(32)) { |
| // D[i_1, j_1] = (C[i_1, j_1]) - i_1; |
| // } |
| // } |
| // } |
| |
// One transformation we can apply to this computation is inlining: taking
// the expression that defines the values of C and replacing the loads from
// C with it.
// To do that, we first need to create a special object called LoopNest -
// all transformations are methods of this class. To create a loop nest we
// need to provide a list of output buffers and the root statement:
| LoopNest nest(block, {D.buf()}); |
| |
| // We can always retrieve the Stmt back from LoopNest: |
| std::cout << "LoopNest root stmt: " << std::endl |
| << *nest.root_stmt() << std::endl; |
| // Prints: |
| // LoopNest root stmt: |
| // { |
| // for (const auto i : c10::irange(64)) { |
| // for (const auto j : c10::irange(32)) { |
| // C[i, j] = i * (j + 1); |
| // } |
| // } |
| // for (const auto i_1 : c10::irange(64)) { |
| // for (const auto j_1 : c10::irange(32)) { |
| // D[i_1, j_1] = (C[i_1, j_1]) - i_1; |
| // } |
| // } |
| // } |
| |
| // Now we can apply the inlining transformation: |
| nest.computeInline(C.buf()); |
| std::cout << "Stmt after inlining:" << std::endl |
| << *nest.root_stmt() << std::endl; |
| // Prints: |
| // Stmt after inlining: |
| // { |
| // for (const auto i : c10::irange(64)) { |
| // for (const auto j : c10::irange(32)) { |
| // D[i, j] = i * (j + 1) - i; |
| // } |
| // } |
| // } |
| |
| // We can also apply algebraic simplification to a statement: |
| StmtPtr simplified = IRSimplifier::simplify(nest.root_stmt()); |
| std::cout << "Stmt after simplification:" << std::endl |
| << *simplified << std::endl; |
| // Prints: |
| // Stmt after simplification: |
| // { |
| // for (const auto i : c10::irange(64)) { |
| // for (const auto j : c10::irange(32)) { |
| // D[i, j] = i * j; |
| // } |
| // } |
| // } |
| |
// Many loop nest transformations are stateless and can be applied without
// creating a LoopNest object. In fact, we plan to make all transformations
// stateless.
// splitWithTail is one such transformation: it splits the iteration space of
// a given loop into two nested loops with a given factor, plus a tail loop
// for the remainder (here 64 = 4 * 13 + 12, so the tail loop below runs 12
// iterations).
| ForPtr outer_loop = to<For>(to<Block>(simplified)->stmts().front()); |
| LoopNest::splitWithTail(outer_loop, 13); |
| // Call simplifier once more to fold some arithmetic. |
| simplified = IRSimplifier::simplify(simplified); |
| std::cout << "Stmt after splitWithTail:" << std::endl |
| << *simplified << std::endl; |
| // Prints: |
| // Stmt after splitWithTail: |
| // { |
| // for (const auto i_outer : c10::irange(4)) { |
| // for (const auto i_inner : c10::irange(13)) { |
| // for (const auto j : c10::irange(32)) { |
| // D[i_inner + 13 * i_outer, j] = i_inner * j + 13 * (i_outer * j); |
| // } |
| // } |
| // } |
| // for (const auto i_tail : c10::irange(12)) { |
| // for (const auto j : c10::irange(32)) { |
| // D[i_tail + 52, j] = i_tail * j + 52 * j; |
| // } |
| // } |
| // } |
| |
| // NNC supports a wide range of loop nest transformations, which we are not |
| // listing here. Please refer to documentation in |
| // https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/tensorexpr/loopnest.h |
| // for more details. |
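// For instance, loopnest.h also declares transformations such as
// splitWithMask, reorderAxis, fuseLoops, and vectorize.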
| } |
| |
| std::cout << "*** Codegen ***" << std::endl; |
| { |
// The ultimate goal of tensor expressions is to provide a mechanism to
// execute a given computation in the fastest possible way. So far we've
// looked at how to describe the computation we're interested in, but we
// haven't looked at how to actually execute it.
//
// Everything we've been dealing with so far was just symbols with no actual
// data associated with them. In this section we will look at how to bridge
// that gap.
| |
| // Let's start by constructing a simple computation for us to work with: |
| BufHandle A("A", {64, 32}, kInt); |
| BufHandle B("B", {64, 32}, kInt); |
| Tensor X = |
| Compute("X", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { |
| return A.load(i, j) + B.load(i, j); |
| }); |
| |
// And let's lower it to a loop nest, as we did in the previous section. We
// can pass the Tensor object directly:
| LoopNest loopnest({X}); |
| std::cout << *loopnest.root_stmt() << std::endl; |
| // Prints: |
| // { |
| // for (const auto i : c10::irange(64)) { |
| // for (const auto j : c10::irange(32)) { |
| // X[i, j] = (A[i, j]) + (B[i, j]); |
| // } |
// }
// }
| |
// Now imagine that we have two actual 64x32 tensors that we want to sum
// together. How do we pass them to the computation, and how do we carry the
// computation out?
| // |
// The CodeGen object is aimed at providing exactly that functionality.
// CodeGen is an abstract class, and concrete codegens are derived from it.
// Currently, we have three codegens:
// 1) Simple Evaluator,
// 2) LLVM Codegen for CPU,
// 3) CUDA Codegen.
// In this example we will use the Simple Evaluator, since it is available
// everywhere.
| |
// To create a codegen, we need to provide the statement - it specifies the
// computation we want to perform - and a list of placeholders and tensors
// used in the computation. The latter part is crucial since it is the only
// way for the codegen to correlate symbols in the statement with the actual
// data arrays that we will pass in when we actually perform the computation.
//
// Let's create a Simple IR Evaluator codegen for our computation:
| SimpleIREvaluator ir_eval(loopnest.root_stmt(), {A, B, X}); |
| |
// We are using the simplest codegen, and in it almost no work is done at
// construction time. Real codegens such as CUDA and LLVM perform compilation
// during that stage, so that when we're about to run the computation
// everything is ready.
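// For example, in an LLVM-enabled build (TORCH_ENABLE_LLVM) an LLVM-based
// codegen could be constructed in the same way (a sketch; it would also
// require including torch/csrc/jit/tensorexpr/llvm_codegen.h):
//   LLVMCodeGen llvm_codegen(loopnest.root_stmt(), {A, B, X});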
| |
| // Let's now create some inputs and run our computation with them: |
| std::vector<int> data_A(64 * 32, 3); // This will be the input A |
| std::vector<int> data_B(64 * 32, 5); // This will be the input B |
| std::vector<int> data_X(64 * 32, 0); // This will be used for the result |
| |
// Now let's invoke our codegen to perform the computation on our data. We
// need to provide as many arguments as the number of placeholders and
// tensors we passed at codegen construction time. The position of each
// argument in this call (these arguments are referred to as 'CallArg's in
// our codebase) determines which symbol - placeholder or tensor (referred to
// as 'BufferArg's) - from the tensor expressions we constructed it
// corresponds to.
// Thus, we will provide three arguments: data_A, data_B, and data_X. data_A
// contains data for the placeholder A, data_B - for the placeholder B, and
// data_X will hold the contents of the tensor X.
| ir_eval(data_A, data_B, data_X); |
| |
| // Let's print one of the elements from each array to verify that the |
| // computation did happen: |
| std::cout << "A[10] = " << data_A[10] << std::endl |
| << "B[10] = " << data_B[10] << std::endl |
| << "X[10] = A[10] + B[10] = " << data_X[10] << std::endl; |
| // Prints: |
| // A[10] = 3 |
| // B[10] = 5 |
| // X[10] = A[10] + B[10] = 8 |
| } |
| |
| std::cout << "*** Lowering TorchScript IR to TensorExpr IR ***" << std::endl; |
| { |
// This section requires an LLVM-enabled PyTorch build, so we have to use a
// guard:
| #ifdef TORCH_ENABLE_LLVM |
| |
// Often we would like to convert TorchScript IR to TE IR rather than
// construct the TE IR from scratch. NNC provides an API to perform such
// lowering: it takes a TorchScript graph and returns an object that can be
// used to invoke the generated kernel.
| // This API is currently used by the TorchScript JIT fuser and can also be |
| // used ahead of time to pre-compile parts of a model. |
| // |
// To get familiar with this API, let's start by defining a simple
// TorchScript graph:
| const auto graph_string = R"IR( |
| graph(%A : Float(5, 3, strides=[3, 1], device=cpu), |
| %B : Float(5, 3, strides=[3, 1], device=cpu)): |
| %AB : Float(5, 3, strides=[3, 1]) = aten::mul(%A, %B) |
| %one : int = prim::Constant[value=1]() |
| %AAB : Float(5, 3, strides=[3, 1]) = aten::mul(%A, %AB) |
| %AAB_plus_B: Float(5, 3, strides=[3, 1]) = aten::add(%AAB, %B, %one) |
| return (%AAB_plus_B))IR"; |
| auto graph = std::make_shared<torch::jit::Graph>(); |
| parseIR(graph_string, &*graph); |
| |
| // This graph defines a simple computation of A*A*B + B where A and B are |
| // input 5x3 tensors. |
| |
// To lower this TorchScript graph to TE, we just need to create a
// TensorExprKernel object. Its constructor builds the corresponding TE IR
// and compiles it for the given backend (in this example, for CPU using the
// LLVM compiler).
| TensorExprKernel kernel(graph); |
| |
| // We can retrieve the generated TE stmt from the kernel object: |
| StmtPtr kernel_stmt = kernel.getCodeGenStmt(); |
| std::cout << "TE Stmt constructed from TorchScript: " << std::endl |
| << *kernel_stmt << std::endl; |
| // Prints: |
| // TE Stmt constructed from TorchScript: |
| // { |
| // for (const auto v : c10::irange(5)) { |
| // for (const auto _tail_tail : c10::irange(3)) { |
| // aten_add[_tail_tail + 3 * v] = (tA[_tail_tail + 3 * v]) * |
| // ((tA[_tail_tail + 3 * v]) * (tB[_tail_tail + 3 * v])) + |
| // (tB[_tail_tail + 3 * v]); |
| // } |
| // } |
| // } |
| |
// We can also examine the generated LLVM IR and assembly code:
| std::cout << "Generated LLVM IR: " << std::endl; |
| auto ir_str = kernel.getCodeText("ir"); |
| printLinesToFrom(ir_str, 15, 20); |
| // Prints: |
| // Generated LLVM IR: |
| // %9 = bitcast float* %2 to <8 x float>* |
| // %10 = load <8 x float>, <8 x float>* %9 ... |
| // %11 = bitcast float* %5 to <8 x float>* |
| // %12 = load <8 x float>, <8 x float>* %11 ... |
| // %13 = fmul <8 x float> %10, %12 |
| // %14 = fmul <8 x float> %10, %13 |
| |
| std::cout << "Generated assembly: " << std::endl; |
| auto asm_str = kernel.getCodeText("asm"); |
| printLinesToFrom(asm_str, 10, 15); |
| // Prints: |
| // Generated assembly: |
| // vmulps %ymm1, %ymm0, %ymm2 |
| // vfmadd213ps %ymm1, %ymm0, %ymm2 |
| // vmovups %ymm2, (%rax) |
| // vmovss 32(%rcx), %xmm0 |
| // vmovss 32(%rdx), %xmm1 |
| // vmulss %xmm1, %xmm0, %xmm2 |
| |
| // We can also execute the generated kernel: |
| auto A = |
| at::ones({5, 3}, torch::TensorOptions(torch::kCPU).dtype(at::kFloat)) * |
| 2.0; |
| auto B = |
| at::ones({5, 3}, torch::TensorOptions(torch::kCPU).dtype(at::kFloat)) * |
| 3.0; |
| std::vector<at::Tensor> inputs = {A, B}; |
| std::vector<torch::IValue> stack = torch::fmap<torch::IValue>(inputs); |
| kernel.run(stack); |
| auto R = stack[0].toTensor(); |
| |
// Let's print one of the elements from the result tensor to verify that the
// computation did happen and was correct (A*A*B + B = 2*2*3 + 3 = 15):
| std::cout << "R[2][2] = " << R[2][2] << std::endl; |
| // Prints: |
| // R[2][2] = 15 |
| // [ CPUFloatType{} ] |
| #endif |
| } |
| return 0; |
| } |
| |
| void printLinesToFrom(const std::string& input_str, int from, int to) { |
| std::istringstream f(input_str); |
| std::string s; |
| int idx = 0; |
| while (getline(f, s)) { |
| if (idx > from) { |
| std::cout << s << "\n"; |
| } |
| if (idx++ > to) { |
| break; |
| } |
| } |
| } |