Print TE CUDA kernel (#42692)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/42692
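
This replaces the ad-hoc DEBUG_PRINT blocks in the TE CUDA codegen with GRAPH_DEBUG from jit_log.h, so the fused kernel source and its gpu_block_extents / gpu_thread_extents are printed through the standard JIT logging facility instead of a compile-time flag. A small file-local operator<< for std::vector<const Expr*> replaces the hand-rolled comma-separated printing loops. Following the usual jit_log convention, the dump should be obtainable at runtime by setting the environment variable PYTORCH_JIT_LOG_LEVEL to ">>cuda_codegen" (the ">>" prefix selects GRAPH_DEBUG-level messages).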

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D22986112

Pulled By: bertmaher

fbshipit-source-id: 52ec3389535c8b276858bef8c470a59aeba4946f
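
For reference, a minimal standalone sketch of the comma-joining stream helper this diff adds. The real overload below is static (file-local to cuda_codegen.cpp) and streams *expr for each const Expr* element; this sketch uses int so it compiles on its own:

  #include <iostream>
  #include <vector>

  // Join elements with ", " -- the same pattern as the new operator<<
  // for std::vector<const Expr*> in cuda_codegen.cpp.
  static std::ostream& operator<<(std::ostream& out, const std::vector<int>& v) {
    size_t i = 0;
    for (auto x : v) {
      if (i++ > 0) {
        out << ", ";
      }
      out << x;
    }
    return out;
  }

  int main() {
    std::vector<int> extents = {64, 128, 1};
    // Matches how the extents are rendered in the GRAPH_DEBUG message:
    std::cout << "block(" << extents << ")" << std::endl; // block(64, 128, 1)
  }
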
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index 25d4abb..61af626 100644
--- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -3,6 +3,7 @@
 
 #include <ATen/CUDAGeneratorImpl.h>
 #include <c10/cuda/CUDAFunctions.h>
+#include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/tensorexpr/analysis.h>
 #include <torch/csrc/jit/tensorexpr/cuda_random.h>
 #include <torch/csrc/jit/tensorexpr/eval.h>
@@ -10,8 +11,6 @@
 #include <torch/csrc/jit/tensorexpr/execution_counter.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 
-#define DEBUG_PRINT 0
-
 namespace torch {
 namespace jit {
 namespace tensorexpr {
@@ -718,6 +717,19 @@
   bool need_rewrite_ = false;
 };
 
+static std::ostream& operator<<(
+    std::ostream& out,
+    const std::vector<const Expr*>& exprs) {
+  size_t i = 0;
+  for (auto expr : exprs) {
+    if (i++ > 0) {
+      out << ", ";
+    }
+    out << *expr;
+  }
+  return out;
+}
+
 void CudaCodeGen::Initialize() {
   // TODO: handle multiple kernels.
   // TODO: handle dynamic dimension.
@@ -805,27 +817,16 @@
     }
   }
 
-#if DEBUG_PRINT
-  std::cout << "stmt: " << std::endl;
-  std::cout << oss_.str() << std::endl;
-  std::cout << "block(";
-  for (size_t i = 0; i < gpu_block_extents.size(); i++) {
-    if (i > 0) {
-      std::cout << ", ";
-    }
-    std::cout << *gpu_block_extents[i];
-  }
-  std::cout << "), thread(";
-  for (size_t i = 0; i < gpu_thread_extents.size(); i++) {
-    if (i > 0) {
-      std::cout << ", ";
-    }
-    std::cout << *gpu_thread_extents[i];
-  }
-  std::cout << ")" << std::endl;
-  ;
-#endif
-
+  GRAPH_DEBUG(
+      "Fused TE CUDA kernel:\n",
+      oss_.str(),
+      "\n",
+      "gpu_block_extents: (",
+      printer_->gpu_block_extents(),
+      ")\n",
+      "gpu_thread_extents: (",
+      printer_->gpu_thread_extents(),
+      ")");
   CompileToNVRTC(oss_.str(), func_name);
   USE_TRIGGER(cuda_codegen_created);
 }
@@ -963,11 +964,6 @@
   int major, minor;
   getMajorMinor(prop, major, minor);
 
-#if DEBUG_PRINT
-  std::cout << "major: " << major << ", "
-            << "minor: " << minor << std::endl;
-#endif
-
   // Creates the NVRTC program
   nvrtcProgram program;
   AT_CUDA_NVRTC_CHECK(nvrtc().nvrtcCreateProgram(