Print TE CUDA kernel (#42692)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/42692
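
This replaces the ad-hoc DEBUG_PRINT blocks in the TE CUDA codegen with GRAPH_DEBUG from jit_log.h, so the fused kernel source and its gpu_block_extents / gpu_thread_extents are printed through the standard JIT logging facility instead of a compile-time flag. A small file-local operator<< for std::vector<const Expr*> replaces the hand-rolled comma-separated printing loops. Following the usual jit_log convention, the dump should be obtainable at runtime by setting the environment variable PYTORCH_JIT_LOG_LEVEL to ">>cuda_codegen" (the ">>" prefix selects GRAPH_DEBUG-level messages).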

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D22986112

Pulled By: bertmaher

fbshipit-source-id: 52ec3389535c8b276858bef8c470a59aeba4946f
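
For reference, a minimal standalone sketch of the comma-joining stream helper this diff adds. The real overload below is static (file-local to cuda_codegen.cpp) and streams *expr for each const Expr* element; this sketch uses int so it compiles on its own:

  #include <iostream>
  #include <vector>

  // Join elements with ", " -- the same pattern as the new operator<<
  // for std::vector<const Expr*> in cuda_codegen.cpp.
  static std::ostream& operator<<(std::ostream& out, const std::vector<int>& v) {
    size_t i = 0;
    for (auto x : v) {
      if (i++ > 0) {
        out << ", ";
      }
      out << x;
    }
    return out;
  }

  int main() {
    std::vector<int> extents = {64, 128, 1};
    // Matches how the extents are rendered in the GRAPH_DEBUG message:
    std::cout << "block(" << extents << ")" << std::endl; // block(64, 128, 1)
  }
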
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index 25d4abb..61af626 100644
--- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -3,6 +3,7 @@
 
 #include <ATen/CUDAGeneratorImpl.h>
 #include <c10/cuda/CUDAFunctions.h>
+#include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/tensorexpr/analysis.h>
 #include <torch/csrc/jit/tensorexpr/cuda_random.h>
 #include <torch/csrc/jit/tensorexpr/eval.h>
@@ -10,8 +11,6 @@
 #include <torch/csrc/jit/tensorexpr/execution_counter.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 
-#define DEBUG_PRINT 0
-
 namespace torch {
 namespace jit {
 namespace tensorexpr {
@@ -718,6 +717,19 @@
   bool need_rewrite_ = false;
 };
 
+static std::ostream& operator<<(
+    std::ostream& out,
+    const std::vector<const Expr*>& exprs) {
+  size_t i = 0;
+  for (auto expr : exprs) {
+    if (i++ > 0) {
+      out << ", ";
+    }
+    out << *expr;
+  }
+  return out;
+}
+
 void CudaCodeGen::Initialize() {
   // TODO: handle multiple kernels.
   // TODO: handle dynamic dimension.
@@ -805,27 +817,16 @@
     }
   }
 
-#if DEBUG_PRINT
-  std::cout << "stmt: " << std::endl;
-  std::cout << oss_.str() << std::endl;
-  std::cout << "block(";
-  for (size_t i = 0; i < gpu_block_extents.size(); i++) {
-    if (i > 0) {
-      std::cout << ", ";
-    }
-    std::cout << *gpu_block_extents[i];
-  }
-  std::cout << "), thread(";
-  for (size_t i = 0; i < gpu_thread_extents.size(); i++) {
-    if (i > 0) {
-      std::cout << ", ";
-    }
-    std::cout << *gpu_thread_extents[i];
-  }
-  std::cout << ")" << std::endl;
-  ;
-#endif
-
+  GRAPH_DEBUG(
+      "Fused TE CUDA kernel:\n",
+      oss_.str(),
+      "\n",
+      "gpu_block_extents: (",
+      printer_->gpu_block_extents(),
+      ")\n",
+      "gpu_thread_extents: (",
+      printer_->gpu_thread_extents(),
+      ")");
   CompileToNVRTC(oss_.str(), func_name);
   USE_TRIGGER(cuda_codegen_created);
 }
@@ -963,11 +964,6 @@
   int major, minor;
   getMajorMinor(prop, major, minor);
 
-#if DEBUG_PRINT
-  std::cout << "major: " << major << ", "
-            << "minor: " << minor << std::endl;
-#endif
-
   // Creates the NVRTC program
   nvrtcProgram program;
   AT_CUDA_NVRTC_CHECK(nvrtc().nvrtcCreateProgram(