Print TE CUDA kernel (#42692)
Summary: Replace the ad-hoc `#if DEBUG_PRINT` blocks in the TE CUDA codegen with `GRAPH_DEBUG` logging, so the fused kernel source and its gpu_block/gpu_thread extents can be inspected through the standard JIT logging machinery instead of a compile-time flag.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/42692
Test Plan: Imported from OSS
Reviewed By: mruberry
Differential Revision: D22986112
Pulled By: bertmaher
fbshipit-source-id: 52ec3389535c8b276858bef8c470a59aeba4946f
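
Usage note (not part of the original commit message): since the prints now go through `GRAPH_DEBUG`, they should be enabled at runtime via the JIT logging environment variable rather than by recompiling. A sketch of the expected invocation, assuming the standard jit_log convention that the `>>` prefix selects the most verbose (GRAPH_DEBUG) level and that the logging unit is keyed off the source file name; `my_script.py` is a hypothetical driver script:

  PYTORCH_JIT_LOG_LEVEL=">>cuda_codegen" python my_script.py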
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index 25d4abb..61af626 100644
--- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -3,6 +3,7 @@
#include <ATen/CUDAGeneratorImpl.h>
#include <c10/cuda/CUDAFunctions.h>
+#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/tensorexpr/analysis.h>
#include <torch/csrc/jit/tensorexpr/cuda_random.h>
#include <torch/csrc/jit/tensorexpr/eval.h>
@@ -10,8 +11,6 @@
#include <torch/csrc/jit/tensorexpr/execution_counter.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#define DEBUG_PRINT 0
-
namespace torch {
namespace jit {
namespace tensorexpr {
@@ -718,6 +717,19 @@
bool need_rewrite_ = false;
};
+static std::ostream& operator<<(
+ std::ostream& out,
+ const std::vector<const Expr*>& exprs) {
+ size_t i = 0;
+ for (auto expr : exprs) {
+ if (i++ > 0) {
+ out << ", ";
+ }
+ out << *expr;
+ }
+ return out;
+}
+
void CudaCodeGen::Initialize() {
// TODO: handle multiple kernels.
// TODO: handle dynamic dimension.
@@ -805,27 +817,16 @@
}
}
-#if DEBUG_PRINT
- std::cout << "stmt: " << std::endl;
- std::cout << oss_.str() << std::endl;
- std::cout << "block(";
- for (size_t i = 0; i < gpu_block_extents.size(); i++) {
- if (i > 0) {
- std::cout << ", ";
- }
- std::cout << *gpu_block_extents[i];
- }
- std::cout << "), thread(";
- for (size_t i = 0; i < gpu_thread_extents.size(); i++) {
- if (i > 0) {
- std::cout << ", ";
- }
- std::cout << *gpu_thread_extents[i];
- }
- std::cout << ")" << std::endl;
- ;
-#endif
-
+ GRAPH_DEBUG(
+ "Fused TE CUDA kernel:\n",
+ oss_.str(),
+ "\n",
+ "gpu_block_extents: (",
+ printer_->gpu_block_extents(),
+ ")\n",
+ "gpu_thread_extents: (",
+ printer_->gpu_thread_extents(),
+ ")");
CompileToNVRTC(oss_.str(), func_name);
USE_TRIGGER(cuda_codegen_created);
}
@@ -963,11 +964,6 @@
int major, minor;
getMajorMinor(prop, major, minor);
-#if DEBUG_PRINT
- std::cout << "major: " << major << ", "
- << "minor: " << minor << std::endl;
-#endif
-
// Creates the NVRTC program
nvrtcProgram program;
AT_CUDA_NVRTC_CHECK(nvrtc().nvrtcCreateProgram(
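
For context, the `operator<<` overload added above joins the extent expressions with commas, which is what lets the `GRAPH_DEBUG` call print `printer_->gpu_block_extents()` directly. A minimal standalone sketch of the same comma-join pattern; the stand-in `Expr` type and the sample extent values are assumptions for illustration, not the real tensorexpr IR:

  #include <iostream>
  #include <string>
  #include <vector>

  // Hypothetical stand-in for tensorexpr::Expr; the real class is
  // printed through the IR printer, not a plain string member.
  struct Expr {
    std::string text;
  };

  static std::ostream& operator<<(std::ostream& out, const Expr& e) {
    return out << e.text;
  }

  // Same comma-join pattern as the overload added in cuda_codegen.cpp.
  static std::ostream& operator<<(
      std::ostream& out,
      const std::vector<const Expr*>& exprs) {
    size_t i = 0;
    for (auto expr : exprs) {
      if (i++ > 0) {
        out << ", ";
      }
      out << *expr;
    }
    return out;
  }

  int main() {
    Expr x{"64"}, y{"1"};
    std::vector<const Expr*> extents{&x, &y};
    std::cout << "gpu_block_extents: (" << extents << ")\n";
    // Prints: gpu_block_extents: (64, 1)
  }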