Split IGamma CUDA kernel into its own file to speed up compilation times. (#47401)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47401
Test Plan: Imported from OSS
Reviewed By: mruberry
Differential Revision: D24740657
Pulled By: gchanan
fbshipit-source-id: 78244dba8624ca7be8761a8f4bf1aa078602e5cc
diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu
index 6423b0f..c0efde1 100644
--- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu
+++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu
@@ -1,7 +1,6 @@
#include <ATen/Dispatch.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/cuda/Loops.cuh>
-#include <ATen/native/cuda/Math.cuh>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/BinaryOps.h>
@@ -30,17 +29,8 @@
});
}
-void igamma_kernel_cuda(TensorIterator& iter) {
- AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "igamma_cuda", [&]() {
- gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
- return calc_igamma(a, b);
- });
- });
-}
-
REGISTER_DISPATCH(smooth_l1_stub, &smooth_l1_kernel_cuda);
REGISTER_DISPATCH(mse_stub, &mse_kernel_cuda);
-REGISTER_DISPATCH(igamma_stub, &igamma_kernel_cuda);
// DO NOT ADD ANY NEW KERNELS HERE
// CUDA compilation times grow quickly. It's perfectly acceptable to have a file per kernel.
diff --git a/aten/src/ATen/native/cuda/IGammaKernel.cu b/aten/src/ATen/native/cuda/IGammaKernel.cu
new file mode 100644
index 0000000..dbdf55d
--- /dev/null
+++ b/aten/src/ATen/native/cuda/IGammaKernel.cu
@@ -0,0 +1,27 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/cuda/Math.cuh>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/BinaryOps.h>
+
+// NOTE: CUDA on Windows requires that the enclosing function
+// of a __device__ lambda not have internal linkage.
+
+namespace at { namespace native {
+
+void igamma_kernel_cuda(TensorIterator& iter) {
+
+ AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "igamma_cuda", [&]() {
+ gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
+ return calc_igamma(a, b);
+ });
+ });
+}
+
+REGISTER_DISPATCH(igamma_stub, &igamma_kernel_cuda);
+
+// DO NOT ADD ANY NEW KERNELS HERE
+// CUDA compilation times grow quickly. It's perfectly acceptable to have a file per kernel.
+
+}} // namespace at::native
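
A minimal sketch, separate from the patch itself, of the linkage rule behind the NOTE in IGammaKernel.cu: per that comment, CUDA on Windows requires that the function enclosing a __device__ lambda not have internal linkage, which is why igamma_kernel_cuda is defined at namespace scope rather than in an anonymous namespace. The function names below are hypothetical, and the sketch assumes compilation with nvcc's --extended-lambda flag:

#include <cstdio>

// Trampoline kernel that invokes a device lambda passed from the host.
template <typename F>
__global__ void run_on_device(F f) { f(); }

namespace {
// BAD (hypothetical): the anonymous namespace gives this function internal
// linkage, so on Windows nvcc rejects the extended __device__ lambda
// defined inside it. Left commented out for that reason.
// void bad_enclosing_function() {
//   run_on_device<<<1, 1>>>([] __device__ () { printf("never compiles\n"); });
// }
} // namespace

// OK: a namespace-scope function with external linkage may enclose the
// extended __device__ lambda -- the same pattern igamma_kernel_cuda uses.
void good_enclosing_function() {
  run_on_device<<<1, 1>>>([] __device__ () { printf("ok\n"); });
  cudaDeviceSynchronize();
}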