Split IGamma CUDA kernel into its own file to speed up compilation times. (#47401)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47401
Test Plan: Imported from OSS
Reviewed By: mruberry
Differential Revision: D24740657
Pulled By: gchanan
fbshipit-source-id: 78244dba8624ca7be8761a8f4bf1aa078602e5cc
diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu
index 6423b0f..c0efde1 100644
--- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu
+++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu
@@ -1,7 +1,6 @@
#include <ATen/Dispatch.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/cuda/Loops.cuh>
-#include <ATen/native/cuda/Math.cuh>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/BinaryOps.h>
@@ -30,17 +29,8 @@
});
}
-void igamma_kernel_cuda(TensorIterator& iter) {
- AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "igamma_cuda", [&]() {
- gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
- return calc_igamma(a, b);
- });
- });
-}
-
REGISTER_DISPATCH(smooth_l1_stub, &smooth_l1_kernel_cuda);
REGISTER_DISPATCH(mse_stub, &mse_kernel_cuda);
-REGISTER_DISPATCH(igamma_stub, &igamma_kernel_cuda);
// DO NOT ADD ANY NEW KERNELS HERE
// CUDA compilation times grow quickly. It's perfectly acceptable to have a file per kernel.
diff --git a/aten/src/ATen/native/cuda/IGammaKernel.cu b/aten/src/ATen/native/cuda/IGammaKernel.cu
new file mode 100644
index 0000000..dbdf55d
--- /dev/null
+++ b/aten/src/ATen/native/cuda/IGammaKernel.cu
@@ -0,0 +1,27 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/cuda/Math.cuh>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/BinaryOps.h>
+
+// NOTE: CUDA on Windows requires that the enclosing function
+// of a __device__ lambda not have internal linkage.
+
+namespace at { namespace native {
+
+void igamma_kernel_cuda(TensorIterator& iter) {
+
+ AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "igamma_cuda", [&]() {
+ gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
+ return calc_igamma(a, b);
+ });
+ });
+}
+
+REGISTER_DISPATCH(igamma_stub, &igamma_kernel_cuda);
+
+// DO NOT ADD ANY NEW KERNELS HERE
+// CUDA compilation times grow quickly. It's perfectly acceptable to have a file per kernel.
+
+}} // namespace at::native
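
A minimal sketch, separate from the patch itself, of the linkage rule behind the NOTE in IGammaKernel.cu: per that comment, CUDA on Windows requires that the function enclosing a __device__ lambda not have internal linkage, which is why igamma_kernel_cuda is defined at namespace scope rather than in an anonymous namespace. The function names below are hypothetical, and the sketch assumes compilation with nvcc's --extended-lambda flag:

#include <cstdio>

// Trampoline kernel that invokes a device lambda passed from the host.
template <typename F>
__global__ void run_on_device(F f) { f(); }

namespace {
// BAD (hypothetical): the anonymous namespace gives this function internal
// linkage, so on Windows nvcc rejects the extended __device__ lambda
// defined inside it. Left commented out for that reason.
// void bad_enclosing_function() {
//   run_on_device<<<1, 1>>>([] __device__ () { printf("never compiles\n"); });
// }
} // namespace

// OK: a namespace-scope function with external linkage may enclose the
// extended __device__ lambda -- the same pattern igamma_kernel_cuda uses.
void good_enclosing_function() {
  run_on_device<<<1, 1>>>([] __device__ () { printf("ok\n"); });
  cudaDeviceSynchronize();
}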