Minor improvements cherry-pick (#12973)
Summary:
* Re-enable functions previously disabled for ROCm (ROCm 252); the WARP_SIZE sketch below shows the kind of fix that makes this possible
* Fixes for topk on fp16 tensors (ROCm 270); the bit transform involved is sketched below
* HIP requires kernel invocations to be explicitly templated in order to pass a non-const argument as a const kernel argument (ROCm 281); a minimal illustration follows this list
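Background for the first bullet: AMD GCN hardware runs 64-lane wavefronts where NVIDIA hardware runs 32-lane warps, so warp-level code hard-coded to 32 lanes covers only half a wavefront on ROCm; that is what the LookupTable.cu WARP_SIZE hunk in this diff addresses. A minimal sketch of the pattern, assuming a generic warp-wide sum (the reduction itself is illustrative, not code from this patch):

    #ifdef __HIP_PLATFORM_HCC__
    const int WARP_SIZE = 64;  // AMD GCN wavefront width
    #else
    const int WARP_SIZE = 32;  // NVIDIA warp width
    #endif

    // Illustrative warp-wide sum: the loop trip count depends on WARP_SIZE,
    // so a hard-coded 32 would silently drop lanes 32-63 on a 64-wide
    // wavefront. (__shfl_down is the pre-CUDA-9 intrinsic, matching the era
    // of this patch.)
    __device__ inline float warpReduceSum(float val) {
      for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
        val += __shfl_down(val, offset);
      }
      return val;
    }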
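Background for the second bullet: the THCTensorTopK.cuh hunks extend an order-preserving bit transform for fp16 to HIP. The transform maps half-precision bit patterns to unsigned integers whose unsigned order matches the floating-point order, which is what the radix selection needs. A sketch of the same trick on raw bits (mirrors the patched convert/deconvert, minus the at::Half plumbing):

    #include <stdint.h>

    // Negative halves (sign bit set) get all bits flipped; non-negative
    // halves get only the sign bit flipped. Unsigned comparison of the
    // results then matches floating-point order.
    __device__ inline uint32_t toRadix(uint32_t x) {  // x = 16 raw half bits
      uint32_t mask = -((x >> 15)) | 0x8000;
      return x ^ mask;
    }

    // Inverse transform: recover the original half bit pattern.
    __device__ inline uint32_t fromRadix(uint32_t v) {
      uint32_t mask = ((v >> 15) - 1) | 0x8000;
      return v ^ mask;
    }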
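And an illustration of the third bullet (kernel and argument names here are hypothetical, not from the patch): with the hcc toolchain, template argument deduction at the launch site failed when a non-const pointer argument had to bind to a const-qualified kernel parameter; nvcc handles this fine. Spelling out the template argument, as the Unique.cu launch in this diff now does, compiles on both platforms:

    template <typename scalar_t>
    __global__ void scaleKernel(const scalar_t* in, scalar_t* out, scalar_t k) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      out[i] = in[i] * k;
    }

    void launchScale(float* in, float* out, float k, int n) {
      // nvcc deduces scalar_t = float from the arguments; hcc could not when
      // the non-const `in` had to bind to the `const scalar_t*` parameter:
      //   scaleKernel<<<(n + 255) / 256, 256>>>(in, out, k);  // fails on HIP
      // Explicit instantiation works on both platforms:
      scaleKernel<float><<<(n + 255) / 256, 256>>>(in, out, k);
    }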
For attention: bddppq ezyang
Full set of PyTorch/Caffe2 tests on ROCm here: https://github.com/ROCmSoftwarePlatform/pytorch/pull/283
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12973
Differential Revision: D10516072
Pulled By: bddppq
fbshipit-source-id: 833b3de1544dfa4886a34e2b5ea53d77b6f0ba9e
diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu
index a33c90c..5b62280 100644
--- a/aten/src/ATen/native/cuda/Unique.cu
+++ b/aten/src/ATen/native/cuda/Unique.cu
@@ -11,8 +11,6 @@
namespace at {
namespace native{
-#ifndef __HIP_PLATFORM_HCC__
-
namespace {
template <typename scalar_t>
__global__ void inverse_indices_kernel(
@@ -157,30 +155,20 @@
}
} // namespace
-#endif
-
std::tuple<Tensor, Tensor>
_unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) {
-#ifndef __HIP_PLATFORM_HCC__
return AT_DISPATCH_ALL_TYPES(self.type(), "unique", [&] {
// The current CUDA implementation of unique always sort due to the
// lack of hashtable implementation in thrust
return _unique_cuda_template<scalar_t>(self, return_inverse);
});
-#else
- AT_ERROR("unique_cuda: HIP not supported");
-#endif
}
std::tuple<Tensor, Tensor>
_unique_dim_cuda(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) {
- #ifndef __HIP_PLATFORM_HCC__
- return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] {
- return _unique_dim_cuda_template<scalar_t>(self, dim, return_inverse);
- });
- #else
- AT_ERROR("unique_dim_cuda: HIP not supported");
- #endif
+ return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] {
+ return _unique_dim_cuda_template<scalar_t>(self, dim, return_inverse);
+ });
}
} // namespace native
diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh
index 27ec95a..59a27b1 100644
--- a/aten/src/THC/THCNumerics.cuh
+++ b/aten/src/THC/THCNumerics.cuh
@@ -209,7 +209,7 @@
static inline __host__ __device__ at::Half round(at::Half a) { return ::round(a); }
static inline __host__ __device__ at::Half frac(at::Half a) {
- #ifdef __CUDA_ARCH__
+ #if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)
return a - ::trunc(a);
#else // __CUDA_ARCH__
return a - ::floor(a);
diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh
index 71d1bc9..773232d 100644
--- a/aten/src/THC/THCTensorTopK.cuh
+++ b/aten/src/THC/THCTensorTopK.cuh
@@ -117,7 +117,7 @@
typedef uint32_t RadixType;
static inline __device__ RadixType convert(at::Half v) {
-#if CUDA_VERSION >= 8000
+#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__
RadixType x = __half_as_ushort(v);
RadixType mask = -((x >> 15)) | 0x8000;
return (x ^ mask);
@@ -128,7 +128,7 @@
}
static inline __device__ at::Half deconvert(RadixType v) {
-#if CUDA_VERSION >= 8000
+#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__
RadixType mask = ((v >> 15) - 1) | 0x8000;
return __ushort_as_half(v ^ mask);
#else
diff --git a/aten/src/THCUNN/LookupTable.cu b/aten/src/THCUNN/LookupTable.cu
index 9a6e33e..ff222ab 100644
--- a/aten/src/THCUNN/LookupTable.cu
+++ b/aten/src/THCUNN/LookupTable.cu
@@ -7,7 +7,11 @@
#include "THCTensorSort.cuh"
#include "../THC/THCTensorMathReduce.cuh"
+#ifdef __HIP_PLATFORM_HCC__
+const int WARP_SIZE = 64;
+#else
const int WARP_SIZE = 32;
+#endif
template
<typename Dtype,
diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml
index 44dc23a..6d5af41 100644
--- a/tools/amd_build/disabled_features.yaml
+++ b/tools/amd_build/disabled_features.yaml
@@ -33,31 +33,12 @@
}
},
{
- "path": "aten/src/ATen/Context.cpp",
- "s_constants": {
- "#ifdef USE_SSE3": "#if defined(USE_SSE3) && !defined(__HIP_DEVICE_COMPILE__)"
- }
- },
- {
- "path": "aten/src/ATen/native/Distributions.h",
- "s_constants": {
- "scalar_cast": "static_cast"
- }
- },
- {
"path": "aten/src/ATen/native/cuda/Distributions.cu",
"s_constants": {
"#include <nvfunctional>": ""
}
},
{
- "path": "aten/src/THC/THCNumerics.cuh",
- "s_constants": {
- "#ifdef __CUDA_ARCH__": "#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)",
- "#if CUDA_VERSION < 9000": "#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)"
- }
- },
- {
"path": "aten/src/ATen/native/cuda/RoiPooling.cu",
"s_constants": {
"RoiPooling2d_forward_kernel<<<": "RoiPooling2d_forward_kernel<float><<<"
@@ -118,6 +99,12 @@
# ROCm Pytorch issue: https://github.com/ROCmSoftwarePlatform/pytorch/issues/31
"detail::getCUDAHooks().getNumGPUs()": "1",
}
+ },
+ {
+ "path": "aten/src/ATen/native/cuda/Unique.cu",
+ "s_constants": {
+ "inverse_indices_kernel<<<": "inverse_indices_kernel<scalar_t><<<",
+ }
}
],
"disabled_modules": [
@@ -144,8 +131,6 @@
{
"path": "aten/src/ATen/native/cuda/Distributions.cu",
"functions": [
- "_s_poisson_cuda",
- "poisson_cuda_kernel",
"gamma_cuda_kernel",
"gamma_grad_cuda_kernel",
]
@@ -165,12 +150,6 @@
]
},
{
- "path": "aten/src/THCUNN/LookupTable.cu",
- "functions": [
- "calculate_norms_and_renorm"
- ]
- },
- {
"path": "aten/src/THC/generic/THCTensor.cu",
"functions": [
"THCTensor_(getTextureObject)"