Minor improvements cherry-pick (#12973)

Summary:
* Re-enable functions that were previously disabled for ROCm (ROCm 252)
* Fixes for topk with fp16 (ROCm 270)
* HIP requires kernel launches to be explicitly templated so that a non-const argument can be passed to a const kernel parameter (ROCm 281); see the sketch below
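
As an illustration of the third point, a minimal sketch with hypothetical names (not code from this patch; the actual change here is the inverse_indices_kernel<scalar_t><<< rewrite in disabled_features.yaml):

    #include <hip/hip_runtime.h>

    template <typename scalar_t>
    __global__ void copy_kernel(const scalar_t* src, scalar_t* dst, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) dst[i] = src[i];
    }

    void launch(float* src, float* dst, int n) {
      // src is non-const here while the kernel parameter is const scalar_t*;
      // hipcc needed the template argument spelled out for this launch,
      // whereas nvcc deduces it from an untemplated copy_kernel<<<...>>> call.
      copy_kernel<float><<<(n + 255) / 256, 256>>>(src, dst, n);
    }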

For attention: bddppq, ezyang

Full set of PyTorch/Caffe2 tests on ROCm here: https://github.com/ROCmSoftwarePlatform/pytorch/pull/283
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12973

Differential Revision: D10516072

Pulled By: bddppq

fbshipit-source-id: 833b3de1544dfa4886a34e2b5ea53d77b6f0ba9e
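
Note on the LookupTable.cu change below: the warp width is 32 on NVIDIA GPUs, but an AMD GCN wavefront is 64 lanes, so WARP_SIZE is switched on __HIP_PLATFORM_HCC__. A minimal, hypothetical sketch of code written against that constant (not part of this patch):

    #ifdef __HIP_PLATFORM_HCC__
    constexpr int WARP_SIZE = 64;
    #else
    constexpr int WARP_SIZE = 32;
    #endif

    // Warp-level sum using the legacy __shfl_down intrinsic, which is
    // available in both pre-9.0-style CUDA (as in THC-era code) and HIP.
    __device__ float warp_sum(float val) {
      for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
        val += __shfl_down(val, offset, WARP_SIZE);
      return val;
    }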
diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu
index a33c90c..5b62280 100644
--- a/aten/src/ATen/native/cuda/Unique.cu
+++ b/aten/src/ATen/native/cuda/Unique.cu
@@ -11,8 +11,6 @@
 namespace at {
 namespace native{
 
-#ifndef __HIP_PLATFORM_HCC__
-
 namespace {
 template <typename scalar_t>
 __global__ void inverse_indices_kernel(
@@ -157,30 +155,20 @@
   }
 } // namespace
 
-#endif
-
 std::tuple<Tensor, Tensor>
 _unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) {
-#ifndef __HIP_PLATFORM_HCC__
   return AT_DISPATCH_ALL_TYPES(self.type(), "unique", [&] {
     // The current CUDA implementation of unique always sort due to the
     // lack of hashtable implementation in thrust
     return _unique_cuda_template<scalar_t>(self, return_inverse);
   });
-#else
-  AT_ERROR("unique_cuda: HIP not supported");
-#endif
 }
 
 std::tuple<Tensor, Tensor>
 _unique_dim_cuda(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) {
-  #ifndef __HIP_PLATFORM_HCC__
-    return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] {
-      return _unique_dim_cuda_template<scalar_t>(self, dim, return_inverse);
-    });
-  #else
-    AT_ERROR("unique_dim_cuda: HIP not supported");
-  #endif
+  return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] {
+    return _unique_dim_cuda_template<scalar_t>(self, dim, return_inverse);
+  });
 }
 
 }  // namespace native
diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh
index 27ec95a..59a27b1 100644
--- a/aten/src/THC/THCNumerics.cuh
+++ b/aten/src/THC/THCNumerics.cuh
@@ -209,7 +209,7 @@
   static inline __host__ __device__ at::Half round(at::Half a) { return ::round(a); }
 
   static inline __host__ __device__ at::Half frac(at::Half a) {
-    #ifdef __CUDA_ARCH__
+    #if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)
         return a - ::trunc(a);
     #else // __CUDA_ARCH__
         return a - ::floor(a);
diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh
index 71d1bc9..773232d 100644
--- a/aten/src/THC/THCTensorTopK.cuh
+++ b/aten/src/THC/THCTensorTopK.cuh
@@ -117,7 +117,7 @@
   typedef uint32_t RadixType;
 
   static inline __device__ RadixType convert(at::Half v) {
-#if CUDA_VERSION >= 8000
+#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__
     RadixType x = __half_as_ushort(v);
     RadixType mask = -((x >> 15)) | 0x8000;
     return (x ^ mask);
@@ -128,7 +128,7 @@
   }
 
   static inline __device__ at::Half deconvert(RadixType v) {
-#if CUDA_VERSION >= 8000
+#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__
     RadixType mask = ((v >> 15) - 1) | 0x8000;
     return __ushort_as_half(v ^ mask);
 #else
diff --git a/aten/src/THCUNN/LookupTable.cu b/aten/src/THCUNN/LookupTable.cu
index 9a6e33e..ff222ab 100644
--- a/aten/src/THCUNN/LookupTable.cu
+++ b/aten/src/THCUNN/LookupTable.cu
@@ -7,7 +7,11 @@
 #include "THCTensorSort.cuh"
 #include "../THC/THCTensorMathReduce.cuh"
 
+#ifdef __HIP_PLATFORM_HCC__
+const int WARP_SIZE = 64;
+#else
 const int WARP_SIZE = 32;
+#endif
 
 template
   <typename Dtype,
diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml
index 44dc23a..6d5af41 100644
--- a/tools/amd_build/disabled_features.yaml
+++ b/tools/amd_build/disabled_features.yaml
@@ -33,31 +33,12 @@
         }
       },
       {
-        "path": "aten/src/ATen/Context.cpp",
-        "s_constants": {
-          "#ifdef USE_SSE3": "#if defined(USE_SSE3) && !defined(__HIP_DEVICE_COMPILE__)"
-        }
-      },
-      {
-        "path": "aten/src/ATen/native/Distributions.h",
-        "s_constants": {
-          "scalar_cast": "static_cast"
-        }
-      },
-      {
         "path": "aten/src/ATen/native/cuda/Distributions.cu",
         "s_constants": {
           "#include <nvfunctional>": ""
         }
       },
       {
-        "path": "aten/src/THC/THCNumerics.cuh",
-        "s_constants": {
-          "#ifdef __CUDA_ARCH__": "#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)",
-          "#if CUDA_VERSION < 9000": "#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)"
-        }
-      },
-      {
         "path": "aten/src/ATen/native/cuda/RoiPooling.cu",
         "s_constants": {
           "RoiPooling2d_forward_kernel<<<": "RoiPooling2d_forward_kernel<float><<<"
@@ -118,6 +99,12 @@
             # ROCm Pytorch issue: https://github.com/ROCmSoftwarePlatform/pytorch/issues/31
             "detail::getCUDAHooks().getNumGPUs()": "1",
         }
+      },
+      {
+        "path": "aten/src/ATen/native/cuda/Unique.cu",
+        "s_constants": {
+            "inverse_indices_kernel<<<": "inverse_indices_kernel<scalar_t><<<",
+        }
       }
     ],
   "disabled_modules": [
@@ -144,8 +131,6 @@
     {
       "path": "aten/src/ATen/native/cuda/Distributions.cu",
       "functions": [
-        "_s_poisson_cuda",
-        "poisson_cuda_kernel",
         "gamma_cuda_kernel",
         "gamma_grad_cuda_kernel",
       ]
@@ -165,12 +150,6 @@
       ]
     },
     {
-      "path": "aten/src/THCUNN/LookupTable.cu",
-      "functions": [
-        "calculate_norms_and_renorm"
-      ]
-    },
-    {
       "path": "aten/src/THC/generic/THCTensor.cu",
       "functions": [
         "THCTensor_(getTextureObject)"