Fix some NVCC warnings (Part 2) (#101383)
PR #95568 enables more NVCC warnings. However, some `.cu` files need to be modified to make the build process warning-free. PR #100823 already contains some of these fixes. This PR aims to fix the remaining warnings without breaking the codebase.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/101383
Approved by: https://github.com/zou3519
diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu
index 855b3ac..bf60739 100644
--- a/aten/src/ATen/native/cuda/MultinomialKernel.cu
+++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu
@@ -25,19 +25,22 @@
#include <curand.h>
#include <curand_kernel.h>
#include <curand_philox4x32_x.h>
+#include <type_traits>
namespace at::native {
namespace {
-template <typename T>
-inline __device__ bool _isinf(T x) { return ::isinf(x); }
-
-inline __device__ bool _isinf(c10::Half x) {
- return ::isinf(static_cast<float>(x));
-}
-inline __device__ bool _isinf(c10::BFloat16 x) {
- return ::isinf(static_cast<float>(x));
+template <
+ typename T,
+ typename = std::enable_if_t<
+ std::is_floating_point_v<T> || std::is_convertible_v<T, float>>>
+inline __device__ bool _isinf(T x) {
+ if constexpr (std::is_floating_point_v<T>) {
+ return ::isinf(x);
+ } else {
+ return ::isinf(static_cast<float>(x));
+ }
}
#define MAX_NUM_BLOCKS 200
diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu
index 3ebd7f7..d6c11fd 100644
--- a/aten/src/ATen/native/cuda/ReplicationPadding.cu
+++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu
@@ -303,46 +303,6 @@
);
}
-static inline void shapeCheck3d(
- const Tensor& input,
- int pleft, int pright,
- int ptop, int pbottom,
- int pfront, int pback) {
- TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
- "input tensor must fit into 32-bit index math");
- int numInputDims = input.dim();
-
- bool valid_dims = input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0;
- TORCH_CHECK(
- (numInputDims == 4 && input.size(0) != 0 && valid_dims) ||
- (numInputDims == 5 && valid_dims && input.size(4) != 0),
- "Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ",
- input.sizes());
-
- int planeDim = 0;
- int dimd = 1;
- int dimh = 2;
- int dimw = 3;
- if (numInputDims == 5) {
- planeDim++;
- dimd++;
- dimh++;
- dimw++;
- }
-
- const int idepth = input.size(dimd);
- const int iheight = input.size(dimh);
- const int iwidth = input.size(dimw);
- const int odepth = idepth + pfront + pback;
- const int oheight = iheight + ptop + pbottom;
- const int owidth = iwidth + pleft + pright;
- TORCH_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1,
- "input (D: ", idepth, " H: ", iheight, ", W: ", iwidth,
- ") is too small."
- " Calculated output D: ", odepth, " H: ", oheight, " W: ", owidth);
-
-}
-
static inline void shapeAndGradOutputCheck3d(
const Tensor& input,
const Tensor& gradOutput,
diff --git a/aten/src/ATen/native/cuda/Sort.cu b/aten/src/ATen/native/cuda/Sort.cu
index d83f881..e60dd72 100644
--- a/aten/src/ATen/native/cuda/Sort.cu
+++ b/aten/src/ATen/native/cuda/Sort.cu
@@ -18,7 +18,7 @@
template <typename T>
static int minimum_grid_for_occupancy(T kernel, int max_block_size) {
- int minGridSize;
+ int minGridSize = 0;
int blockSize;
C10_CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(
&minGridSize,
diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
index c2de4d8..bf7badc 100644
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@@ -33,7 +33,6 @@
namespace {
constexpr int kCUDANumThreads = 256;
-constexpr int kColwiseReduceTileSize = 32;
constexpr unsigned int kWarpSize = 32;
constexpr int vec_size = 4; //we could make it dependent on dtype, but that would lead to different results between float and low-p types
@@ -484,80 +483,6 @@
}
template <typename T, typename T_ACC>
-__global__ void GammaBetaBackwardCUDAKernel1(
- int64_t M,
- int64_t N,
- const T* dY,
- const T* X,
- const T_ACC* mean,
- const T_ACC* rstd,
- T* dg,
- T* db) {
- __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1];
- __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1];
- const int64_t j = blockIdx.x * blockDim.x + threadIdx.x;
- T_ACC dg_sum1 = 0;
- T_ACC dg_sum2 = 0;
- T_ACC db_sum1 = 0;
- T_ACC db_sum2 = 0;
- if (j < N) {
- for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) {
- const int64_t i1 = i;
- const int64_t i2 = i + blockDim.y;
- const int64_t index1 = i1 * N + j;
- const int64_t index2 = i2 * N + j;
- dg_sum1 += dg == nullptr ? T_ACC(0)
- : static_cast<T_ACC>(dY[index1]) *
- (static_cast<T_ACC>(X[index1]) - static_cast<T_ACC>(mean[i1])) *
- static_cast<T_ACC>(rstd[i1]);
- db_sum1 += db == nullptr ? T_ACC(0) : static_cast<T_ACC>(dY[index1]);
- if (i2 < M) {
- dg_sum2 += dg == nullptr ? T_ACC(0)
- : static_cast<T_ACC>(dY[index2]) *
- (static_cast<T_ACC>(X[index2]) - static_cast<T_ACC>(mean[i2])) *
- static_cast<T_ACC>(rstd[i2]);
- db_sum2 += db == nullptr ? T_ACC(0) : static_cast<T_ACC>(dY[index2]);
- }
- }
- }
- g_shared[threadIdx.y][threadIdx.x] = dg_sum1;
- g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2;
- b_shared[threadIdx.y][threadIdx.x] = db_sum1;
- b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2;
- __syncthreads();
- T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y];
- T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y];
- sum1 = cuda_utils::WarpReduceSum(sum1);
- sum2 = cuda_utils::WarpReduceSum(sum2);
- if (threadIdx.x == 0) {
- const int64_t j = blockIdx.x * blockDim.x + threadIdx.y;
- if (j < N) {
- if (dg != nullptr) {
- dg[j] = sum1;
- }
- if (db != nullptr) {
- db[j] = sum2;
- }
- }
- }
- sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y];
- sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y];
- sum1 = cuda_utils::WarpReduceSum(sum1);
- sum2 = cuda_utils::WarpReduceSum(sum2);
- if (threadIdx.x == 0) {
- const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y;
- if (j < N) {
- if (dg != nullptr) {
- dg[j] = sum1;
- }
- if (db != nullptr) {
- db[j] = sum2;
- }
- }
- }
-}
-
-template <typename T, typename T_ACC>
__global__ void GammaBetaBackwardCUDAKernel_32x32(
int64_t M,
int64_t N,