make ATen/native/cuda/AdaptiveAveragePooling.cu data_ptr-correct (#100030)
Summary:
Replaces unqualified `data_ptr<scalar_t>()` calls with `const_data_ptr<scalar_t>()` for tensors that are only read and `mutable_data_ptr<scalar_t>()` for tensors that are written, and const-qualifies the matching pointer parameters of the CUDA kernels. Traced through each input and output to ensure correctness.
Test Plan: Rely on CI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/100030
Approved by: https://github.com/ezyang
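
For context, the pattern applied throughout the diff below: CUDA kernels take `const T *` for tensors they only read, and host code fetches device pointers with the intent-revealing accessors `const_data_ptr<scalar_t>()` / `mutable_data_ptr<scalar_t>()` rather than the unqualified `data_ptr<scalar_t>()`. A minimal sketch of that pattern, assuming ATen headers; the kernel, wrapper, and tensor names here are hypothetical and not from this file:

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>

// Hypothetical kernel: reads `in` and writes `out`, so `in` is const-qualified.
template <typename T>
__global__ void scale_kernel(const T *in, T *out, T alpha, int64_t n) {
  int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = alpha * in[i];
  }
}

// Hypothetical host wrapper showing the accessor split at the launch site.
void scale(const at::Tensor& input, at::Tensor& output, double alpha) {
  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "scale", [&] {
    const int64_t n = input.numel();
    const int threads = 256;
    const int blocks = static_cast<int>((n + threads - 1) / threads);
    scale_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        input.const_data_ptr<scalar_t>(),    // read-only access
        output.mutable_data_ptr<scalar_t>(), // declares intent to write
        static_cast<scalar_t>(alpha), n);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  });
}

Making read/write intent explicit at each call site is the point of the broader data_ptr-correctness effort: read-only accesses can be distinguished from writes to the underlying storage, which matters for work such as copy-on-write tensors.
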
diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu
index b25f387..d96a299 100644
--- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu
+++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu
@@ -49,7 +49,7 @@
* 4D input, 4D output
*/
template <typename T>
- __global__ void adaptive_average_pool(T *input, T *output,
+ __global__ void adaptive_average_pool(const T *input, T *output,
int isizeH, int isizeW,
int osizeH, int osizeW,
int64_t istrideD, int64_t istrideH, int64_t istrideW)
@@ -86,7 +86,7 @@
int kW = iendW - istartW;
// Compute the average pooling over corresponding input pixels
- T *ptr_input = input + istartH*istrideH + istartW*istrideW;
+ const T *ptr_input = input + istartH*istrideH + istartW*istrideW;
T *ptr_output = output + oh*osizeW + ow;
T sum = static_cast<T>(0);
int ih, iw;
@@ -109,7 +109,7 @@
*/
template <typename T>
__global__ void adaptive_average_gradinput(
- T *gradInput, T *gradOutput,
+ T *gradInput, const T *gradOutput,
int isizeH, int isizeW, int osizeH, int osizeW
)
{
@@ -165,7 +165,7 @@
*/
template <typename T>
__global__ void atomic_adaptive_average_gradinput(
- T *gradInput, T *gradOutput,
+ T *gradInput, const T *gradOutput,
int isizeH, int isizeW, int osizeH, int osizeW
)
{
@@ -202,7 +202,7 @@
// Compute the gradients for over corresponding input pixels
T *ptr_gradInput = gradInput + istartH*isizeW + istartW;
- T *ptr_gradOutput = gradOutput + oh*osizeW + ow;
+ const T *ptr_gradOutput = gradOutput + oh*osizeW + ow;
T grad_delta = *ptr_gradOutput / kW / kH;
int ih, iw;
@@ -532,8 +532,8 @@
size_t shmem_size = (kernel_size_C * block_x * block_y * block_z) * sizeof(scalar_t);
AT_ASSERT(shmem_size <= sharedMemPerBlock);
adaptive_average_pool_nhwc<int32_t><<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>> (
- input_.data_ptr<scalar_t>(),
- output.data_ptr<scalar_t>(),
+ input_.const_data_ptr<scalar_t>(),
+ output.mutable_data_ptr<scalar_t>(),
sizeB, sizeC, isizeH, isizeW, osizeH, osizeW,
kernel_stride_C, kernel_size_C,
istrideB, istrideC, istrideH, istrideW);
@@ -569,8 +569,8 @@
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16,
input_.scalar_type(), "adaptive_avg_pool2d_cuda", [&] {
- scalar_t *input_data = input_.data_ptr<scalar_t>();
- scalar_t *output_data = output.data_ptr<scalar_t>();
+ const scalar_t *input_data = input_.const_data_ptr<scalar_t>();
+ scalar_t *output_data = output.mutable_data_ptr<scalar_t>();
// cuda blocks & threads:
int blocksH = std::max<int64_t>((int)(16L / sizeD), 1);
@@ -682,8 +682,8 @@
size_t shmem_size = (kernel_size_C * block_x * block_y * block_z + osizeH + osizeW) * sizeof(scalar_t) + 2 * isizeW * sizeof(int32_t);
AT_ASSERT(shmem_size <= sharedMemPerBlock);
adaptive_average_gradinput_nhwc<int32_t><<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>> (
- gradInput.data_ptr<scalar_t>(),
- gradOutput.data_ptr<scalar_t>(),
+ gradInput.mutable_data_ptr<scalar_t>(),
+ gradOutput.const_data_ptr<scalar_t>(),
sizeB, sizeC, isizeH, isizeW, osizeH, osizeW,
kernel_stride_C, kernel_size_C,
ostrideB, ostrideC, ostrideH, ostrideW);
@@ -710,8 +710,8 @@
//bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0);
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16,
input.scalar_type(), "adaptive_avg_pool2d_backward_cuda", [&] {
- scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
- scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
+ const scalar_t *gradOutput_data = gradOutput.const_data_ptr<scalar_t>();
+ scalar_t *gradInput_data = gradInput.mutable_data_ptr<scalar_t>();
// cuda blocks & threads:
int blocksH = std::max((int)(16L / sizeD), 1);
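
Note how the backward hunks mirror the forward ones with the roles swapped: `gradOutput` is the read-only operand (`const T *` in the kernels, `const_data_ptr` on the host), while `gradInput` is the tensor being written (`T *`, `mutable_data_ptr`). The atomic variant `atomic_adaptive_average_gradinput` likewise accumulates into `gradInput`, so it keeps the mutable pointer as well.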