#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/Pool.h>
#include <tuple>
namespace at {
namespace native {
namespace {
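// Forward kernel: for each (batch, plane) slice, slide a kH x kW window over
// the (conceptually padded) input and write the window average to the output.
// The divisor is divisor_override when given; otherwise it is the window size
// including padded positions (count_include_pad) or only the valid input
// elements the window covers.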
template <typename scalar_t>
static void avg_pool2d_out_frame(
scalar_t *input_data,
scalar_t *output_data,
int64_t nbatch,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
for (auto k = start; k < end; k++)
{
int64_t p;
for(p = 0; p < nbatch; p++)
{
int64_t xx, yy;
/* For all output pixels... */
scalar_t *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
const scalar_t *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
int64_t i;
for(i = 0; i < outputWidth*outputHeight; i++)
ptr_output[i] = 0;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
/* Compute the mean of the input image... */
int64_t hstart = yy * dH - padH;
int64_t wstart = xx * dW - padW;
int64_t hend = std::min(hstart + kH, inputHeight + padH);
int64_t wend = std::min(wstart + kW, inputWidth + padW);
int pool_size = (hend - hstart) * (wend - wstart);
hstart = std::max(hstart, (int64_t) 0);
wstart = std::max(wstart, (int64_t) 0);
hend = std::min(hend, inputHeight);
wend = std::min(wend, inputWidth);
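// Pick the divisor: an explicit divisor_override wins; otherwise divide by the
// padding-inclusive window size or by the clamped (valid-only) window,
// depending on count_include_pad.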
scalar_t sum = 0;
int divide_factor;
if (divisor_override.has_value()) {
divide_factor = divisor_override.value();
} else {
if(count_include_pad) {
divide_factor = pool_size;
} else {
divide_factor = (hend - hstart) * (wend - wstart);
}
}
int64_t kx, ky;
for(ky = hstart; ky < hend; ky++)
{
for(kx = wstart; kx < wend; kx++)
sum += ptr_input[ky*inputWidth + kx];
}
/* Update output */
*ptr_output++ += sum/divide_factor;
}
}
}
}
});
}
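// Validates kernel_size/stride/padding, computes the output shape via
// pooling_output_shape, resizes `output`, and dispatches the typed forward
// kernel over floating types and Long.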
void avg_pool2d_out_cpu_template(
Tensor &output,
const Tensor &input_,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
bool ceil_mode,
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
// #20866, #22032: Guarantee this for the official C++ API?
TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
"avg_pool2d: kernel_size must either be a single int, or a tuple of two ints");
const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
"avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints");
const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
const int dW = stride.empty() ? kW :
stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
"avg_pool2d: padding must either be a single int, or a tuple of two ints");
const int padH = safe_downcast<int, int64_t>(padding[0]);
const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,
"divisor must not be zero");
/* sizes */
const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
const int64_t nInputPlane = input_.size(-3);
const int64_t inputHeight = input_.size(-2);
const int64_t inputWidth = input_.size(-1);
const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, 1, ceil_mode);
const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, 1, ceil_mode);
pool2d_shape_check(
input_,
kH, kW, dH, dW, padH, padW, 1, 1,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth);
if (input_.ndimension() == 3) {
output.resize_({nInputPlane, outputHeight, outputWidth});
}
else {
output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
}
TORCH_CHECK(output.is_contiguous(), "avg_pool2d: output must be contiguous");
Tensor input = input_.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(),
"avg_pool2d_out_frame",
[&] {
scalar_t *input_data = input.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
avg_pool2d_out_frame(
input_data,
output_data,
nbatch,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH,
dW, dH,
padW, padH,
count_include_pad,
divisor_override);
}
);
}
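// Backward kernel: each gradOutput element is distributed evenly over the
// input positions its pooling window covered, divided by the same divisor
// rule as the forward pass.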
template <typename scalar_t>
static void avg_pool2d_backward_out_frame(
scalar_t *gradInput_data,
scalar_t *gradOutput_data,
int64_t nbatch,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
for (auto k = start; k < end; k++)
{
int64_t p;
for(p = 0; p < nbatch; p++)
{
const scalar_t *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
int64_t xx, yy;
scalar_t *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
int64_t i;
for(i = 0; i < inputWidth*inputHeight; i++)
ptr_gradInput[i] = 0.0;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
int64_t hstart = yy * dH - padH;
int64_t wstart = xx * dW - padW;
int64_t hend = std::min(hstart + kH, inputHeight + padH);
int64_t wend = std::min(wstart + kW, inputWidth + padW);
int pool_size = (hend - hstart) * (wend - wstart);
hstart = std::max(hstart, (int64_t) 0);
wstart = std::max(wstart, (int64_t) 0);
hend = std::min(hend, inputHeight);
wend = std::min(wend, inputWidth);
scalar_t z = *ptr_gradOutput++;
int divide_factor;
if (divisor_override.has_value()) {
divide_factor = divisor_override.value();
} else {
if(count_include_pad) {
divide_factor = pool_size;
} else {
divide_factor = (hend - hstart) * (wend - wstart);
}
}
int64_t kx, ky;
for(ky = hstart; ky < hend; ky++)
{
for(kx = wstart; kx < wend; kx++)
ptr_gradInput[ky*inputWidth + kx] += z/divide_factor;
}
}
}
}
}
});
}
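// Mirrors the forward template: argument and shape checks, resize and zero
// gradInput, then dispatch the typed backward kernel.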
Tensor& avg_pool2d_backward_out_cpu_template(
Tensor& gradInput,
const Tensor& gradOutput_,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
bool ceil_mode,
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
// #20866, #22032: Guarantee this for the official C++ API?
TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
"avg_pool2d: kernel_size must either be a single int, or a tuple of two ints");
const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
"avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints");
const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
const int dW = stride.empty() ? kW :
stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
"avg_pool2d: padding must either be a single int, or a tuple of two ints");
const int padH = safe_downcast<int, int64_t>(padding[0]);
const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
const int64_t ndim = input.ndimension();
TORCH_CHECK((ndim == 3 || ndim == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0, "divisor must not be zero");
/* sizes */
const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
const int64_t nInputPlane = input.size(-3); // number of channels (or colors)
const int64_t inputHeight = input.size(-2);
const int64_t inputWidth = input.size(-1);
const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, 1, ceil_mode);
const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, 1, ceil_mode);
avg_pool2d_backward_shape_check(
input,
gradOutput_,
nbatch,
kH, kW, dH, dW, padH, padW,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth);
/* get contiguous gradOutput */
const Tensor gradOutput = gradOutput_.contiguous();
/* resize */
gradInput.resize_as_(input);
gradInput.zero_();
TORCH_CHECK(gradInput.is_contiguous(), "gradInput must be contiguous");
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(),
"avg_pool2d_backward_out_frame",
[&] {
scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
avg_pool2d_backward_out_frame(
gradInput_data,
gradOutput_data,
nbatch,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH,
dW, dH,
padW, padH,
count_include_pad,
divisor_override);
}
);
return gradInput;
}
} // namespace
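// Public CPU entry points; thin wrappers around the templates above.
// Illustrative call through the dispatcher (assumed typical ATen usage, not
// taken from this file):
//   Tensor y = at::avg_pool2d(x, /*kernel_size=*/{3, 3}, /*stride=*/{2, 2},
//                             /*padding=*/{1, 1}, /*ceil_mode=*/false,
//                             /*count_include_pad=*/true,
//                             /*divisor_override=*/c10::nullopt);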
Tensor& avg_pool2d_out_cpu(
Tensor& output,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
bool ceil_mode,
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
avg_pool2d_out_cpu_template(
output,
input,
kernel_size,
stride,
padding,
ceil_mode,
count_include_pad,
divisor_override);
return output;
}
Tensor avg_pool2d_cpu(
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
bool ceil_mode,
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
Tensor output = at::empty({0}, input.options());
avg_pool2d_out_cpu_template(
output,
input,
kernel_size,
stride,
padding,
ceil_mode,
count_include_pad,
divisor_override);
return output;
}
Tensor& avg_pool2d_backward_out_cpu(
Tensor& gradInput,
const Tensor& gradOutput_,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
bool ceil_mode,
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
avg_pool2d_backward_out_cpu_template(
gradInput,
gradOutput_,
input,
kernel_size,
stride,
padding,
ceil_mode,
count_include_pad,
divisor_override);
return gradInput;
}
Tensor avg_pool2d_backward_cpu(
const Tensor& gradOutput_,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
bool ceil_mode,
bool count_include_pad,
c10::optional<int64_t> divisor_override)
{
auto gradInput = at::zeros_like(input);
avg_pool2d_backward_out_cpu_template(
gradInput,
gradOutput_,
input,
kernel_size,
stride,
padding,
ceil_mode,
count_include_pad,
divisor_override);
return gradInput;
}
} // at::native
} // at