#include <ATen/ATen.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/CUDABlas.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/native/cuda/im2col.cuh>
#include <ATen/native/cuda/vol2col.cuh>
#include <ATen/native/DilatedConvolutionUtils.h>
#include <c10/util/accumulate.h>
#include <tuple>
namespace at {
namespace native {
namespace {
// hyper-volume to column, CUDA
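// Dispatches to vol2col (dim == 3) or im2col (dim == 2); writes a column
// matrix of shape (channels * prod(kernel_size)) x prod(output_size) into
// data_col.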
template <typename Dtype, int64_t dim>
void hvol2col(
cudaStream_t stream,
const Dtype* data_hvol,
const int channels,
const IntArrayRef input_size,
const IntArrayRef output_size,
const IntArrayRef kernel_size,
const IntArrayRef stride_size,
const IntArrayRef pad_size,
const IntArrayRef dilation_size,
Dtype* data_col) {
if (dim == 3) {
vol2col<Dtype>(
stream,
data_hvol,
channels,
input_size[0],
input_size[1],
input_size[2],
output_size[0],
output_size[1],
output_size[2],
kernel_size[0],
kernel_size[1],
kernel_size[2],
pad_size[0],
pad_size[1],
pad_size[2],
stride_size[0],
stride_size[1],
stride_size[2],
dilation_size[0],
dilation_size[1],
dilation_size[2],
data_col);
}
if (dim == 2) {
im2col<Dtype>(
stream,
data_hvol,
channels,
input_size[0],
input_size[1],
output_size[0],
output_size[1],
kernel_size[0],
kernel_size[1],
pad_size[0],
pad_size[1],
stride_size[0],
stride_size[1],
dilation_size[0],
dilation_size[1],
data_col);
}
}
// column to hyper-volume, CUDA
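// Inverse of hvol2col: dispatches to col2vol (dim == 3) or col2im (dim == 2)
// and accumulates overlapping column entries back into the hyper-volume
// data_hvol.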
template <typename Dtype, int64_t dim>
void col2hvol(
cudaStream_t stream,
const Dtype* data_col,
const int channels,
const IntArrayRef input_size,
const IntArrayRef output_size,
const IntArrayRef kernel_size,
const IntArrayRef stride_size,
const IntArrayRef pad_size,
const IntArrayRef dilation_size,
Dtype* data_hvol) {
if (dim == 3) {
col2vol<Dtype, Dtype>(
stream,
data_col,
channels,
input_size[0],
input_size[1],
input_size[2],
output_size[0],
output_size[1],
output_size[2],
kernel_size[0],
kernel_size[1],
kernel_size[2],
pad_size[0],
pad_size[1],
pad_size[2],
stride_size[0],
stride_size[1],
stride_size[2],
dilation_size[0],
dilation_size[1],
dilation_size[2],
data_hvol);
}
if (dim == 2) {
col2im<Dtype, Dtype>(
stream,
data_col,
channels,
input_size[0],
input_size[1],
output_size[0],
output_size[1],
kernel_size[0],
kernel_size[1],
pad_size[0],
pad_size[1],
stride_size[0],
stride_size[1],
dilation_size[0],
dilation_size[1],
data_hvol);
}
}
/*
check tensor data locations
*/
void slow_conv_dilated_location_check(
const Tensor& input,
const Tensor& weight,
const Tensor& bias,
const Tensor& grad_output) {
// checking data locations of user-provided tensor arguments
TensorArg input_arg{input, "input", 2}, weight_arg{weight, "weight", 3},
bias_arg{bias, "bias", 4}, grad_output_arg{grad_output, "grad_output", 5};
checkAllSameGPU("slow_conv_dilated_all_cuda_template", {input_arg, weight_arg});
if (bias.defined()) {
checkAllSameGPU("slow_conv_dilated_all_cuda_template", {input_arg, bias_arg});
}
if (grad_output.defined()) {
checkAllSameGPU(
"slow_conv_dilated_all_cuda_template", {input_arg, grad_output_arg});
}
// We are not checking the data locations of the other tensor
// arguments such as output, grad_input, etc. because these are
// allocated based on the input options and hence always have the
// same data location as the input tensor.
}
/*
slow_conv_dilated_all_cuda_template
Main worker. Computes whichever of the tensors output, grad_input,
grad_weight, and grad_bias are defined.
*/
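/*
Algorithm sketch (as implemented below): for every sample in the batch, the
input hyper-volume is unfolded into a column matrix of shape
(nInputPlane * prod(kernel_size)) x prod(output_size) with hvol2col, and a
single gemm against the weight produces the output planes. The gradients
reverse this: grad_input comes from a gemm followed by col2hvol, grad_weight
from a gemm of the columns against grad_output, and grad_bias from a
per-output-channel sum of grad_output.
*/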
template <int64_t dim>
void slow_conv_dilated_all_cuda_template(
Tensor& output,
const Tensor& input,
const Tensor& weight,
const Tensor& bias,
const Tensor& grad_output,
Tensor& grad_input,
Tensor& grad_weight,
Tensor& grad_bias,
IntArrayRef kernel_size,
IntArrayRef stride_size,
IntArrayRef pad_size,
IntArrayRef dilation_size) {
slow_conv_dilated_location_check(input, weight, bias, grad_output);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
auto options = input.options();
// The trailing (spatial) part of the input tensor sizes:
auto input_size = input.sizes().slice(2);
// The trailing (spatial) part of the output tensor sizes:
auto output_size = internal::get_output_size<dim>(
input, kernel_size, stride_size, pad_size, dilation_size);
int64_t batchSize = input.size(0);
int64_t nInputPlane = weight.size(1);
int64_t nOutputPlane = weight.size(0);
// Temporary buffers:
const int64_t m = c10::multiply_integers(kernel_size);
const int64_t output_vsize = c10::multiply_integers(output_size);
Tensor columns = at::empty({0}, options);
if (output.defined() || grad_weight.defined() || grad_input.defined()) {
columns.resize_({nInputPlane * m, output_vsize});
}
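// columns has one row per (input plane, kernel offset) pair and one column
// per output spatial location, i.e. shape (nInputPlane * m) x output_vsize.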
// Initialize
if (grad_weight.defined()) {
grad_weight.zero_();
}
if (grad_bias.defined()) {
grad_bias.zero_();
}
if (output.defined() && !bias.defined()) {
output.zero_();
}
#ifdef __HIP_PLATFORM_HCC__
/* When using ROCm, the sum evaluation is inaccurate for double
tensors. The reason is currently unknown. Hence, we use gemv for
computing `grad_output_n.sum(dims)` until the ROCm-sum issue is
resolved. */
Tensor ones = at::empty({0}, options);
if (grad_bias.defined()) {
ones.resize_({output_vsize});
ones.fill_(1);
}
/* MSVC does not like #ifdef-s inside CPP macros such as
AT_DISPATCH_FLOATING_TYPES_AND2 used below. So, we define the code
branching outside the CPP macro: */
#define CALCULATE_GRAD_BIAS \
at::cuda::blas::gemv<scalar_t>( \
/*trans=*/'t', \
/* m=*/output_vsize, \
/* n=*/nOutputPlane, \
/*alpha=*/ScalarConvert<int, scalar_t>::to(1), \
/* A=*/grad_output_n.data_ptr<scalar_t>(), \
/* lda=*/output_vsize, \
/* x=*/ones.data_ptr<scalar_t>(), \
/* incx=*/1, \
/* beta=*/ScalarConvert<int, scalar_t>::to(1), \
/* y=*/grad_bias.data_ptr<scalar_t>(), \
/* incy=*/1)
#else
#define CALCULATE_GRAD_BIAS grad_bias += grad_output_n.sum(dims)
#endif
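// In either branch, CALCULATE_GRAD_BIAS accumulates the per-output-channel
// sum of grad_output_n into grad_bias: the gemv form multiplies the
// (output_vsize x nOutputPlane) grad_output_n matrix, transposed, by a
// vector of ones, which is the same reduction as grad_output_n.sum(dims).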
// Helpers
Tensor grad_output_n;
std::vector<int64_t> dims(dim);
std::iota(dims.begin(), dims.end(), 1);
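// dims = {1, ..., dim}: the spatial dimensions of a single grad_output
// sample, summed over when accumulating grad_bias.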
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16,
input.scalar_type(), "slow_conv_dilated<>", [&] {
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt++) {
// Matrix multiply per output:
Tensor input_n = input.select(0, elt);
// Output
if (output.defined()) {
Tensor output_n = output.select(0, elt);
if (bias.defined()) {
/* For gemm argument derivation, see
slow_conv_dilated_all_cpu_template in
ATen/native/DilatedConvolution.cpp */
for (int n = 0; n < nOutputPlane; n++) {
output_n.select(0, n).fill_(bias[n]);
}
}
// Extract columns:
hvol2col<scalar_t, dim>(
stream,
input_n.data_ptr<scalar_t>(),
nInputPlane,
input_size,
output_size,
kernel_size,
stride_size,
pad_size,
dilation_size,
columns.data_ptr<scalar_t>());
/* For gemm argument derivation, see
slow_conv_dilated_all_cpu_template in
ATen/native/DilatedConvolution.cpp */
at::cuda::blas::gemm<scalar_t>(
/*transa=*/'n',
/*transb=*/'n',
/* m=*/columns.size(1),
/* n=*/nOutputPlane,
/* k=*/columns.size(0),
/* alpha=*/ScalarConvert<int, scalar_t>::to(1),
/* A=*/columns.data_ptr<scalar_t>(),
/* lda=*/columns.size(1),
/* B=*/weight.data_ptr<scalar_t>(),
/* ldb=*/columns.size(0),
/* beta=*/ScalarConvert<int, scalar_t>::to(1),
/* C=*/output_n.data_ptr<scalar_t>(),
/* ldc=*/columns.size(1));
} else {
// Backward path: slice grad_output for this sample; it feeds the
// gradient computations below.
grad_output_n = grad_output.select(0, elt);
}
// Gradient of input:
if (grad_input.defined()) {
/* For gemm argument derivation, see
slow_conv_dilated_all_cpu_template in
ATen/native/DilatedConvolution.cpp */
at::cuda::blas::gemm<scalar_t>(
/*transa=*/'n',
/*transb=*/'t',
/* m=*/columns.size(1),
/* n=*/columns.size(0),
/* k=*/nOutputPlane,
/* alpha=*/ScalarConvert<int, scalar_t>::to(1),
/* A=*/grad_output_n.data_ptr<scalar_t>(),
/* lda=*/columns.size(1),
/* B=*/weight.data_ptr<scalar_t>(),
/* ldb=*/columns.size(0),
/* beta=*/ScalarConvert<int, scalar_t>::to(0),
/* C=*/columns.data_ptr<scalar_t>(),
/* ldc=*/columns.size(1));
// Unpack columns back into input:
Tensor grad_input_n = grad_input.select(0, elt);
col2hvol<scalar_t, dim>(
stream,
columns.data_ptr<scalar_t>(),
nInputPlane,
input_size,
output_size,
kernel_size,
stride_size,
pad_size,
dilation_size,
grad_input_n.data_ptr<scalar_t>());
}
// Gradient of weight:
if (grad_weight.defined()) {
// Extract columns:
hvol2col<scalar_t, dim>(
stream,
input_n.data_ptr<scalar_t>(),
nInputPlane,
input_size,
output_size,
kernel_size,
stride_size,
pad_size,
dilation_size,
columns.data_ptr<scalar_t>());
scalar_t scale = ScalarConvert<int, scalar_t>::to(
1); // TODO: expose as argument?
/* For gemm argument derivation, see
slow_conv_dilated_all_cpu_template in
ATen/native/DilatedConvolution.cpp */
at::cuda::blas::gemm<scalar_t>(
/*transa=*/'t',
/*transb=*/'n',
/* m=*/columns.size(0),
/* n=*/nOutputPlane,
/* k=*/columns.size(1),
/* alpha=*/scale,
/* A=*/columns.data_ptr<scalar_t>(),
/* lda=*/columns.size(1),
/* B=*/grad_output_n.data_ptr<scalar_t>(),
/* ldb=*/columns.size(1),
/* beta=*/ScalarConvert<int, scalar_t>::to(1),
/* C=*/grad_weight.data_ptr<scalar_t>(),
/* ldc=*/columns.size(0));
}
// Gradient of bias:
if (grad_bias.defined()) {
/* For gemv argument derivation, see
slow_conv_dilated_all_cpu_template in
ATen/native/DilatedConvolution.cpp */
CALCULATE_GRAD_BIAS; /* MSVC does not like #ifdef-s
inside the CPP macros, see above. */
/*
TODO: when scale != 1 is introduced then use:
grad_bias += scale * grad_output_n.sum(dims);
*/
}
}
});
} // slow_conv_dilated_all_cuda_template
} // namespace
Tensor slow_conv_dilated2d_cuda(
const Tensor& input,
const Tensor& weight,
IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt,
IntArrayRef stride_size,
IntArrayRef pad_size,
IntArrayRef dilation_size) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
const Tensor& bias = *bias_maybe_owned;
Tensor undefined;
internal::slow_conv_dilated_shape_check<2>(
input,
weight,
bias,
undefined,
kernel_size,
stride_size,
pad_size,
dilation_size);
auto is_batch = input.dim() == 4;
auto options = input.options();
// calculate output tensor size
auto output_size = internal::get_output_size<2>(
input, weight, kernel_size, stride_size, pad_size, dilation_size);
// The template function assumes batched tensors; unsqueeze(0) inserts
// a batch dimension without affecting the original tensor.
const Tensor input_ =
(is_batch ? input.contiguous() : input.contiguous().unsqueeze(0));
const Tensor weight_ = weight.contiguous();
const Tensor bias_ = (bias.defined() ? bias.contiguous() : undefined);
Tensor output = at::empty(output_size, options);
Tensor output_ = (is_batch ? output : output.unsqueeze(0));
slow_conv_dilated_all_cuda_template<2>(
output_,
input_,
weight_,
bias_,
undefined,
undefined,
undefined,
undefined,
kernel_size,
stride_size,
pad_size,
dilation_size);
return output;
}
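// Example (a sketch, not part of the kernel): the ATen op that dispatches
// here can be invoked roughly as
//   auto out = at::slow_conv_dilated2d(input, weight, {3, 3}, bias,
//                                      {1, 1}, {2, 2}, {2, 2});
// i.e. (kernel_size, bias, stride, padding, dilation) for a 4-D NCHW input
// and an OIHW weight. In practice this op is normally reached through the
// convolution dispatcher rather than called directly.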
std::tuple<Tensor, Tensor, Tensor> slow_conv_dilated2d_backward_cuda(
const Tensor& grad_output,
const Tensor& input,
const Tensor& weight,
IntArrayRef kernel_size,
IntArrayRef stride_size,
IntArrayRef pad_size,
IntArrayRef dilation_size,
const std::array<bool, 3ul> output_mask) {
Tensor undefined;
internal::slow_conv_dilated_shape_check<2>(
input,
weight,
undefined,
grad_output,
kernel_size,
stride_size,
pad_size,
dilation_size);
auto is_batch = input.dim() == 4;
auto options = grad_output.options();
// The template function assumes batched tensors; unsqueeze(0) inserts
// a batch dimension without affecting the original tensor.
const Tensor grad_output_ =
(is_batch ? grad_output.contiguous()
: grad_output.contiguous().unsqueeze(0));
const Tensor input_ =
(is_batch ? input.contiguous() : input.contiguous().unsqueeze(0));
const Tensor weight_ = weight.contiguous();
// compute only gradients for which the corresponding output_mask is true:
Tensor grad_input =
(output_mask[0] ? at::empty(input.sizes(), options) : undefined);
Tensor grad_weight =
(output_mask[1] ? at::empty(weight.sizes(), options) : undefined);
Tensor grad_bias =
(output_mask[2] ? at::empty(weight.size(0), options) : undefined);
Tensor grad_input_ =
(output_mask[0] ? (is_batch ? grad_input : grad_input.unsqueeze(0))
: undefined);
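// grad_input_ is a (possibly unsqueezed) view that shares storage with
// grad_input, so the writes inside the template also populate the tensor
// returned below.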
slow_conv_dilated_all_cuda_template<2>(
undefined,
input_,
weight_,
undefined,
grad_output_,
grad_input_,
grad_weight,
grad_bias,
kernel_size,
stride_size,
pad_size,
dilation_size);
return std::tie(grad_input, grad_weight, grad_bias);
}
Tensor slow_conv_dilated3d_cuda(
const Tensor& input,
const Tensor& weight,
IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt,
IntArrayRef stride_size,
IntArrayRef pad_size,
IntArrayRef dilation_size) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
const Tensor& bias = *bias_maybe_owned;
Tensor undefined;
internal::slow_conv_dilated_shape_check<3>(
input,
weight,
bias,
undefined,
kernel_size,
stride_size,
pad_size,
dilation_size);
auto is_batch = input.dim() == 5;
auto options = input.options();
// calculate output tensor size
auto output_size = internal::get_output_size<3>(
input, weight, kernel_size, stride_size, pad_size, dilation_size);
// The template function assumes batched tensors; unsqueeze(0) inserts
// a batch dimension without affecting the original tensor.
const Tensor input_ =
(is_batch ? input.contiguous() : input.contiguous().unsqueeze(0));
const Tensor weight_ = weight.contiguous();
const Tensor bias_ = (bias.defined() ? bias.contiguous() : undefined);
Tensor output = at::empty(output_size, options);
Tensor output_ = (is_batch ? output : output.unsqueeze(0));
slow_conv_dilated_all_cuda_template<3>(
output_,
input_,
weight_,
bias_,
undefined,
undefined,
undefined,
undefined,
kernel_size,
stride_size,
pad_size,
dilation_size);
return output;
}
std::tuple<Tensor, Tensor, Tensor> slow_conv_dilated3d_backward_cuda(
const Tensor& grad_output,
const Tensor& input,
const Tensor& weight,
IntArrayRef kernel_size,
IntArrayRef stride_size,
IntArrayRef pad_size,
IntArrayRef dilation_size,
const std::array<bool, 3ul> output_mask) {
Tensor undefined;
internal::slow_conv_dilated_shape_check<3>(
input,
weight,
undefined,
grad_output,
kernel_size,
stride_size,
pad_size,
dilation_size);
auto is_batch = input.dim() == 5;
auto options = grad_output.options();
// The template function assumes batched tensors; unsqueeze(0) inserts
// a batch dimension without affecting the original tensor.
const Tensor grad_output_ =
(is_batch ? grad_output.contiguous()
: grad_output.contiguous().unsqueeze(0));
const Tensor input_ =
(is_batch ? input.contiguous() : input.contiguous().unsqueeze(0));
const Tensor weight_ = weight.contiguous();
// compute only gradients for which the corresponding output_mask is true:
Tensor grad_input =
(output_mask[0] ? at::empty(input.sizes(), options) : undefined);
Tensor grad_weight =
(output_mask[1] ? at::empty(weight.sizes(), options) : undefined);
Tensor grad_bias =
(output_mask[2] ? at::empty(weight.size(0), options) : undefined);
Tensor grad_input_ =
(output_mask[0] ? (is_batch ? grad_input : grad_input.unsqueeze(0))
: undefined);
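// grad_input_ is a (possibly unsqueezed) view that shares storage with
// grad_input, so the writes inside the template also populate the tensor
// returned below.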
slow_conv_dilated_all_cuda_template<3>(
undefined,
input_,
weight_,
undefined,
grad_output_,
grad_input_,
grad_weight,
grad_bias,
kernel_size,
stride_size,
pad_size,
dilation_size);
return std::tie(grad_input, grad_weight, grad_bias);
}
} // namespace native
} // namespace at