#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/TensorUtils.h>
#include <ATen/native/CPUBlas.h>
#include <ATen/native/vol2col.h>
namespace at {
namespace native {
template<typename scalar_t>
void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t lda, scalar_t *x, int64_t incx, scalar_t beta, scalar_t *y, int64_t incy);
namespace {
static inline void slow_conv_transpose3d_shape_check(
const Tensor& input,
const Tensor& grad_output,
const Tensor& weight,
const Tensor& bias,
int kernel_depth,
int kernel_width,
int kernel_height,
int stride_depth,
int stride_width,
int stride_height,
int padding_depth,
int padding_width,
int padding_height,
int dilation_depth,
int dilation_width,
int dilation_height,
int output_padding_depth,
int output_padding_width,
int output_padding_height,
int weight_nullable) {
TORCH_CHECK(
input.numel() != 0 && (input.dim() == 4 || input.dim() == 5),
"non-empty 4D or 5D (batch mode) tensor expected for input, but got: ",
input.sizes());
TORCH_CHECK(
stride_depth > 0 && stride_width > 0 && stride_height > 0,
"stride should be greater than zero, but got stride_depth: ",
stride_depth,
" stride_height: ",
stride_height,
" stride_width: ",
stride_width);
TORCH_CHECK(
dilation_depth > 0 && dilation_width > 0 && dilation_height > 0,
"dilation should be greater than zero, but got dilation_depth: ",
dilation_depth,
", dilation_height: ",
dilation_height,
", dilation_width: ",
dilation_width);
TORCH_CHECK(
(output_padding_depth < stride_depth ||
output_padding_depth < dilation_depth) &&
(output_padding_width < stride_width ||
output_padding_width < dilation_width) &&
(output_padding_height < stride_height ||
output_padding_height < dilation_height),
"output padding must be smaller than either stride or dilation,",
" but got output_padding_depth: ",
output_padding_depth,
" output_padding_height: ",
output_padding_height,
" output_padding_width: ",
output_padding_width,
" stride_depth: ",
stride_depth,
" stride_height: ",
stride_height,
" stride_width: ",
stride_width,
" dilation_depth: ",
dilation_depth,
" dilation_height: ",
dilation_height,
" dilation_width: ",
dilation_width);
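// output_padding only resolves the ambiguity in recovering the input size of
// the corresponding forward convolution (several input sizes can map to the
// same output size), so it must stay strictly below the stride or dilation.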
// number of input & output planes and kernel size are indirectly defined by
// the weight tensor
if (weight.defined()) {
/* TODO: TORCH_CHECK should just have 2 args: condition and message */
TORCH_CHECK(
weight.numel() != 0 && weight.dim() == 5,
"non-empty 5D (n_output_plane x n_input_plane x kernel_depth",
" x kernel_height x kernel_width) tensor ",
"expected for weight, but got: ",
weight.sizes());
if (bias.defined()) {
check_dim_size(bias, 1, 0, weight.size(1));
}
} else if (!weight_nullable) {
AT_ERROR("weight tensor is expected to be non-nullable");
}
int ndim = input.dim();
int dimf = 0;
int dimd = 1;
int dimh = 2;
int dimw = 3;
if (ndim == 5) {
dimf++;
dimd++;
dimh++;
dimw++;
}
if (weight.defined()) {
const int64_t n_input_plane = weight.size(0);
check_dim_size(input, ndim, dimf, n_input_plane);
}
const int64_t input_width = input.size(dimw);
const int64_t input_height = input.size(dimh);
const int64_t input_depth = input.size(dimd);
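// For a transposed convolution each output spatial extent is
//   out = (in - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1 + output_padding,
// i.e. the inverse of the forward convolution size formula.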
const int64_t output_depth = (input_depth - 1) * stride_depth -
2 * padding_depth + (dilation_depth * (kernel_depth - 1) + 1) +
output_padding_depth;
const int64_t output_height = (input_height - 1) * stride_height -
2 * padding_height + (dilation_height * (kernel_height - 1) + 1) +
output_padding_height;
const int64_t output_width = (input_width - 1) * stride_width -
2 * padding_width + (dilation_width * (kernel_width - 1) + 1) +
output_padding_width;
if (output_depth < 1 || output_width < 1 || output_height < 1) {
AT_ERROR(
"Given input size per channel: (",
input_depth,
" x ",
input_height,
" x ",
input_width,
"). "
"Calculated output size per channel: (",
output_depth,
" x ",
output_height,
" x ",
output_width,
"). Output size is too small");
}
if (grad_output.defined()) {
if (weight.defined()) {
const int64_t n_output_plane = weight.size(1);
check_dim_size(grad_output, ndim, dimf, n_output_plane);
} else if (bias.defined()) {
const int64_t n_output_plane = bias.size(0);
check_dim_size(grad_output, ndim, dimf, n_output_plane);
}
check_dim_size(grad_output, ndim, dimd, output_depth);
check_dim_size(grad_output, ndim, dimh, output_height);
check_dim_size(grad_output, ndim, dimw, output_width);
}
}
void slow_conv_transpose3d_out_cpu_template(
Tensor& output,
const Tensor& input_, // 4D or 5D (batch) tensor
const Tensor& weight_, // weight tensor (n_input_plane x n_output_plane x
// kernel_depth x kernel_height x kernel_width)
IntArrayRef kernel_size,
const Tensor& bias_,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef output_padding,
IntArrayRef dilation,
Tensor& finput,
Tensor& fgrad_input) {
TORCH_CHECK(
kernel_size.size() == 3,
"It is expected kernel_size equals to 3, but got size ",
kernel_size.size());
TORCH_CHECK(
dilation.size() == 3,
"It is expected dilation equals to 3, but got size ",
dilation.size());
TORCH_CHECK(
padding.size() == 3,
"It is expected padding equals to 3, but got size ",
padding.size());
TORCH_CHECK(
stride.size() == 3,
"It is expected stride equals to 3, but got size ",
stride.size());
TORCH_CHECK(
output_padding.size() == 3,
"It is expected stride equals to 3, but got size ",
output_padding.size());
int64_t kernel_depth = kernel_size[0];
int64_t kernel_height = kernel_size[1];
int64_t kernel_width = kernel_size[2];
int64_t dilation_depth = dilation[0];
int64_t dilation_height = dilation[1];
int64_t dilation_width = dilation[2];
int64_t padding_depth = padding[0];
int64_t padding_height = padding[1];
int64_t padding_width = padding[2];
int64_t stride_depth = stride[0];
int64_t stride_height = stride[1];
int64_t stride_width = stride[2];
int64_t output_padding_depth = output_padding[0];
int64_t output_padding_height = output_padding[1];
int64_t output_padding_width = output_padding[2];
// internal columns buffer
Tensor& columns = finput;
// internal ones buffer
Tensor& ones = fgrad_input;
slow_conv_transpose3d_shape_check(
input_,
Tensor(),
weight_,
bias_,
kernel_depth,
kernel_width,
kernel_height,
stride_depth,
stride_width,
stride_height,
padding_depth,
padding_width,
padding_height,
dilation_depth,
dilation_width,
dilation_height,
output_padding_depth,
output_padding_width,
output_padding_height,
0);
Tensor input = input_.contiguous();
Tensor weight = weight_.contiguous();
Tensor bias = bias_.defined() ? bias_.contiguous() : bias_;
const int n_input_plane = (int)weight.size(0);
const int n_output_plane = (int)weight.size(1);
bool is_batch = false;
if (input.dim() == 4) {
// Force batch
is_batch = true;
input.resize_(
{1, input.size(0), input.size(1), input.size(2), input.size(3)});
}
const int64_t input_width = input.size(4);
const int64_t input_height = input.size(3);
const int64_t input_depth = input.size(2);
const int64_t output_depth = (input_depth - 1) * stride_depth -
2 * padding_depth + (dilation_depth * (kernel_depth - 1) + 1) +
output_padding_depth;
const int64_t output_height = (input_height - 1) * stride_height -
2 * padding_height + (dilation_height * (kernel_height - 1) + 1) +
output_padding_height;
const int64_t output_width = (input_width - 1) * stride_width -
2 * padding_width + (dilation_width * (kernel_width - 1) + 1) +
output_padding_width;
// Batch size + input planes
const int64_t batch_size = input.size(0);
// Resize output
output.resize_(
{batch_size, n_output_plane, output_depth, output_height, output_width});
// Resize temporary columns
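// The columns buffer has one row per (output_channel, kd, kh, kw)
// combination and one column per input spatial location; the gemm below
// fills it with weight^T @ input_n before col2vol accumulates it into the
// output volume.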
columns.resize_({n_output_plane * kernel_width * kernel_height * kernel_depth,
input_depth * input_height * input_width});
columns.zero_();
// Define a buffer of ones, for bias accumulation
// Note: this buffer can be shared with other modules; it only ever gets
// increased, and always contains ones.
if (ones.dim() != 3 ||
ones.size(0) * ones.size(1) * ones.size(2) <
output_depth * output_height * output_width) {
// Resize plane and fill with ones...
ones.resize_({output_depth, output_height, output_width});
ones.fill_(1);
}
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long,
input.scalar_type(), "slow_conv_transpose3d_out_cpu", [&] {
// Helpers
Tensor input_n;
Tensor output_n;
int64_t elt;
// For each elt in batch, do:
for (elt = 0; elt < batch_size; ++elt) {
// Matrix multiply per output:
input_n = input.select(0, elt);
output_n = output.select(0, elt);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
const int64_t m =
weight.size(1) * weight.size(2) * weight.size(3) * weight.size(4);
const int64_t n = columns.size(1);
const int64_t k = weight.size(0);
// Do GEMM (note: this is a bit confusing because gemm assumes
// column-major matrices)
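// In row-major terms this computes
//   columns (n_output_plane*kd*kh*kw x in_d*in_h*in_w) = weight^T @ input_n;
// a row-major product C = A @ B maps onto a column-major gemm via
// C^T = B^T @ A^T, which is why the operand order, transpose flags and
// leading dimensions below look swapped.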
cpublas::gemm(
cpublas::NoTranspose,
cpublas::Transpose,
n,
m,
k,
1,
input_n.data_ptr<scalar_t>(),
n,
weight.data_ptr<scalar_t>(),
m,
0,
columns.data_ptr<scalar_t>(),
n);
// Unpack columns back into the output:
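// col2vol is the adjoint of vol2col: it accumulates the (possibly
// overlapping) column patches back into the dense output volume.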
at::native::col2vol<scalar_t>(
columns.data_ptr<scalar_t>(),
n_output_plane,
output_depth,
output_height,
output_width,
input_depth,
input_height,
input_width,
kernel_depth,
kernel_height,
kernel_width,
padding_depth,
padding_height,
padding_width,
stride_depth,
stride_height,
stride_width,
dilation_depth,
dilation_height,
dilation_width,
output_n.data_ptr<scalar_t>());
// Do Bias after:
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
const int64_t m_ = n_output_plane;
const int64_t n_ = output_depth * output_height * output_width;
const int64_t k_ = 1;
// Do GEMM (note: this is a bit confusing because gemm assumes
// column-major matrices)
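// When bias is defined, the rank-1 gemm below computes
//   output_n += bias (n_output_plane x 1) @ ones^T (1 x out_d*out_h*out_w),
// i.e. it adds bias[c] to every spatial position of output channel c.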
if (bias.defined()) {
cpublas::gemm(
cpublas::Transpose,
cpublas::NoTranspose,
n_,
m_,
k_,
1,
ones.data_ptr<scalar_t>(),
k_,
bias.data_ptr<scalar_t>(),
k_,
1,
output_n.data_ptr<scalar_t>(),
n_);
}
}
// Resize output
if (is_batch) {
output.resize_(
{n_output_plane, output_depth, output_height, output_width});
input.resize_(
{n_input_plane, input_depth, input_height, input_width});
}
});
}
void slow_conv_transpose3d_backward_out_cpu_template(
const Tensor& input_,
const Tensor& grad_output_,
Tensor& grad_input,
const Tensor& weight_,
const Tensor& finput,
const Tensor& fgrad_input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef output_padding,
IntArrayRef dilation) {
TORCH_CHECK(
kernel_size.size() == 3,
"It is expected kernel_size equals to 3, but got size ",
kernel_size.size());
TORCH_CHECK(
dilation.size() == 3,
"It is expected dilation equals to 3, but got size ",
dilation.size());
TORCH_CHECK(
padding.size() == 3,
"It is expected padding equals to 3, but got size ",
padding.size());
TORCH_CHECK(
stride.size() == 3,
"It is expected stride equals to 3, but got size ",
stride.size());
TORCH_CHECK(
output_padding.size() == 3,
"It is expected stride equals to 3, but got size ",
output_padding.size());
Tensor grad_columns = finput;
int64_t kernel_depth = kernel_size[0];
int64_t kernel_height = kernel_size[1];
int64_t kernel_width = kernel_size[2];
int64_t dilation_depth = dilation[0];
int64_t dilation_height = dilation[1];
int64_t dilation_width = dilation[2];
int64_t padding_depth = padding[0];
int64_t padding_height = padding[1];
int64_t padding_width = padding[2];
int64_t stride_depth = stride[0];
int64_t stride_height = stride[1];
int64_t stride_width = stride[2];
int64_t output_padding_depth = output_padding[0];
int64_t output_padding_height = output_padding[1];
int64_t output_padding_width = output_padding[2];
// number of input & output planes and kernel size are indirectly defined by
// the weight tensor
slow_conv_transpose3d_shape_check(
input_,
grad_output_,
weight_,
Tensor(),
kernel_depth,
kernel_width,
kernel_height,
stride_depth,
stride_width,
stride_height,
padding_depth,
padding_width,
padding_height,
dilation_depth,
dilation_width,
dilation_height,
output_padding_depth,
output_padding_width,
output_padding_height,
0);
Tensor input = input_.contiguous();
Tensor weight = weight_.contiguous();
Tensor grad_output = grad_output_.contiguous();
const int64_t n_input_plane = weight.size(0);
const int64_t n_output_plane = weight.size(1);
bool is_batch = false;
if (input.dim() == 4) {
// Force batch
is_batch = true;
input.resize_(
{1, input.size(0), input.size(1), input.size(2), input.size(3)});
grad_output.resize_({1,
grad_output.size(0),
grad_output.size(1),
grad_output.size(2),
grad_output.size(3)});
}
const int64_t input_width = input.size(4);
const int64_t input_height = input.size(3);
const int64_t input_depth = input.size(2);
const int64_t output_depth = (input_depth - 1) * stride_depth -
2 * padding_depth + (dilation_depth * (kernel_depth - 1) + 1) +
output_padding_depth;
const int64_t output_height = (input_height - 1) * stride_height -
2 * padding_height + (dilation_height * (kernel_height - 1) + 1) +
output_padding_height;
const int64_t output_width = (input_width - 1) * stride_width -
2 * padding_width + (dilation_width * (kernel_width - 1) + 1) +
output_padding_width;
// Batch size + input planes
const int64_t batch_size = input.size(0);
// Resize output
grad_input.resize_(
{batch_size, n_input_plane, input_depth, input_height, input_width});
grad_input.zero_();
// Resize temporary columns
grad_columns.resize_(
{n_output_plane * kernel_width * kernel_height * kernel_depth,
input_depth * input_height * input_width});
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "slow_conv_transpose3d_backward_out_cpu", [&] {
// Helpers
Tensor grad_input_n;
Tensor grad_output_n;
int64_t elt;
// For each elt in batch, do:
for (elt = 0; elt < batch_size; ++elt) {
// Matrix multiply per sample:
grad_input_n = grad_input.select(0, elt);
grad_output_n = grad_output.select(0, elt);
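// Fast path: for a 1x1x1 kernel with unit stride and dilation and zero
// padding, vol2col is the identity, so grad_output_n can be fed to the
// gemm below directly and the column extraction is skipped.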
if (kernel_depth != 1 || kernel_height != 1 || kernel_width != 1 ||
stride_depth != 1 || stride_height != 1 || stride_width != 1 ||
dilation_depth != 1 || dilation_height != 1 ||
dilation_width != 1 || padding_depth != 0 ||
padding_height != 0 || padding_width != 0) {
// Extract columns:
at::native::vol2col<scalar_t>(
grad_output_n.data_ptr<scalar_t>(),
n_output_plane,
output_depth,
output_height,
output_width,
input_depth,
input_height,
input_width,
kernel_depth,
kernel_height,
kernel_width,
padding_depth,
padding_height,
padding_width,
stride_depth,
stride_height,
stride_width,
dilation_depth,
dilation_height,
dilation_width,
grad_columns.data_ptr<scalar_t>());
}
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
const int64_t m = weight.size(0);
const int64_t n = grad_columns.size(1);
const int64_t k =
weight.size(1) * weight.size(2) * weight.size(3) * weight.size(4);
// Do GEMM (note: this is a bit confusing because gemm assumes
// column-major matrices)
auto gemm_in_ptr =
(kernel_depth != 1 || kernel_height != 1 || kernel_width != 1 ||
stride_depth != 1 || stride_height != 1 || stride_width != 1 ||
dilation_depth != 1 || dilation_height != 1 ||
dilation_width != 1 || padding_depth != 0 ||
padding_height != 0 || padding_width != 0)
? grad_columns.data_ptr<scalar_t>()
: grad_output_n.data_ptr<scalar_t>();
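// In row-major terms this computes
//   grad_input_n (n_input_plane x in_d*in_h*in_w) = weight @ grad_columns
// (or weight @ grad_output_n on the fast path above), again expressed as a
// column-major gemm with swapped operands.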
cpublas::gemm(
cpublas::NoTranspose,
cpublas::NoTranspose,
n,
m,
k,
1,
gemm_in_ptr,
n,
weight.data_ptr<scalar_t>(),
k,
0,
grad_input_n.data_ptr<scalar_t>(),
n);
}
// Resize output
if (is_batch) {
grad_output.resize_(
{n_output_plane, output_depth, output_height, output_width});
input.resize_(
{n_input_plane, input_depth, input_height, input_width});
grad_input.resize_(
{n_input_plane, input_depth, input_height, input_width});
}
});
}
void slow_conv_transpose3d_acc_grad_parameters_cpu(
const Tensor& input_,
const Tensor& grad_output_,
Tensor& grad_weight,
Tensor& grad_bias,
const Tensor& finput,
const Tensor& fgrad_input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef output_padding,
IntArrayRef dilation,
int scale_) {
TORCH_CHECK(
kernel_size.size() == 3,
"It is expected kernel_size equals to 3, but got size ",
kernel_size.size());
TORCH_CHECK(
dilation.size() == 3,
"It is expected dilation equals to 3, but got size ",
dilation.size());
TORCH_CHECK(
padding.size() == 3,
"It is expected padding equals to 3, but got size ",
padding.size());
TORCH_CHECK(
stride.size() == 3,
"It is expected stride equals to 3, but got size ",
stride.size());
TORCH_CHECK(
output_padding.size() == 3,
"It is expected stride equals to 3, but got size ",
output_padding.size());
int64_t kernel_depth = kernel_size[0];
int64_t kernel_height = kernel_size[1];
int64_t kernel_width = kernel_size[2];
int64_t dilation_depth = dilation[0];
int64_t dilation_height = dilation[1];
int64_t dilation_width = dilation[2];
int64_t padding_depth = padding[0];
int64_t padding_height = padding[1];
int64_t padding_width = padding[2];
int64_t stride_depth = stride[0];
int64_t stride_height = stride[1];
int64_t stride_width = stride[2];
int64_t output_padding_depth = output_padding[0];
int64_t output_padding_height = output_padding[1];
int64_t output_padding_width = output_padding[2];
// number of input & output planes and kernel size are indirectly defined by
// the grad_weight tensor
slow_conv_transpose3d_shape_check(
input_,
grad_output_,
grad_weight,
grad_bias,
kernel_depth,
kernel_width,
kernel_height,
stride_depth,
stride_width,
stride_height,
padding_depth,
padding_width,
padding_height,
dilation_depth,
dilation_width,
dilation_height,
output_padding_depth,
output_padding_width,
output_padding_height,
1);
int64_t n_output_plane;
if (grad_weight.defined()) {
n_output_plane = grad_weight.size(1);
} else if (grad_bias.defined()) {
n_output_plane = grad_bias.size(0);
} else {
return;
}
Tensor columns = finput;
Tensor ones = fgrad_input;
Tensor input = input_.contiguous();
Tensor grad_output = grad_output_.contiguous();
if (grad_weight.defined()) {
TORCH_CHECK(grad_weight.is_contiguous(), "grad_weight needs to be contiguous");
}
if (grad_bias.defined()) {
TORCH_CHECK(grad_bias.is_contiguous(), "grad_bias needs to be contiguous");
TORCH_CHECK(ones.is_contiguous(), "ones needs to be contiguous");
}
bool is_batch = false;
if (input.dim() == 4) {
// Force batch
is_batch = true;
input.resize_(
{1, input.size(0), input.size(1), input.size(2), input.size(3)});
grad_output.resize_({1,
grad_output.size(0),
grad_output.size(1),
grad_output.size(2),
grad_output.size(3)});
}
const int64_t input_width = input.size(4);
const int64_t input_height = input.size(3);
const int64_t input_depth = input.size(2);
const int64_t output_depth = (input_depth - 1) * stride_depth -
2 * padding_depth + (dilation_depth * (kernel_depth - 1) + 1) +
output_padding_depth;
const int64_t output_height = (input_height - 1) * stride_height -
2 * padding_height + (dilation_height * (kernel_height - 1) + 1) +
output_padding_height;
const int64_t output_width = (input_width - 1) * stride_width -
2 * padding_width + (dilation_width * (kernel_width - 1) + 1) +
output_padding_width;
// Batch size + input planes
const int64_t batch_size = input.size(0);
// Define a buffer of ones, for bias accumulation
if (ones.dim() != 3 ||
ones.size(0) * ones.size(1) * ones.size(2) <
output_depth * output_height * output_width) {
// Resize plane and fill with ones...
ones.resize_({output_depth, output_height, output_width});
ones.fill_(1);
}
// Resize temporary columns
columns.resize_({n_output_plane * kernel_width * kernel_height * kernel_depth,
input_depth * input_height * input_width});
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(),
"slow_conv_transpose3d_acc_grad_parameters_cpu",
[&] {
// Helpers
Tensor input_n;
Tensor grad_output_n;
scalar_t scale = static_cast<scalar_t>(scale_);
int64_t elt;
// For each elt in batch, do:
for (elt = 0; elt < batch_size; ++elt) {
// Matrix multiply per output:
grad_output_n = grad_output.select(0, elt);
// Do Weight:
if (grad_weight.defined()) {
// Matrix multiply per output:
input_n = input.select(0, elt);
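// As in the backward pass, vol2col is skipped when it would be the
// identity (1x1x1 kernel, unit stride/dilation, zero padding).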
if (kernel_depth != 1 || kernel_height != 1 || kernel_width != 1 ||
stride_depth != 1 || stride_height != 1 || stride_width != 1 ||
dilation_depth != 1 || dilation_height != 1 ||
dilation_width != 1 || padding_depth != 0 ||
padding_height != 0 || padding_width != 0) {
// Extract columns:
at::native::vol2col<scalar_t>(
grad_output_n.data_ptr<scalar_t>(),
n_output_plane,
output_depth,
output_height,
output_width,
input_depth,
input_height,
input_width,
kernel_depth,
kernel_height,
kernel_width,
padding_depth,
padding_height,
padding_width,
stride_depth,
stride_height,
stride_width,
dilation_depth,
dilation_height,
dilation_width,
columns.data_ptr<scalar_t>());
}
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
const int64_t n = columns.size(0); // n_output_plane * kt * kh * kw
const int64_t m = input_n.size(0); // n_input_plane
const int64_t k = columns.size(1); // input_depth * input_height * input_width
// Do GEMM (note: this is a bit confusing because gemm assumes
// column-major matrices)
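// In row-major terms this accumulates
//   grad_weight (n_input_plane x n_output_plane*kd*kh*kw) +=
//     scale * input_n @ columns^T
// (with grad_output_n standing in for columns on the fast path).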
auto gemm_in_ptr =
(kernel_depth != 1 || kernel_height != 1 || kernel_width != 1 ||
stride_depth != 1 || stride_height != 1 || stride_width != 1 ||
dilation_depth != 1 || dilation_height != 1 ||
dilation_width != 1 || padding_depth != 0 ||
padding_height != 0 || padding_width != 0)
? columns.data_ptr<scalar_t>()
: grad_output_n.data_ptr<scalar_t>();
cpublas::gemm(
cpublas::Transpose,
cpublas::NoTranspose,
n,
m,
k,
scale,
gemm_in_ptr,
k,
input_n.data_ptr<scalar_t>(),
k,
1,
grad_weight.data_ptr<scalar_t>(),
n);
}
// Do Bias:
if (grad_bias.defined()) {
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
const int64_t m_ = n_output_plane;
const int64_t k_ = output_depth * output_height * output_width;
// Do GEMV (note: this is a bit confusing because gemv assumes
// column-major matrices)
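// In row-major terms this accumulates
//   grad_bias += scale * grad_output_n @ ones,
// i.e. each bias gradient is the sum of its channel's gradient over all
// output spatial locations.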
native::gemv<scalar_t>(
't',
k_,
m_,
scale,
grad_output_n.data_ptr<scalar_t>(),
k_,
ones.data_ptr<scalar_t>(),
1,
1,
grad_bias.data_ptr<scalar_t>(),
1);
}
}
// Resize
if (is_batch) {
grad_output.resize_(
{n_output_plane, output_depth, output_height, output_width});
input.resize_(
{input.size(1), input_depth, input_height, input_width});
}
});
}
} // namespace
Tensor& slow_conv_transpose3d_out_cpu(const Tensor& input,
const Tensor& weight,
IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef output_padding,
IntArrayRef dilation,
Tensor& output) {
// See [Note: hacky wrapper removal for optional tensor]
const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
Tensor finput = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
Tensor fgrad = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
slow_conv_transpose3d_out_cpu_template(
output,
input,
weight,
kernel_size,
bias,
stride,
padding,
output_padding,
dilation,
finput,
fgrad);
return output;
}
Tensor slow_conv_transpose3d_cpu(
const Tensor& input,
const Tensor& weight,
IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef output_padding,
IntArrayRef dilation) {
// See [Note: hacky wrapper removal for optional tensor]
const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
Tensor output = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
Tensor finput = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
Tensor fgrad = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
slow_conv_transpose3d_out_cpu_template(
output,
input,
weight,
kernel_size,
bias,
stride,
padding,
output_padding,
dilation,
finput,
fgrad);
return output;
}
std::tuple<Tensor&, Tensor&, Tensor&> slow_conv_transpose3d_backward_out_cpu(const Tensor& grad_output,
const Tensor& input,
const Tensor& weight,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef output_padding,
IntArrayRef dilation,
const Tensor& finput,
const Tensor& fgrad,
Tensor& grad_input,
Tensor& grad_weight,
Tensor& grad_bias) {
if (grad_input.defined()) {
slow_conv_transpose3d_backward_out_cpu_template(
input,
grad_output,
grad_input,
weight,
finput,
fgrad,
kernel_size,
stride,
padding,
output_padding,
dilation);
}
if (grad_weight.defined()) {
grad_weight.resize_(weight.sizes());
grad_weight.zero_();
}
if (grad_bias.defined()) {
grad_bias.resize_({weight.size(1)});
grad_bias.zero_();
}
if (grad_weight.defined() || grad_bias.defined()) {
slow_conv_transpose3d_acc_grad_parameters_cpu(
input,
grad_output,
grad_weight,
grad_bias,
finput,
fgrad,
kernel_size,
stride,
padding,
output_padding,
dilation,
1);
}
return std::tuple<Tensor&, Tensor&, Tensor&>(
grad_input, grad_weight, grad_bias);
}
std::tuple<Tensor, Tensor, Tensor> slow_conv_transpose3d_backward_cpu(
const Tensor& grad_output,
const Tensor& input,
const Tensor& weight,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef output_padding,
IntArrayRef dilation,
const Tensor& finput,
const Tensor& fgrad,
std::array<bool, 3> output_mask) {
Tensor grad_input;
Tensor grad_weight;
Tensor grad_bias;
if (output_mask[0]) {
grad_input = at::empty({0}, grad_output.options());
} else {
grad_input = Tensor();
}
if (output_mask[1]) {
grad_weight = at::empty({0}, grad_output.options());
} else {
grad_weight = Tensor();
}
if (output_mask[2]) {
grad_bias = at::empty({0}, grad_output.options());
} else {
grad_bias = Tensor();
}
if (grad_input.defined()) {
slow_conv_transpose3d_backward_out_cpu_template(
input,
grad_output,
grad_input,
weight,
finput,
fgrad,
kernel_size,
stride,
padding,
output_padding,
dilation);
}
if (grad_weight.defined()) {
grad_weight.resize_(weight.sizes());
grad_weight.zero_();
}
if (grad_bias.defined()) {
grad_bias.resize_({weight.size(1)});
grad_bias.zero_();
}
if (grad_weight.defined() || grad_bias.defined()) {
slow_conv_transpose3d_acc_grad_parameters_cpu(
input,
grad_output,
grad_weight,
grad_bias,
finput,
fgrad,
kernel_size,
stride,
padding,
output_padding,
dilation,
1);
}
return std::tuple<Tensor, Tensor, Tensor>(grad_input, grad_weight, grad_bias);
}
} // namespace native
} // namespace at