blob: 3f766dfb3df3307cab8f6361a8abf10b6248f4b6 [file] [log] [blame]
// copied and pasted from pytorch to test if this passes the build.
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/upsample_op.h"
#include "caffe2/utils/GpuAtomics.cuh"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
inline __device__ int idx(
const int n,
const int num_channels,
const int c,
const int height,
const int width,
const int y,
const int x) {
return ((n * num_channels + c) * height + y) * width + x;
}
// input is X, output is Y
__global__ void UpsampleBilinearKernel(
const int num_batch,
const int num_channels,
const int input_height,
const int input_width,
const int output_height,
const int output_width,
const float* __restrict__ X,
float* __restrict__ Y) {
const int size = output_height * output_width;
CUDA_1D_KERNEL_LOOP(index, size) {
int indexTemp = index;
const int out_x = indexTemp % output_width;
indexTemp /= output_width;
const int out_y = indexTemp % output_height;
indexTemp /= output_height;
indexTemp /= num_channels;
const float rheight =
output_height > 1 ? (input_height - 1.f) / (output_height - 1.f) : 0.f;
const float rwidth =
output_width > 1 ? (input_width - 1.f) / (output_width - 1.f) : 0.f;
// Compute Y axis lambdas
const float h1r = rheight * out_y;
const int h1 = (int)h1r;
const int h1p = (h1 < input_height - 1) ? 1 : 0;
const float h1lambda = h1r - h1;
const float h0lambda = 1.f - h1lambda;
// Compute X axis lambdas
const float w1r = rwidth * out_x;
const int w1 = (int)w1r;
const int w1p = (w1 < input_width - 1) ? 1 : 0;
const float w1lambda = w1r - w1;
const float w0lambda = 1.f - w1lambda;
for (int n = 0; n < num_batch; n++){
for (int c = 0; c < num_channels; c++) {
float X0 = X[idx(n, num_channels, c, input_height, input_width, h1, w1)];
float X1 = X[idx(n, num_channels, c, input_height, input_width, h1, w1 + w1p)];
float X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)];
float X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)];
Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
h0lambda * (w0lambda * X0 + w1lambda * X1) +
h1lambda * (w0lambda * X2 + w1lambda * X3);
}
}
}
}
// input is dY, output is dX
__global__ void UpsampleBilinearGradientKernel(
const int input_size,
const int num_channels,
const int input_height,
const int input_width,
const int output_height,
const int output_width,
const float* dY,
float* dX) {
CUDA_1D_KERNEL_LOOP(index, input_size) {
int indexTemp = index;
const int in_x = indexTemp % input_width;
indexTemp /= input_width;
const int in_y = indexTemp % input_height;
indexTemp /= input_height;
const int c = indexTemp % num_channels;
indexTemp /= num_channels;
const int n = indexTemp;
const float rheight =
output_height > 1 ? (output_height - 1.f) / (input_height - 1.f) : 0.f;
const float rwidth =
output_width > 1 ? (output_width - 1.f) / (input_width - 1.f) : 0.f;
// Compute Y axis lambdas
const float h1r = rheight * in_y;
const int h1 = (int)h1r;
const int h1p = (h1 < output_height - 1) ? 1 : 0;
const float h1lambda = h1r - h1;
const float h0lambda = 1.f - h1lambda;
// Compute X axis lambdas
const float w1r = rwidth * in_x;
const int w1 = (int)w1r;
const int w1p = (w1 < output_width - 1) ? 1 : 0;
const float w1lambda = w1r - w1;
const float w0lambda = 1.f - w1lambda;
#if __CUDA_ARCH__ >= 350
const float dYi = __ldg(&dY[index]);
#else
const float dYi = dY[index];
#endif
gpu_atomic_add(
&dX[idx(n, num_channels, c, output_height, output_width, h1, w1)],
h0lambda * w0lambda * dYi);
gpu_atomic_add(
&dX[idx(n, num_channels, c, output_height, output_width, h1, w1 + w1p)],
h0lambda * w1lambda * dYi);
gpu_atomic_add(
&dX[idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)],
h1lambda * w0lambda * dYi);
gpu_atomic_add(
&dX[idx(
n,
num_channels,
c,
output_height,
output_width,
h1 + h1p,
w1 + w1p)],
h1lambda * w1lambda * dYi);
}
}
} // namespace
template <>
bool UpsampleBilinearOp<float, CUDAContext>::RunOnDevice() {
const auto& X = Input(0);
const auto inputDims = X.sizes();
CAFFE_ENFORCE_EQ(4, inputDims.size());
const int batch_size = X.dim32(0), num_channels = X.dim32(1),
input_height = X.dim32(2), input_width = X.dim32(3);
if (InputSize() == 2) {
const auto& scales = Input(1);
CAFFE_ENFORCE_EQ(scales.dim(), 1);
CAFFE_ENFORCE_EQ(scales.numel(), 2);
float scales_data[2];
context_.CopyToCPU<float>(2, scales.data<float>(), scales_data);
height_scale_ = scales_data[0];
width_scale_ = scales_data[1];
}
int output_width = input_width * width_scale_;
int output_height = input_height * height_scale_;
auto* Y = Output(
0,
{batch_size, num_channels, output_height, output_width},
at::dtype<float>());
const auto size = output_height * output_width;
UpsampleBilinearKernel<<<
CAFFE_GET_BLOCKS(size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
batch_size,
num_channels,
input_height,
input_width,
output_height,
output_width,
X.data<float>(),
Y->template mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
template <>
bool UpsampleBilinearGradientOp<float, CUDAContext>::RunOnDevice() {
const auto& dY = Input(0);
const auto& X = Input(1);
const auto inputDims = dY.sizes();
CAFFE_ENFORCE_EQ(4, inputDims.size());
const int batch_size = dY.dim32(0);
const int num_channels = dY.dim32(1);
const int input_height = dY.dim32(2);
const int input_width = dY.dim32(3);
const int output_height = X.dim32(2);
const int output_width = X.dim32(3);
if (InputSize() == 3) {
const auto& scales = Input(2);
CAFFE_ENFORCE_EQ(scales.dim(), 1);
CAFFE_ENFORCE_EQ(scales.numel(), 2);
float scales_data[2];
context_.CopyToCPU<float>(2, scales.data<float>(), scales_data);
height_scale_ = scales_data[0];
width_scale_ = scales_data[1];
}
auto* dX = Output(
0,
{batch_size, num_channels, output_height, output_width},
at::dtype<float>());
math::Set<float, CUDAContext>(
dX->numel(), 0.0f, dX->mutable_data<float>(), &context_);
const auto size = dY.numel();
UpsampleBilinearGradientKernel<<<
CAFFE_GET_BLOCKS(size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
size,
num_channels,
input_height,
input_width,
output_height,
output_width,
dY.data<float>(),
dX->template mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
REGISTER_CUDA_OPERATOR(
UpsampleBilinear,
UpsampleBilinearOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
UpsampleBilinearGradient,
UpsampleBilinearGradientOp<float, CUDAContext>);
} // namespace caffe2