caffe2/operators/upsample_op.cu - platform/external/pytorch - Git at Google

 // copied and pasted from pytorch to test if this passes the build.

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/upsample_op.h"
 #include "caffe2/utils/GpuAtomics.cuh"
 #include "caffe2/utils/math.h"

 namespace caffe2 {

 namespace {

 inline __device__ int idx(
     const int n,
     const int num_channels,
     const int c,
     const int height,
     const int width,
     const int y,
     const int x) {
   return ((n * num_channels + c) * height + y) * width + x;
 }

 // input is X, output is Y
 __global__ void UpsampleBilinearKernel(
     const int num_batch,
     const int num_channels,
     const int input_height,
     const int input_width,
     const int output_height,
     const int output_width,
     const float* __restrict__ X,
     float* __restrict__ Y) {


     const int size = output_height * output_width;
     CUDA_1D_KERNEL_LOOP(index, size) {
     int indexTemp = index;
     const int out_x = indexTemp % output_width;
     indexTemp /= output_width;
     const int out_y = indexTemp % output_height;
     indexTemp /= output_height;
     indexTemp /= num_channels;

     const float rheight =
          output_height > 1 ? (input_height - 1.f) / (output_height - 1.f) : 0.f;
     const float rwidth =
          output_width > 1 ? (input_width - 1.f) / (output_width - 1.f) : 0.f;

     // Compute Y axis lambdas
     const float h1r = rheight * out_y;
     const int h1 = (int)h1r;
     const int h1p = (h1 < input_height - 1) ? 1 : 0;
     const float h1lambda = h1r - h1;
     const float h0lambda = 1.f - h1lambda;

     // Compute X axis lambdas
     const float w1r = rwidth * out_x;
     const int w1 = (int)w1r;
     const int w1p = (w1 < input_width - 1) ? 1 : 0;
     const float w1lambda = w1r - w1;
     const float w0lambda = 1.f - w1lambda;

     for (int n = 0; n < num_batch; n++){
       for (int c = 0; c < num_channels; c++) {

         float X0 = X[idx(n, num_channels, c, input_height, input_width, h1, w1)];
         float X1 = X[idx(n, num_channels, c, input_height, input_width, h1, w1 + w1p)];
         float X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)];
         float X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)];

         Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
                     h0lambda * (w0lambda * X0 + w1lambda * X1) +
                     h1lambda * (w0lambda * X2 + w1lambda * X3);
       }
     }
   }
 }
 // input is dY, output is dX
 __global__ void UpsampleBilinearGradientKernel(
     const int input_size,
     const int num_channels,
     const int input_height,
     const int input_width,
     const int output_height,
     const int output_width,
     const float* dY,
     float* dX) {
   CUDA_1D_KERNEL_LOOP(index, input_size) {
     int indexTemp = index;
     const int in_x = indexTemp % input_width;
     indexTemp /= input_width;
     const int in_y = indexTemp % input_height;
     indexTemp /= input_height;
     const int c = indexTemp % num_channels;
     indexTemp /= num_channels;
     const int n = indexTemp;

     const float rheight =
         output_height > 1 ? (output_height - 1.f) / (input_height - 1.f) : 0.f;
     const float rwidth =
         output_width > 1 ? (output_width - 1.f) / (input_width - 1.f) : 0.f;

     // Compute Y axis lambdas
     const float h1r = rheight * in_y;
     const int h1 = (int)h1r;
     const int h1p = (h1 < output_height - 1) ? 1 : 0;
     const float h1lambda = h1r - h1;
     const float h0lambda = 1.f - h1lambda;

     // Compute X axis lambdas
     const float w1r = rwidth * in_x;
     const int w1 = (int)w1r;
     const int w1p = (w1 < output_width - 1) ? 1 : 0;
     const float w1lambda = w1r - w1;
     const float w0lambda = 1.f - w1lambda;

 #if __CUDA_ARCH__ >= 350
     const float dYi = __ldg(&dY[index]);
 #else
     const float dYi = dY[index];
 #endif

     gpu_atomic_add(
         &dX[idx(n, num_channels, c, output_height, output_width, h1, w1)],
         h0lambda * w0lambda * dYi);
     gpu_atomic_add(
         &dX[idx(n, num_channels, c, output_height, output_width, h1, w1 + w1p)],
         h0lambda * w1lambda * dYi);
     gpu_atomic_add(
         &dX[idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)],
         h1lambda * w0lambda * dYi);
     gpu_atomic_add(
         &dX[idx(
             n,
             num_channels,
             c,
             output_height,
             output_width,
             h1 + h1p,
             w1 + w1p)],
         h1lambda * w1lambda * dYi);
   }
 }

 } // namespace

 template <>
 bool UpsampleBilinearOp<float, CUDAContext>::RunOnDevice() {
   const auto& X = Input(0);

   const auto inputDims = X.sizes();
   CAFFE_ENFORCE_EQ(4, inputDims.size());
   const int batch_size = X.dim32(0), num_channels = X.dim32(1),
             input_height = X.dim32(2), input_width = X.dim32(3);
   if (InputSize() == 2) {
     const auto& scales = Input(1);
     CAFFE_ENFORCE_EQ(scales.dim(), 1);
     CAFFE_ENFORCE_EQ(scales.numel(), 2);
     float scales_data[2];
     context_.CopyToCPU<float>(2, scales.data<float>(), scales_data);
     height_scale_ = scales_data[0];
     width_scale_ = scales_data[1];
   }
   int output_width = input_width * width_scale_;
   int output_height = input_height * height_scale_;
   auto* Y = Output(
       0,
       {batch_size, num_channels, output_height, output_width},
       at::dtype<float>());

   const auto size = output_height * output_width;
   UpsampleBilinearKernel<<<
       CAFFE_GET_BLOCKS(size),
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
       batch_size,
       num_channels,
       input_height,
       input_width,
       output_height,
       output_width,
       X.data<float>(),
       Y->template mutable_data<float>());
   C10_CUDA_KERNEL_LAUNCH_CHECK();

   return true;
 }
 template <>
 bool UpsampleBilinearGradientOp<float, CUDAContext>::RunOnDevice() {
   const auto& dY = Input(0);
   const auto& X = Input(1);
   const auto inputDims = dY.sizes();
   CAFFE_ENFORCE_EQ(4, inputDims.size());
   const int batch_size = dY.dim32(0);
   const int num_channels = dY.dim32(1);
   const int input_height = dY.dim32(2);
   const int input_width = dY.dim32(3);
   const int output_height = X.dim32(2);
   const int output_width = X.dim32(3);
   if (InputSize() == 3) {
     const auto& scales = Input(2);
     CAFFE_ENFORCE_EQ(scales.dim(), 1);
     CAFFE_ENFORCE_EQ(scales.numel(), 2);
     float scales_data[2];
     context_.CopyToCPU<float>(2, scales.data<float>(), scales_data);
     height_scale_ = scales_data[0];
     width_scale_ = scales_data[1];
   }
   auto* dX = Output(
       0,
       {batch_size, num_channels, output_height, output_width},
       at::dtype<float>());
   math::Set<float, CUDAContext>(
       dX->numel(), 0.0f, dX->mutable_data<float>(), &context_);
   const auto size = dY.numel();
   UpsampleBilinearGradientKernel<<<
       CAFFE_GET_BLOCKS(size),
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
       size,
       num_channels,
       input_height,
       input_width,
       output_height,
       output_width,
       dY.data<float>(),
       dX->template mutable_data<float>());
   C10_CUDA_KERNEL_LAUNCH_CHECK();

   return true;
 }
 REGISTER_CUDA_OPERATOR(
     UpsampleBilinear,
     UpsampleBilinearOp<float, CUDAContext>);
 REGISTER_CUDA_OPERATOR(
     UpsampleBilinearGradient,
     UpsampleBilinearGradientOp<float, CUDAContext>);
 } // namespace caffe2
	// copied and pasted from pytorch to test if this passes the build.

	#include "caffe2/core/context_gpu.h"
	#include "caffe2/operators/upsample_op.h"
	#include "caffe2/utils/GpuAtomics.cuh"
	#include "caffe2/utils/math.h"

	namespace caffe2 {

	namespace {

	inline __device__ int idx(
	const int n,
	const int num_channels,
	const int c,
	const int height,
	const int width,
	const int y,
	const int x) {
	return ((n * num_channels + c) * height + y) * width + x;
	}

	// input is X, output is Y
	__global__ void UpsampleBilinearKernel(
	const int num_batch,
	const int num_channels,
	const int input_height,
	const int input_width,
	const int output_height,
	const int output_width,
	const float* __restrict__ X,
	float* __restrict__ Y) {


	const int size = output_height * output_width;
	CUDA_1D_KERNEL_LOOP(index, size) {
	int indexTemp = index;
	const int out_x = indexTemp % output_width;
	indexTemp /= output_width;
	const int out_y = indexTemp % output_height;
	indexTemp /= output_height;
	indexTemp /= num_channels;

	const float rheight =
	output_height > 1 ? (input_height - 1.f) / (output_height - 1.f) : 0.f;
	const float rwidth =
	output_width > 1 ? (input_width - 1.f) / (output_width - 1.f) : 0.f;

	// Compute Y axis lambdas
	const float h1r = rheight * out_y;
	const int h1 = (int)h1r;
	const int h1p = (h1 < input_height - 1) ? 1 : 0;
	const float h1lambda = h1r - h1;
	const float h0lambda = 1.f - h1lambda;

	// Compute X axis lambdas
	const float w1r = rwidth * out_x;
	const int w1 = (int)w1r;
	const int w1p = (w1 < input_width - 1) ? 1 : 0;
	const float w1lambda = w1r - w1;
	const float w0lambda = 1.f - w1lambda;

	for (int n = 0; n < num_batch; n++){
	for (int c = 0; c < num_channels; c++) {

	float X0 = X[idx(n, num_channels, c, input_height, input_width, h1, w1)];
	float X1 = X[idx(n, num_channels, c, input_height, input_width, h1, w1 + w1p)];
	float X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)];
	float X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)];

	Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
	h0lambda * (w0lambda * X0 + w1lambda * X1) +
	h1lambda * (w0lambda * X2 + w1lambda * X3);
	}
	}
	}
	}
	// input is dY, output is dX
	__global__ void UpsampleBilinearGradientKernel(
	const int input_size,
	const int num_channels,
	const int input_height,
	const int input_width,
	const int output_height,
	const int output_width,
	const float* dY,
	float* dX) {
	CUDA_1D_KERNEL_LOOP(index, input_size) {
	int indexTemp = index;
	const int in_x = indexTemp % input_width;
	indexTemp /= input_width;
	const int in_y = indexTemp % input_height;
	indexTemp /= input_height;
	const int c = indexTemp % num_channels;
	indexTemp /= num_channels;
	const int n = indexTemp;

	const float rheight =
	output_height > 1 ? (output_height - 1.f) / (input_height - 1.f) : 0.f;
	const float rwidth =
	output_width > 1 ? (output_width - 1.f) / (input_width - 1.f) : 0.f;

	// Compute Y axis lambdas
	const float h1r = rheight * in_y;
	const int h1 = (int)h1r;
	const int h1p = (h1 < output_height - 1) ? 1 : 0;
	const float h1lambda = h1r - h1;
	const float h0lambda = 1.f - h1lambda;

	// Compute X axis lambdas
	const float w1r = rwidth * in_x;
	const int w1 = (int)w1r;
	const int w1p = (w1 < output_width - 1) ? 1 : 0;
	const float w1lambda = w1r - w1;
	const float w0lambda = 1.f - w1lambda;

	#if __CUDA_ARCH__ >= 350
	const float dYi = __ldg(&dY[index]);
	#else
	const float dYi = dY[index];
	#endif

	gpu_atomic_add(
	&dX[idx(n, num_channels, c, output_height, output_width, h1, w1)],
	h0lambda * w0lambda * dYi);
	gpu_atomic_add(
	&dX[idx(n, num_channels, c, output_height, output_width, h1, w1 + w1p)],
	h0lambda * w1lambda * dYi);
	gpu_atomic_add(
	&dX[idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)],
	h1lambda * w0lambda * dYi);
	gpu_atomic_add(
	&dX[idx(
	n,
	num_channels,
	c,
	output_height,
	output_width,
	h1 + h1p,
	w1 + w1p)],
	h1lambda * w1lambda * dYi);
	}
	}

	} // namespace

	template <>
	bool UpsampleBilinearOp<float, CUDAContext>::RunOnDevice() {
	const auto& X = Input(0);

	const auto inputDims = X.sizes();
	CAFFE_ENFORCE_EQ(4, inputDims.size());
	const int batch_size = X.dim32(0), num_channels = X.dim32(1),
	input_height = X.dim32(2), input_width = X.dim32(3);
	if (InputSize() == 2) {
	const auto& scales = Input(1);
	CAFFE_ENFORCE_EQ(scales.dim(), 1);
	CAFFE_ENFORCE_EQ(scales.numel(), 2);
	float scales_data[2];
	context_.CopyToCPU<float>(2, scales.data<float>(), scales_data);
	height_scale_ = scales_data[0];
	width_scale_ = scales_data[1];
	}
	int output_width = input_width * width_scale_;
	int output_height = input_height * height_scale_;
	auto* Y = Output(
	0,
	{batch_size, num_channels, output_height, output_width},
	at::dtype<float>());

	const auto size = output_height * output_width;
	UpsampleBilinearKernel<<<
	CAFFE_GET_BLOCKS(size),
	CAFFE_CUDA_NUM_THREADS,
	0,
	context_.cuda_stream()>>>(
	batch_size,
	num_channels,
	input_height,
	input_width,
	output_height,
	output_width,
	X.data<float>(),
	Y->template mutable_data<float>());
	C10_CUDA_KERNEL_LAUNCH_CHECK();

	return true;
	}
	template <>
	bool UpsampleBilinearGradientOp<float, CUDAContext>::RunOnDevice() {
	const auto& dY = Input(0);
	const auto& X = Input(1);
	const auto inputDims = dY.sizes();
	CAFFE_ENFORCE_EQ(4, inputDims.size());
	const int batch_size = dY.dim32(0);
	const int num_channels = dY.dim32(1);
	const int input_height = dY.dim32(2);
	const int input_width = dY.dim32(3);
	const int output_height = X.dim32(2);
	const int output_width = X.dim32(3);
	if (InputSize() == 3) {
	const auto& scales = Input(2);
	CAFFE_ENFORCE_EQ(scales.dim(), 1);
	CAFFE_ENFORCE_EQ(scales.numel(), 2);
	float scales_data[2];
	context_.CopyToCPU<float>(2, scales.data<float>(), scales_data);
	height_scale_ = scales_data[0];
	width_scale_ = scales_data[1];
	}
	auto* dX = Output(
	0,
	{batch_size, num_channels, output_height, output_width},
	at::dtype<float>());
	math::Set<float, CUDAContext>(
	dX->numel(), 0.0f, dX->mutable_data<float>(), &context_);
	const auto size = dY.numel();
	UpsampleBilinearGradientKernel<<<
	CAFFE_GET_BLOCKS(size),
	CAFFE_CUDA_NUM_THREADS,
	0,
	context_.cuda_stream()>>>(
	size,
	num_channels,
	input_height,
	input_width,
	output_height,
	output_width,
	dY.data<float>(),
	dX->template mutable_data<float>());
	C10_CUDA_KERNEL_LAUNCH_CHECK();

	return true;
	}
	REGISTER_CUDA_OPERATOR(
	UpsampleBilinear,
	UpsampleBilinearOp<float, CUDAContext>);
	REGISTER_CUDA_OPERATOR(
	UpsampleBilinearGradient,
	UpsampleBilinearGradientOp<float, CUDAContext>);
	} // namespace caffe2