|  | #include "caffe2/core/context_gpu.h" | 
|  | #include "caffe2/operators/integral_image_op.h" | 
|  |  | 
|  | namespace caffe2 { | 
|  |  | 
|  | namespace { | 
// Row pass of the integral image: each thread computes the horizontal
// (left-to-right) prefix sum of one output row.
// Input is (N, C, H, W); output is (N, C, H + 1, W + 1), shifted by (+1, +1)
// relative to the input so that the first row and first column of the output
// are all zeros. count = N * C * (H + 1): one flat index per output row.
__global__ void RowPassKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which row, channel, and batch element we're processing
    int row = i % rows_out;
    int chan = (i / rows_out) % chans;
    int ind = i / rows_out / chans;
    // Input is (H, W) and output is (H + 1, W + 1)
    int rows_in = rows_out - 1;
    int cols_in = cols_out - 1;
    // Row pointer to input data.
    // Input data is shift (-1, -1) relative to output data, hence row - 1.
    // For row == 0 this pointer is out of range, but that branch below never
    // dereferences it.
    const float* row_in_data =
        in + cols_in * ((row - 1) + rows_in * (chan + ind * chans));
    // Row pointer to output data
    float* row_out_data =
        out + cols_out * (row + rows_out * (chan + ind * chans));
    // The first row and first column of the output is all zeros.
    // Note: use `col` (not `i`) for the inner index so it does not shadow the
    // CUDA_1D_KERNEL_LOOP index, and float literals to avoid double math.
    row_out_data[0] = 0.f;
    if (row == 0) {
      for (int col = 1; col < cols_out; ++col) {
        row_out_data[col] = 0.f;
      }
    } else {
      for (int col = 1; col < cols_out; ++col) {
        // Recall that input data is shift (-1, -1) relative to the output,
        // hence col - 1
        row_out_data[col] = row_out_data[col - 1] + row_in_data[col - 1];
      }
    }
  }
}
|  |  | 
// Row pass of the gradient: each thread computes the horizontal inclusive
// prefix sum of one row of dY, reading only dY's first cols_out columns.
// Input is (N, C, H + 1, W + 1); output is (N, C, H + 1, W).
// count = N * C * (H + 1): one flat index per output row.
// NOTE(review): this accumulates left-to-right (a prefix sum); the analytic
// gradient of the forward row pass would be a right-to-left (suffix) sum —
// confirm this matches the CPU gradient implementation.
__global__ void RowPassGradientKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which row, channel, and batch element we're processing
    int row = i % rows_out;
    int chan = (i / rows_out) % chans;
    int ind = i / rows_out / chans;
    // Input is (H + 1, W + 1) and output is (H + 1, W)
    int rows_in = rows_out;
    int cols_in = cols_out + 1;
    // Row pointer to input data
    const float* row_in_data =
        in + cols_in * (row + rows_in * (chan + ind * chans));
    // Row pointer to output data
    float* row_out_data =
        out + cols_out * (row + rows_out * (chan + ind * chans));
    // Inclusive prefix sum along the row; `col` (not `i`) so the inner index
    // does not shadow the CUDA_1D_KERNEL_LOOP index.
    row_out_data[0] = row_in_data[0];
    for (int col = 1; col < cols_out; ++col) {
      row_out_data[col] = row_out_data[col - 1] + row_in_data[col];
    }
  }
}
|  |  | 
// Column pass of the integral image, run in place after RowPassKernel: each
// thread walks one column of the (H + 1, W + 1) output top-to-bottom, adding
// the previous row's value so every element becomes a full 2D prefix sum.
// count = N * C * (W + 1): one flat index per output column.
__global__ void
ColPassKernel(int count, int rows_out, int cols_out, int chans, float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which col, channel, and batch element we're processing
    int col = i % cols_out;
    int chan = (i / cols_out) % chans;
    int ind = i / cols_out / chans;
    // Pointer to the top (row 0) of this column
    float* col_out_data =
        out + col + cols_out * rows_out * (chan + ind * chans);
    // Accumulate down the column; row 0 is already correct (all zeros).
    // `r` (not `i`) so the inner index does not shadow the
    // CUDA_1D_KERNEL_LOOP index.
    for (int r = 1; r < rows_out; ++r) {
      col_out_data[r * cols_out] += col_out_data[(r - 1) * cols_out];
    }
  }
}
|  |  | 
// Column pass of the gradient: each thread computes the vertical inclusive
// prefix sum of one column of the row-pass buffer, reading only its first
// rows_out rows. Input is (N, C, H + 1, W); output is (N, C, H, W).
// count = N * C * W: one flat index per output column.
// NOTE(review): accumulates top-to-bottom (a prefix sum); the analytic
// gradient of the forward column pass would be a bottom-to-top (suffix) sum —
// confirm this matches the CPU gradient implementation.
__global__ void ColPassGradientKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which col, channel, and batch element we're processing
    int col = i % cols_out;
    int chan = (i / cols_out) % chans;
    int ind = i / cols_out / chans;
    // Input is (H + 1, W) and output is (H, W)
    int rows_in = rows_out + 1;
    int cols_in = cols_out;
    // Col pointer to input data
    const float* col_in_data =
        in + col + cols_in * rows_in * (chan + ind * chans);
    // Col pointer to output data
    float* col_out_data =
        out + col + cols_out * rows_out * (chan + ind * chans);
    // Inclusive prefix sum down the column; `r` (not `i`) so the inner index
    // does not shadow the CUDA_1D_KERNEL_LOOP index.
    col_out_data[0] = col_in_data[0];
    for (int r = 1; r < rows_out; ++r) {
      col_out_data[r * cols_out] =
          col_out_data[(r - 1) * cols_out] + col_in_data[r * cols_in];
    }
  }
}
|  |  | 
|  | } // namespace | 
|  |  | 
// Forward pass: computes the integral image of a 4D NCHW input.
// Output Y is (N, C, H + 1, W + 1) with a zero first row and first column;
// Y[n][c][r][col] is the sum of X[n][c] over rows < r and cols < col.
// Implemented as a row-wise prefix-sum kernel followed by an in-place
// column-wise prefix-sum kernel, both enqueued on this op's CUDA stream.
template <>
bool IntegralImageOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);

  // Fixed typo in the user-facing error message ("momement" -> "moment").
  CAFFE_ENFORCE(X.dim() == 4, "Only supports 4D tensors for the moment");

  // Input is (N, C, H, W)
  // Output is (N, C, H + 1, W + 1)
  vector<int64_t> out_shape(X.sizes().vec());
  out_shape[2] += 1; // H + 1 output size
  out_shape[3] += 1; // W + 1 output size
  auto* Y = Output(0, out_shape, at::dtype<float>());

  const int chans = X.dim32(1);
  const int rows_out = Y->dim32(2);
  const int cols_out = Y->dim32(3);
  // Integral image over rows of input X: one thread per output row
  const int row_pass_size = X.dim32(0) * chans * rows_out;
  RowPassKernel<<<
      CAFFE_GET_BLOCKS(row_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      row_pass_size,
      rows_out,
      cols_out,
      chans,
      X.data<float>(),
      Y->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Integral image over columns of the integral image over rows, computed
  // in place in Y: one thread per output column
  const int col_pass_size = X.dim32(0) * chans * cols_out;
  ColPassKernel<<<
      CAFFE_GET_BLOCKS(col_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      col_pass_size,
      rows_out,
      cols_out,
      chans,
      Y->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
|  |  | 
// Backward pass: maps dY of shape (N, C, H + 1, W + 1) to dX of shape
// (N, C, H, W), mirroring the forward op's two-pass structure via an
// intermediate row-pass buffer. Both kernels run on this op's CUDA stream.
template <>
bool IntegralImageGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Original input to "forward" op
  auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op
                       // (aka "gradOutput")

  auto* dX = Output(
      0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to
                                         // "forward" op (aka "gradInput")

  // The row pass shrinks dY from (N, C, H + 1, W + 1) to (N, C, H + 1, W);
  // the subsequent col pass shrinks that buffer to dX's shape (N, C, H, W).
  vector<int64_t> buffer_shape(dY.sizes().vec());
  buffer_shape[3] -= 1;
  ReinitializeTensor(
      &row_pass_buffer_, buffer_shape, at::dtype<float>().device(CUDA));
  const int num_chans = row_pass_buffer_.dim32(1);
  const int num_rows = row_pass_buffer_.dim32(2);
  const int num_cols = row_pass_buffer_.dim32(3);

  // Row-wise pass over dY: one thread per buffer row
  const int num_row_threads = X.dim32(0) * num_chans * num_rows;
  RowPassGradientKernel<<<
      CAFFE_GET_BLOCKS(num_row_threads),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      num_row_threads,
      num_rows,
      num_cols,
      num_chans,
      dY.data<float>(),
      row_pass_buffer_.mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Column-wise pass over the buffer, writing dX: one thread per column
  const int num_col_threads = X.dim32(0) * num_chans * num_cols;
  ColPassGradientKernel<<<
      CAFFE_GET_BLOCKS(num_col_threads),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      num_col_threads,
      num_rows - 1,
      num_cols,
      num_chans,
      row_pass_buffer_.data<float>(),
      dX->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
|  |  | 
// Register the CUDA implementations of the forward and gradient operators.
REGISTER_CUDA_OPERATOR(IntegralImage, IntegralImageOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    IntegralImageGradient,
    IntegralImageGradientOp<float, CUDAContext>);
|  |  | 
|  | } // namespace caffe2 |