caffe2/operators/roi_pool_op.cu - platform/external/pytorch - Git at Google

 #include <cfloat>

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/roi_pool_op.h"
 #include "caffe2/utils/GpuAtomics.cuh"

 namespace caffe2 {

 namespace {

 template <typename T>
 __global__ void ROIPoolForward(
     const int nthreads,
     const T* bottom_data,
     const T spatial_scale,
     const int channels,
     const int height,
     const int width,
     const int pooled_height,
     const int pooled_width,
     const T* bottom_rois,
     T* top_data,
     int* argmax_data) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
     int c = (index / pooled_width / pooled_height) % channels;
     int n = index / pooled_width / pooled_height / channels;

     const T* offset_bottom_rois = bottom_rois + n * 5;
     int roi_batch_ind = offset_bottom_rois[0];
     int roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale);
     int roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale);
     int roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale);
     int roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale);

     // Force malformed ROIs to be 1x1
     int roi_width = max(roi_end_w - roi_start_w + 1, 1);
     int roi_height = max(roi_end_h - roi_start_h + 1, 1);
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

     int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
     int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
     int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
     int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));

     // Add roi offsets and clip to input boundaries
     hstart = min(max(hstart + roi_start_h, 0), height);
     hend = min(max(hend + roi_start_h, 0), height);
     wstart = min(max(wstart + roi_start_w, 0), width);
     wend = min(max(wend + roi_start_w, 0), width);
     bool is_empty = (hend <= hstart) || (wend <= wstart);

     // Define an empty pooling region to be zero
     T maxval = is_empty ? 0 : -FLT_MAX;
     // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
     int maxidx = -1;
     const T* offset_bottom_data =
         bottom_data + (roi_batch_ind * channels + c) * height * width;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         int bottom_index = h * width + w;
         if (offset_bottom_data[bottom_index] > maxval) {
           maxval = offset_bottom_data[bottom_index];
           maxidx = bottom_index;
         }
       }
     }
     top_data[index] = maxval;
     if (argmax_data) {
       argmax_data[index] = maxidx;
     }
   }
 }

 template <typename T>
 __global__ void ROIPoolBackward(
     const int nthreads,
     const T* top_diff,
     const int* argmax_data,
     const int num_rois,
     const T spatial_scale,
     const int channels,
     const int height,
     const int width,
     const int pooled_height,
     const int pooled_width,
     T* bottom_diff,
     const T* bottom_rois) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
     int c = (index / pooled_width / pooled_height) % channels;
     int n = index / pooled_width / pooled_height / channels;

     const T* offset_bottom_rois = bottom_rois + n * 5;
     int roi_batch_ind = offset_bottom_rois[0];
     int bottom_offset = (roi_batch_ind * channels + c) * height * width;
     int top_offset = (n * channels + c) * pooled_height * pooled_width;
     const T* offset_top_diff = top_diff + top_offset;
     T* offset_bottom_diff = bottom_diff + bottom_offset;
     const int* offset_argmax_data = argmax_data + top_offset;

     int argmax = offset_argmax_data[ph * pooled_width + pw];
     if (argmax != -1) {
       gpu_atomic_add(
           offset_bottom_diff + argmax,
           static_cast<T>(offset_top_diff[ph * pooled_width + pw]));
     }
   }
 }

 } // namespace

 template <>
 bool RoIPoolOp<float, CUDAContext>::RunOnDevice() {
   auto& X = Input(0); // Input data to pool
   auto& R = Input(1); // RoIs
   auto* Y = Output(0); // RoI pooled data
   auto* A = is_test_ ? nullptr : Output(1); // argmaxes

   // Handle empty rois
   if (R.numel() == 0) {
     Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
     // mutable_data calls are needed to allocate the tensors
     Y->template mutable_data<float>();
     if (!is_test_) {
       A->Resize(Y->sizes());
       A->template mutable_data<int>();
     }
     return true;
   }

   Y->Resize(R.dim32(0), X.dim32(1), pooled_height_, pooled_width_);
   if (!is_test_) {
     A->Resize(Y->sizes());
   }
   int output_size = Y->numel();
   int* argmax_data = is_test_ ? nullptr : A->template mutable_data<int>();
   ROIPoolForward<float>
       <<<CAFFE_GET_BLOCKS(output_size),
          CAFFE_CUDA_NUM_THREADS,
          0,
          context_.cuda_stream()>>>(
           output_size,
           X.data<float>(),
           spatial_scale_,
           X.dim32(1),
           X.dim32(2),
           X.dim32(3),
           pooled_height_,
           pooled_width_,
           R.data<float>(),
           Y->template mutable_data<float>(),
           argmax_data);
   C10_CUDA_KERNEL_LAUNCH_CHECK();

   return true;
 }

 template <>
 C10_EXPORT bool RoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
   auto& X = Input(0); // Input data to pool
   auto& R = Input(1); // RoIs
   auto& A = Input(2); // argmaxes
   auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op
   // (aka "gradOutput")

   auto* dX = Output(
       0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to
                                          // "forward" op (aka "gradInput")
   // Must zero-out dX before accumulating gradients
   math::Set<float, CUDAContext>(
       dX->numel(), 0.f, dX->template mutable_data<float>(), &context_);
   if (dY.numel() > 0) { // Handle possibly empty gradient if there were no rois
     ROIPoolBackward<float>
         <<<CAFFE_GET_BLOCKS(dY.numel()),
            CAFFE_CUDA_NUM_THREADS,
            0,
            context_.cuda_stream()>>>(
             dY.numel(),
             dY.data<float>(),
             A.data<int>(),
             R.dim32(0),
             spatial_scale_,
             X.dim32(1),
             X.dim32(2),
             X.dim32(3),
             pooled_height_,
             pooled_width_,
             dX->template mutable_data<float>(),
             R.data<float>());
     C10_CUDA_KERNEL_LAUNCH_CHECK();
   }
   return true;
 }

 REGISTER_CUDA_OPERATOR(RoIPool, RoIPoolOp<float, CUDAContext>);
 REGISTER_CUDA_OPERATOR(RoIPoolGradient, RoIPoolGradientOp<float, CUDAContext>);

 } // namespace caffe2
	#include <cfloat>

	#include "caffe2/core/context_gpu.h"
	#include "caffe2/operators/roi_pool_op.h"
	#include "caffe2/utils/GpuAtomics.cuh"

	namespace caffe2 {

	namespace {

	template <typename T>
	__global__ void ROIPoolForward(
	const int nthreads,
	const T* bottom_data,
	const T spatial_scale,
	const int channels,
	const int height,
	const int width,
	const int pooled_height,
	const int pooled_width,
	const T* bottom_rois,
	T* top_data,
	int* argmax_data) {
	CUDA_1D_KERNEL_LOOP(index, nthreads) {
	// (n, c, ph, pw) is an element in the pooled output
	int pw = index % pooled_width;
	int ph = (index / pooled_width) % pooled_height;
	int c = (index / pooled_width / pooled_height) % channels;
	int n = index / pooled_width / pooled_height / channels;

	const T* offset_bottom_rois = bottom_rois + n * 5;
	int roi_batch_ind = offset_bottom_rois[0];
	int roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale);
	int roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale);
	int roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale);
	int roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale);

	// Force malformed ROIs to be 1x1
	int roi_width = max(roi_end_w - roi_start_w + 1, 1);
	int roi_height = max(roi_end_h - roi_start_h + 1, 1);
	T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
	T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

	int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
	int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
	int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
	int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));

	// Add roi offsets and clip to input boundaries
	hstart = min(max(hstart + roi_start_h, 0), height);
	hend = min(max(hend + roi_start_h, 0), height);
	wstart = min(max(wstart + roi_start_w, 0), width);
	wend = min(max(wend + roi_start_w, 0), width);
	bool is_empty = (hend <= hstart) \|\| (wend <= wstart);

	// Define an empty pooling region to be zero
	T maxval = is_empty ? 0 : -FLT_MAX;
	// If nothing is pooled, argmax = -1 causes nothing to be backprop'd
	int maxidx = -1;
	const T* offset_bottom_data =
	bottom_data + (roi_batch_ind * channels + c) * height * width;
	for (int h = hstart; h < hend; ++h) {
	for (int w = wstart; w < wend; ++w) {
	int bottom_index = h * width + w;
	if (offset_bottom_data[bottom_index] > maxval) {
	maxval = offset_bottom_data[bottom_index];
	maxidx = bottom_index;
	}
	}
	}
	top_data[index] = maxval;
	if (argmax_data) {
	argmax_data[index] = maxidx;
	}
	}
	}

	template <typename T>
	__global__ void ROIPoolBackward(
	const int nthreads,
	const T* top_diff,
	const int* argmax_data,
	const int num_rois,
	const T spatial_scale,
	const int channels,
	const int height,
	const int width,
	const int pooled_height,
	const int pooled_width,
	T* bottom_diff,
	const T* bottom_rois) {
	CUDA_1D_KERNEL_LOOP(index, nthreads) {
	// (n, c, ph, pw) is an element in the pooled output
	int pw = index % pooled_width;
	int ph = (index / pooled_width) % pooled_height;
	int c = (index / pooled_width / pooled_height) % channels;
	int n = index / pooled_width / pooled_height / channels;

	const T* offset_bottom_rois = bottom_rois + n * 5;
	int roi_batch_ind = offset_bottom_rois[0];
	int bottom_offset = (roi_batch_ind * channels + c) * height * width;
	int top_offset = (n * channels + c) * pooled_height * pooled_width;
	const T* offset_top_diff = top_diff + top_offset;
	T* offset_bottom_diff = bottom_diff + bottom_offset;
	const int* offset_argmax_data = argmax_data + top_offset;

	int argmax = offset_argmax_data[ph * pooled_width + pw];
	if (argmax != -1) {
	gpu_atomic_add(
	offset_bottom_diff + argmax,
	static_cast<T>(offset_top_diff[ph * pooled_width + pw]));
	}
	}
	}

	} // namespace

	template <>
	bool RoIPoolOp<float, CUDAContext>::RunOnDevice() {
	auto& X = Input(0); // Input data to pool
	auto& R = Input(1); // RoIs
	auto* Y = Output(0); // RoI pooled data
	auto* A = is_test_ ? nullptr : Output(1); // argmaxes

	// Handle empty rois
	if (R.numel() == 0) {
	Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
	// mutable_data calls are needed to allocate the tensors
	Y->template mutable_data<float>();
	if (!is_test_) {
	A->Resize(Y->sizes());
	A->template mutable_data<int>();
	}
	return true;
	}

	Y->Resize(R.dim32(0), X.dim32(1), pooled_height_, pooled_width_);
	if (!is_test_) {
	A->Resize(Y->sizes());
	}
	int output_size = Y->numel();
	int* argmax_data = is_test_ ? nullptr : A->template mutable_data<int>();
	ROIPoolForward<float>
	<<<CAFFE_GET_BLOCKS(output_size),
	CAFFE_CUDA_NUM_THREADS,
	0,
	context_.cuda_stream()>>>(
	output_size,
	X.data<float>(),
	spatial_scale_,
	X.dim32(1),
	X.dim32(2),
	X.dim32(3),
	pooled_height_,
	pooled_width_,
	R.data<float>(),
	Y->template mutable_data<float>(),
	argmax_data);
	C10_CUDA_KERNEL_LAUNCH_CHECK();

	return true;
	}

	template <>
	C10_EXPORT bool RoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
	auto& X = Input(0); // Input data to pool
	auto& R = Input(1); // RoIs
	auto& A = Input(2); // argmaxes
	auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op
	// (aka "gradOutput")

	auto* dX = Output(
	0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to
	// "forward" op (aka "gradInput")
	// Must zero-out dX before accumulating gradients
	math::Set<float, CUDAContext>(
	dX->numel(), 0.f, dX->template mutable_data<float>(), &context_);
	if (dY.numel() > 0) { // Handle possibly empty gradient if there were no rois
	ROIPoolBackward<float>
	<<<CAFFE_GET_BLOCKS(dY.numel()),
	CAFFE_CUDA_NUM_THREADS,
	0,
	context_.cuda_stream()>>>(
	dY.numel(),
	dY.data<float>(),
	A.data<int>(),
	R.dim32(0),
	spatial_scale_,
	X.dim32(1),
	X.dim32(2),
	X.dim32(3),
	pooled_height_,
	pooled_width_,
	dX->template mutable_data<float>(),
	R.data<float>());
	C10_CUDA_KERNEL_LAUNCH_CHECK();
	}
	return true;
	}

	REGISTER_CUDA_OPERATOR(RoIPool, RoIPoolOp<float, CUDAContext>);
	REGISTER_CUDA_OPERATOR(RoIPoolGradient, RoIPoolGradientOp<float, CUDAContext>);

	} // namespace caffe2