#include <cub/block/block_reduce.cuh>
#include "caffe2/utils/cub_namespace.cuh"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/rmac_regions_op.h"
#if defined(USE_ROCM)
#include <cfloat>
#endif
#if defined(USE_ROCM)
namespace rocprim {
#else
#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE()
namespace at_cuda_detail {
#endif
namespace cub {
#endif
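// Comparator for cub::KeyValuePair, injected into the cub (or rocprim)
// namespace so that argument-dependent lookup finds it when cub::Min()
// compares pairs in the block reductions below: the smaller value wins,
// with ties broken in favor of the larger key.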
template <typename KeyT, typename ValueT>
inline __host__ __device__ bool operator<(
const cub::KeyValuePair<KeyT, ValueT>& kv1,
const cub::KeyValuePair<KeyT, ValueT>& kv2) {
return (kv1.value < kv2.value) ||
(kv1.value == kv2.value && kv2.key < kv1.key);
}
} // namespace cub (rocprim when building for ROCm)
#if !defined(USE_ROCM) && USE_GLOBAL_CUB_WRAPPED_NAMESPACE()
} // namespace at_cuda_detail
#endif
namespace caffe2 {
namespace {
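// Computes the RMAC region-grid parameters for a W x H feature map, following
// the fixed-grid scheme of R-MAC (arXiv:1511.05879): pick the step count
// whose induced overlap between neighboring regions is closest to `overlap`,
// then count the total number of regions over all `scales`. Writes
// {num_rois, Wd, Hd} into num_rois_data.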
__global__ void NumRMACRegionsKernel(
const int W,
const int H,
const int min_step,
const int max_step,
const float overlap,
const int scales,
int* num_rois_data) {
// Choose the number of regions (steps) along the long dimension
typedef cub::KeyValuePair<int, float> KeyValuePair; // <step, value>
KeyValuePair kv, min_kv;
min_kv.value = FLT_MAX;
// Thread-local arg-min over this thread's candidate step counts
int minW = min(H, W);
int diff = max(H, W) - minW;
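// For a candidate step count kv.key, consecutive regions of side ~minW are
// spaced b = diff / kv.key apart, so the achieved overlap fraction is
// (minW^2 - minW * b) / minW^2 = 1 - b / minW; score each candidate by its
// distance to the target `overlap`.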
CUDA_1D_KERNEL_LOOP(index, max_step - min_step + 1) {
kv.key = min_step + index;
float b = diff / (1.0 * kv.key);
kv.value = fabsf((minW * minW - minW * b) / (minW * minW) - overlap);
if (kv < min_kv) {
min_kv = kv;
}
}
// Block-wise arg-min reduction to find step
int step;
{
typedef cub::BlockReduce<KeyValuePair, CAFFE_CUDA_NUM_THREADS> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
min_kv = BlockReduce(temp_storage).Reduce(min_kv, cub::Min());
__shared__ int step_shared;
if (threadIdx.x == 0) {
step_shared = min_kv.key;
}
__syncthreads();
step = step_shared;
}
// Extra regions ("overplus") along the longer dimension only
int Wd = (W > H) ? step : 0;
int Hd = (H > W) ? step : 0;
// Local reduction to compute the total number of rois at all scales
int num_rois = 0;
CUDA_1D_KERNEL_LOOP(index, scales) {
int l = index + 1;
int region_size = 2 * minW / (l + 1);
num_rois += (region_size > 0) ? ((l + Wd) * (l + Hd)) : 0;
}
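// Example: a square input gives Wd = Hd = 0, so scale l contributes l * l
// regions (when region_size > 0); with scales = 3 that is 1 + 4 + 9 = 14
// RoIs per image.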
// Block-wise sum reduction to compute num_rois at all scales
{
typedef cub::BlockReduce<int, CAFFE_CUDA_NUM_THREADS> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
num_rois = BlockReduce(temp_storage).Sum(num_rois);
}
if (threadIdx.x == 0) {
num_rois_data[0] = num_rois;
num_rois_data[1] = Wd;
num_rois_data[2] = Hd;
}
}
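// Emits one RoI per (batch, region) pair as [batch_id, x1, y1, x2, y2]: the
// linear index is decomposed into a batch id, a scale l, and a cell (i, j)
// of the (l + Wd) x (l + Hd) grid at that scale; the square region is then
// clipped to the feature-map borders.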
__global__ void RMACRegionsKernel(
const int W,
const int H,
const int N,
const int* num_rois_data,
float* output) {
int num_rois = num_rois_data[0];
int Wd = num_rois_data[1];
int Hd = num_rois_data[2];
// Block-wide temp shared storage for intermediate ROI results to avoid
// uncoalesced writes to global mem
__shared__ float output_shared[CAFFE_CUDA_NUM_THREADS * 5];
CUDA_1D_KERNEL_LOOP(index, N) {
int batch_id = index / num_rois;
int roi_id = index % num_rois;
int roi[5];
roi[0] = batch_id;
// Find the scale corresponding to this index and the roi_id relative
// to the scale.
int l = 0;
int num_rois_at_scale = 0;
do {
roi_id -= num_rois_at_scale;
l++;
num_rois_at_scale = (l + Wd) * (l + Hd);
} while (roi_id - num_rois_at_scale >= 0);
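// Loop exit guarantees 0 <= roi_id < (l + Wd) * (l + Hd), i.e. roi_id is
// now the linear cell index within the grid at scale l.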
// Region side length at scale l, and the corner-to-corner strides (bw, bh)
// that spread the grid across the feature map
int region_size = 2 * min(H, W) / (l + 1);
float bw =
(l + Wd - 1 > 0) ? ((W - region_size) / (1.0 * (l + Wd - 1))) : 0;
float bh =
(l + Hd - 1 > 0) ? ((H - region_size) / (1.0 * (l + Hd - 1))) : 0;
// Grid cell of this RoI: i runs along the width, j along the height
int i = roi_id / (l + Hd);
int j = roi_id % (l + Hd);
roi[1] = bw * i;
roi[2] = bh * j;
// Careful with the borders
if (roi[1] + region_size > W) {
roi[1] -= (roi[1] + region_size - W);
}
if (roi[2] + region_size > H) {
roi[2] -= (roi[2] + region_size - H);
}
roi[3] = roi[1] + region_size - 1;
roi[4] = roi[2] + region_size - 1;
// Writing directly to output (global memory) will result in uncoalesced
// writes. Write output to shared mem first and then write ROI results to
// global output in a coalesced manner.
__syncthreads(); // Since output_shared is reused across loop iterations
for (int k = 0; k < 5; ++k) {
output_shared[threadIdx.x * 5 + k] = roi[k];
}
__syncthreads();
int offset = index - threadIdx.x;
float* output_offset = output + offset * 5;
int num_threads = min(blockDim.x, N - offset); // Active threads in block
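// Copy the flat 5 * num_threads shared buffer to global memory: on each
// iteration the active threads store num_threads consecutive floats, so the
// writes coalesce (unlike the per-thread roi[5] rows written directly).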
for (int k = 0; k < 5; ++k) {
output_offset[num_threads * k + threadIdx.x] =
output_shared[num_threads * k + threadIdx.x];
}
}
}
} // namespace
template <>
bool RMACRegionsOp<CUDAContext>::RunOnDevice() {
const auto& X = Input(0); // Input feature map (NCHW); output is the RoIs
if (X.numel() == 0) {
return true;
}
int batch_size = X.dim32(0);
int H = X.dim32(2);
int W = X.dim32(3);
// Compute number of regions
int min_step = 1;
int max_step = 6;
// num_rois_ holds {num_rois, Wd, Hd}, computed on device
ReinitializeTensor(&num_rois_, {3}, at::dtype<int>().device(CUDA));
NumRMACRegionsKernel<<<
1,
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
W,
H,
min_step,
max_step,
overlap_,
scales_,
num_rois_.mutable_data<int>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
// A bit awkward, but the size of the output tensor depends on the result of
// NumRMACRegionsKernel (the number of RoIs), so that value has to be copied
// to the CPU before the output can be sized.
int num_rois = 0;
context_.CopyBytesToCPU(sizeof(int), num_rois_.data<int>(), &num_rois);
int N = batch_size * num_rois;
// One output row per RoI: [batch_id, x1, y1, x2, y2]
auto* output = Output(0, {N, 5}, at::dtype<float>());
// Compute region coordinates
RMACRegionsKernel<<<
CAFFE_GET_BLOCKS(N),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
W, H, N, num_rois_.data<int>(), output->template mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
REGISTER_CUDA_OPERATOR(RMACRegions, RMACRegionsOp<CUDAContext>);
} // namespace caffe2