caffe2/operators/bucketize_op.cu - platform/external/pytorch - Git at Google

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/bucketize_op.h"

 #include <thrust/binary_search.h>
 #include <thrust/device_vector.h>

 namespace caffe2 {

 __global__ void BucketizeOpKernel(
     const int N,
     const int M,
     const float* bounds,
     const float* X,
     int32_t* out) {
   CUDA_1D_KERNEL_LOOP(i, N) {
     int32_t low = -1, high = M;
     while (high - low > 1) {
       const int32_t median = low + (high - low) / 2;
       if (bounds[median] < X[i]) {
         low = median;
       } else {
         high = median;
       }
     }
     out[i] = high;
   }
 }

 template <>
 bool BucketizeOp<CUDAContext>::RunOnDevice() {
   auto& input = Input(X);
   CAFFE_ENFORCE_GE(input.dim(), 1);

   auto N = input.numel();
   auto* output = Output(INDICES, input.sizes(), at::dtype<int32_t>());
   const auto* input_data = input.template data<float>();
   auto* output_data = output->template mutable_data<int32_t>();

   BucketizeOpKernel<<<
       CAFFE_GET_BLOCKS(N),
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
       N,
       boundaries_device_.numel(),
       boundaries_device_.data<float>(),
       input_data,
       output_data);
   C10_CUDA_KERNEL_LAUNCH_CHECK();

   return true;
 };

 REGISTER_CUDA_OPERATOR(Bucketize, BucketizeOp<CUDAContext>);
 } // namespace caffe2

 using BucketizeCUDA = caffe2::BucketizeOp<caffe2::CUDAContext>;

 C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(
     Bucketize,
     BucketizeCUDA);
	#include "caffe2/core/context_gpu.h"
	#include "caffe2/operators/bucketize_op.h"

	#include <thrust/binary_search.h>
	#include <thrust/device_vector.h>

	namespace caffe2 {

	__global__ void BucketizeOpKernel(
	const int N,
	const int M,
	const float* bounds,
	const float* X,
	int32_t* out) {
	CUDA_1D_KERNEL_LOOP(i, N) {
	int32_t low = -1, high = M;
	while (high - low > 1) {
	const int32_t median = low + (high - low) / 2;
	if (bounds[median] < X[i]) {
	low = median;
	} else {
	high = median;
	}
	}
	out[i] = high;
	}
	}

	template <>
	bool BucketizeOp<CUDAContext>::RunOnDevice() {
	auto& input = Input(X);
	CAFFE_ENFORCE_GE(input.dim(), 1);

	auto N = input.numel();
	auto* output = Output(INDICES, input.sizes(), at::dtype<int32_t>());
	const auto* input_data = input.template data<float>();
	auto* output_data = output->template mutable_data<int32_t>();

	BucketizeOpKernel<<<
	CAFFE_GET_BLOCKS(N),
	CAFFE_CUDA_NUM_THREADS,
	0,
	context_.cuda_stream()>>>(
	N,
	boundaries_device_.numel(),
	boundaries_device_.data<float>(),
	input_data,
	output_data);
	C10_CUDA_KERNEL_LAUNCH_CHECK();

	return true;
	};

	REGISTER_CUDA_OPERATOR(Bucketize, BucketizeOp<CUDAContext>);
	} // namespace caffe2

	using BucketizeCUDA = caffe2::BucketizeOp<caffe2::CUDAContext>;

	C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(
	Bucketize,
	BucketizeCUDA);