| #include <cub/block/block_reduce.cuh> | 
 | #include "caffe2/core/context_gpu.h" | 
 | #include "caffe2/operators/find_op.h" | 
 | #include "caffe2/utils/cub_namespace.cuh" | 
 |  | 
 | namespace caffe2 { | 
 |  | 
 | template <typename T> | 
 | __global__ void FindKernel( | 
 |     int num_needles, | 
 |     int idx_size, | 
 |     const T* idx, | 
 |     const T* needles, | 
 |     int* out, | 
 |     int missing_value) { | 
 |   int needle_idx = blockIdx.x; // One cuda block per needle | 
 |   T q = needles[needle_idx]; | 
 |   int res = (-1); | 
 |   for (int j = threadIdx.x; j < idx_size; j += CAFFE_CUDA_NUM_THREADS) { | 
 |     if (idx[j] == q) { | 
 |       res = max(res, j); | 
 |     } | 
 |   } | 
 |   typedef cub::BlockReduce<int, CAFFE_CUDA_NUM_THREADS> BlockReduce; | 
 |   __shared__ typename BlockReduce::TempStorage temp_storage; | 
 |   int min_res = BlockReduce(temp_storage).Reduce(res, cub::Max()); | 
 |   if (threadIdx.x == 0) { | 
 |     out[needle_idx] = min_res == (-1) ? missing_value : min_res; | 
 |   } | 
 | } | 
 |  | 
 | template <> | 
 | template <typename T> | 
 | bool FindOp<CUDAContext>::DoRunWithType() { | 
 |   auto& idx = Input(0); | 
 |   auto& needles = Input(1); | 
 |  | 
 |   auto* res_indices = Output(0, needles.sizes(), at::dtype<int>()); | 
 |  | 
 |   const T* idx_data = idx.data<T>(); | 
 |   const T* needles_data = needles.data<T>(); | 
 |   int* res_data = res_indices->template mutable_data<int>(); | 
 |  | 
 |   FindKernel< | 
 |       T><<<needles.numel(), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>( | 
 |       needles.numel(), | 
 |       idx.numel(), | 
 |       idx_data, | 
 |       needles_data, | 
 |       res_data, | 
 |       missing_value_); | 
 |   C10_CUDA_KERNEL_LAUNCH_CHECK(); | 
 |  | 
 |   return true; | 
 | } | 
 |  | 
// Registers the CUDA implementation of the Find operator with Caffe2's
// operator registry.
REGISTER_CUDA_OPERATOR(Find, FindOp<CUDAContext>)
 |  | 
 | } // namespace caffe2 |