/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "caffe2/operators/unique_ops.h"

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/unique.h>
#include <thrust/version.h>
#include "caffe2/core/context_gpu.h"

namespace caffe2 {

#if THRUST_VERSION >= 100800
namespace {
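// Writes, for every element of the original input, the index of its value in
// the unique output. second_order[i] is the position in the sorted array
// where the run of the i-th unique value begins, and order[idx] is the
// original position of the idx-th sorted element. One thread handles one
// unique value and serially fills in its entire run of duplicates.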
__global__ void remap_kernel(
    thrust::device_ptr<int> second_order,
    thrust::device_ptr<int> order,
    int* output,
    int N,
    int K) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i >= K)
    return;
  int idx = second_order[i];
  output[order[idx]] = i;
  // Advance through the rest of this value's run in the sorted array; the run
  // ends at the start of the next unique value (or at N for the last value).
  // TODO: a 1D grid-stride loop over all N elements might balance work better
  // when runs have very uneven lengths.
  for (idx++; idx < N && (i == K - 1 || idx != second_order[i + 1]); idx++) {
    output[order[idx]] = i;
  }
}

} // namespace

template <>
template <typename T>
bool UniqueOp<CUDAContext>::DoRunWithType() {
  auto& inputTensor = Input(0);
  CAFFE_ENFORCE_EQ(inputTensor.dim(), 1, "Input should be a vector");
  // dim32 also enforces that the size fits in an int, so it is safe for the
  // remapping below to be of type int.
  int N = inputTensor.dim32(0);

  int* remapping = nullptr;
  // The REMAPPING output is optional; produce it only if the caller asked
  // for a second output.
  if (REMAPPING < OutputSize()) {
    auto* remappingTensor =
        Output(REMAPPING, inputTensor.sizes(), at::dtype<int>());
    remapping = remappingTensor->template mutable_data<int>();
  }

  if (N <= 0) {
    // If the input is empty there is nothing to do, not even a kernel launch;
    // just produce an empty UNIQUE output.
    /* auto* uniqueTensor = */ Output(UNIQUE, {0}, at::dtype<T>());
    return true;
  }

  const T* input = inputTensor.template data<T>();
  // Copy the input into a scratch buffer so that sorting does not mutate the
  // operator's input.
  ReinitializeTensor(&thrust_unique_buffer_, {N}, at::dtype<T>().device(CUDA));
  auto* buffer = thrust_unique_buffer_.template mutable_data<T>();
  context_.CopyItemsSameDevice(inputTensor.meta(), N, input, buffer);

  // Create two index vectors {0, 1, ..., N-1} on the CUDA device.
  thrust::device_vector<int> order1(N), order2(N);
  thrust::sequence(
      thrust::cuda::par.on(context_.cuda_stream()),
      order1.begin(),
      order1.end());
  thrust::sequence(
      thrust::cuda::par.on(context_.cuda_stream()),
      order2.begin(),
      order2.end());

  // Sort the input along with the order vector, so that order1 records where
  // each element of the sorted array came from in the original input.
  // For example:
  //    input1 = 1,3,5,1,5,7,9
  //    order1 = 0,1,2,3,4,5,6
  // After sorting we have:
  //    output = 1,1,3,5,5,7,9
  //    order1 = 0,3,1,2,4,5,6
  thrust::sort_by_key(
      thrust::cuda::par.on(context_.cuda_stream()),
      buffer,
      buffer + N,
      order1.begin());

  // Run unique_by_key on the now-sorted buffer to compact duplicate values;
  // the surviving order2 entries mark where each unique value's run begins
  // in the sorted array:
  //    input2 = 1,1,3,5,5,7,9
  //    order2 = 0,1,2,3,4,5,6
  // Now we have:
  //    output = 1,3,5,7,9
  //    order2 = 0,2,3,5,6
  auto new_last = thrust::unique_by_key(
      thrust::cuda::par.on(context_.cuda_stream()),
      buffer,
      buffer + N,
      order2.begin());
  // K is the number of unique elements.
  int K = new_last.first - buffer;

  auto* uniqueTensor = Output(UNIQUE, {K}, at::dtype<T>());
  T* unique = uniqueTensor->template mutable_data<T>();
  context_.CopyItemsSameDevice(thrust_unique_buffer_.meta(), K, buffer, unique);

  // Compute the remapping. For example, for the value 1, order2[0] = 0 and
  // order2[1] = 2 tell us that elements [0, 2) of the sorted array are all 1,
  // so they are all remapped to index 0 in the unique output. order1 gives
  // the original positions of those elements, which is where the remapped
  // index is written.
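  // Continuing the running example, input = 1,3,5,1,5,7,9 yields
  // unique = 1,3,5,7,9 and remapping = 0,1,2,0,2,3,4.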
  if (remapping != nullptr) {
    // Launch one thread per unique value; each thread fills in the remapped
    // index for its entire run of duplicates.
    remap_kernel<<<
        CAFFE_GET_BLOCKS(K),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        order2.data(), order1.data(), remapping, N, K);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
  return true;
}

REGISTER_CUDA_OPERATOR(Unique, UniqueOp<CUDAContext>);

#endif // THRUST_VERSION >= 100800
} // namespace caffe2