/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| |
| #include "caffe2/operators/unique_ops.h" |
| |
| #include <thrust/device_vector.h> |
| #include <thrust/sequence.h> |
| #include <thrust/sort.h> |
| #include <thrust/system/cuda/execution_policy.h> |
| #include <thrust/unique.h> |
| #include <thrust/version.h> |
| #include "caffe2/core/context_gpu.h" |
| |
| namespace caffe2 { |
| |
| #if THRUST_VERSION >= 100800 |
| namespace { |
// Computes the remapping from original input positions to unique-value
// indices, given the two permutations produced by the host-side sort/unique:
//   order        (order1): for each position in the *sorted* copy of the
//                input, the original-input index that element came from.
//   second_order (order2): for each of the K unique values, the position in
//                the sorted array where its run of duplicates begins.
// Thread i (one per unique value, 1-D grid of K threads) walks the run of
// duplicates for the i-th unique value in the sorted array and writes i into
// output[] at each element's original position, so that afterwards
// unique[output[j]] == input[j] for every j in [0, N).
__global__ void remap_kernel(
    thrust::device_ptr<int> second_order,
    thrust::device_ptr<int> order,
    int* output,
    int N,
    int K) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i >= K)
    return;
  // Start of the run of elements equal to the i-th unique value.
  int idx = second_order[i];
  output[order[idx]] = i;
  // Maybe cuda 1D kernel?
  // Walk the rest of this value's run: it ends where the next unique value's
  // run starts (second_order[i + 1]), or at N for the last unique value.
  for (idx++; idx < N && (i == K - 1 || idx != second_order[i + 1]); idx++) {
    output[order[idx]] = i;
  }
  return;
}
| |
| } // namespace |
| |
// Computes the unique elements of a 1-D input tensor on GPU.
//
// Strategy: copy the input, sort the copy together with an index vector
// (order1), then unique_by_key with a second index vector (order2). The
// surviving prefix of the buffer holds the K distinct values; order1/order2
// together let remap_kernel reconstruct, for every original position, the
// index of its value in the UNIQUE output.
//
// Outputs:
//   UNIQUE    - the K distinct input values, in ascending order.
//   REMAPPING - (optional, if a second output is requested) int32 tensor of
//               length N with UNIQUE[REMAPPING[j]] == input[j] for all j.
template <>
template <typename T>
bool UniqueOp<CUDAContext>::DoRunWithType() {
  auto& inputTensor = Input(0);
  // Validate the rank *before* touching dim32(0), so a non-vector input
  // fails with this message rather than inside the dim accessor.
  CAFFE_ENFORCE_EQ(inputTensor.dim(), 1, "Input should be a vector");
  // use dim32 to enforce that it's fine to have remapping of type int
  int N = inputTensor.dim32(0);

  int* remapping = nullptr;
  if (REMAPPING < OutputSize()) {
    auto* remappingTensor =
        Output(REMAPPING, inputTensor.sizes(), at::dtype<int>());
    remapping = remappingTensor->template mutable_data<int>();
  }

  if (N <= 0) {
    // if the input is empty, we have nothing to do, not even launch kernel.
    /* auto* uniqueTensor = */ Output(UNIQUE, {0}, at::dtype<T>());
    return true;
  }

  // Work on a scratch copy: the sort below is in-place and must not mutate
  // the operator's input.
  const T* input = inputTensor.template data<T>();
  ReinitializeTensor(&thrust_unique_buffer_, {N}, at::dtype<T>().device(CUDA));
  auto* buffer = thrust_unique_buffer_.template mutable_data<T>();
  context_.CopyItemsSameDevice(inputTensor.meta(), N, input, buffer);

  // Create two vectors of {0, 1, ..., N-1} on CUDA device.
  // NOTE(review): device_vector construction may allocate/initialize outside
  // context_.cuda_stream(); the explicit execution policies below keep the
  // thrust algorithms themselves on that stream — confirm if stream-ordering
  // of the allocation matters here.
  thrust::device_vector<int> order1(N), order2(N);
  thrust::sequence(
      thrust::cuda::par.on(context_.cuda_stream()),
      order1.begin(),
      order1.end());
  thrust::sequence(
      thrust::cuda::par.on(context_.cuda_stream()),
      order2.begin(),
      order2.end());

  // Sort the input along with order vector. So now we know where each element
  // is permutated to. For example:
  //    input1 = 1,3,5,1,5,7,9
  //    order1 = 0,1,2,3,4,5,6
  // Now we have:
  //    output = 1,1,3,5,5,7,9
  //    order1 = 0,3,1,2,4,5,6
  thrust::sort_by_key(
      thrust::cuda::par.on(context_.cuda_stream()),
      buffer,
      buffer + N,
      order1.begin());

  // Use consequent unique op to get another order_buffer
  //    input2 = 1,1,3,5,5,7,9
  //    order2 = 0,1,2,3,4,5,6
  // Now we have:
  //    output = 1,3,5,7,9
  //    order2 = 0,2,3,5,6
  auto new_last = thrust::unique_by_key(
      thrust::cuda::par.on(context_.cuda_stream()),
      buffer,
      buffer + N,
      order2.begin());
  // Number of distinct values = length of the surviving prefix.
  int K = new_last.first - buffer;

  auto* uniqueTensor = Output(UNIQUE, {K}, at::dtype<T>());
  T* unique = uniqueTensor->template mutable_data<T>();
  context_.CopyItemsSameDevice(thrust_unique_buffer_.meta(), K, buffer, unique);

  // Compute the remapping. For example, for the number 1, if we look at
  // order2[0] and order2[1], we know that input2[0:2) are all 1. They are all
  // remapped to 0 in final input. And from order1, we know where they come
  // from. The rest is easy.
  if (remapping != nullptr) {
    // record remap; one thread per unique value walks its run of duplicates.
    remap_kernel<<<
        CAFFE_GET_BLOCKS(K),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        order2.data(), order1.data(), remapping, N, K);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
  return true;
}
| |
// Register the CUDA implementation of the Unique operator (only compiled
// when Thrust >= 1.8.0 provides stream-aware execution policies).
REGISTER_CUDA_OPERATOR(Unique, UniqueOp<CUDAContext>);
| |
| #endif // THRUST_VERSION >= 100800 |
| } // namespace caffe2 |