// blob: 33e607acfdcb3da727a2c7d63e5768e09bbe8b4f [file] [log] [blame]
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/operators/unique_ops.h"
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/unique.h>
#include <thrust/version.h>
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
#if THRUST_VERSION >= 100800
namespace {
/**
 * Writes, for every input position, the index of its value in the unique
 * output.
 *
 * Launch layout: 1-D grid of 1-D blocks. Any grid size is valid because the
 * kernel walks the K unique entries with a grid-stride loop; no shared memory
 * is used.
 *
 * @param second_order K entries; second_order[i] is the offset, in the sorted
 *                     sequence, at which the run of the i-th unique value
 *                     starts. Expected to be strictly increasing.
 * @param order        N entries; order[j] is the original position of the
 *                     j-th element of the sorted sequence.
 * @param output       N entries; output[p] receives the unique index of the
 *                     element originally at position p.
 * @param N            number of input elements.
 * @param K            number of unique values (K <= N).
 */
__global__ void remap_kernel(
    thrust::device_ptr<int> second_order,
    thrust::device_ptr<int> order,
    int* output,
    int N,
    int K) {
  // Grid-stride loop: the launch helper may cap the number of blocks, so a
  // single "one thread per unique value" pass could leave the tail of the
  // remapping unwritten for large K. The loop index is 64-bit so that adding
  // the stride cannot overflow when K is close to INT_MAX.
  const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
  for (long long u =
           static_cast<long long>(blockDim.x) * blockIdx.x + threadIdx.x;
       u < K;
       u += stride) {
    const int i = static_cast<int>(u);
    // The run of the i-th unique value covers sorted offsets [begin, end).
    const int begin = second_order[i];
    // The last run extends to the end of the sorted sequence; every other run
    // stops where the next one starts. Read once instead of per iteration.
    int end = N;
    if (i + 1 < K) {
      end = second_order[i + 1];
    }
    for (int idx = begin; idx < end; ++idx) {
      output[order[idx]] = i;
    }
  }
}
} // namespace
/**
 * CUDA implementation of Unique for element type T.
 *
 * Reads the 1-D tensor Input(0), writes its distinct values in sorted order
 * to Output(UNIQUE) and, when the REMAPPING output is requested, writes an
 * int tensor of the input's shape holding, for every input element, the index
 * of its value inside Output(UNIQUE).
 *
 * All device work is issued on context_.cuda_stream().
 *
 * @return true on success; validation failures are raised by CAFFE_ENFORCE.
 */
template <>
template <typename T>
bool UniqueOp<CUDAContext>::DoRunWithType() {
  auto& inputTensor = Input(0);
  // Check the rank before reading dimension 0 so that a non-vector input is
  // reported with this message rather than by the dimension accessor.
  CAFFE_ENFORCE_EQ(inputTensor.dim(), 1, "Input should be a vector");
  // use dim32 to enforce that it's fine to have remapping of type int
  int N = inputTensor.dim32(0);

  // The remapping output is optional; allocate it only when requested.
  int* remapping = nullptr;
  if (REMAPPING < OutputSize()) {
    auto* remappingTensor =
        Output(REMAPPING, inputTensor.sizes(), at::dtype<int>());
    remapping = remappingTensor->template mutable_data<int>();
  }

  if (N <= 0) {
    // if the input is empty, we have nothing to do, not even launch kernel.
    /* auto* uniqueTensor = */ Output(UNIQUE, {0}, at::dtype<T>());
    return true;
  }

  // Work on a copy of the input, since sorting and deduplication are done in
  // place on the buffer.
  const T* input = inputTensor.template data<T>();
  ReinitializeTensor(&thrust_unique_buffer_, {N}, at::dtype<T>().device(CUDA));
  auto* buffer = thrust_unique_buffer_.template mutable_data<T>();
  context_.CopyItemsSameDevice(inputTensor.meta(), N, input, buffer);

  // Create two vectors of {0, 1, ..., N-1} on CUDA device
  // NOTE(review): order1/order2 are released when this function returns,
  // while the remap kernel below is only enqueued on the stream. This relies
  // on the device_vector deallocation not freeing the memory before the
  // kernel has finished — confirm for the allocator in use.
  thrust::device_vector<int> order1(N), order2(N);
  thrust::sequence(
      thrust::cuda::par.on(context_.cuda_stream()),
      order1.begin(),
      order1.end());
  thrust::sequence(
      thrust::cuda::par.on(context_.cuda_stream()),
      order2.begin(),
      order2.end());

  // Sort the input along with the order vector, so that we know where each
  // element was permuted from. For example:
  //   input1 = 1,3,5,1,5,7,9
  //   order1 = 0,1,2,3,4,5,6
  // Now we have:
  //   output = 1,1,3,5,5,7,9
  //   order1 = 0,3,1,2,4,5,6
  thrust::sort_by_key(
      thrust::cuda::par.on(context_.cuda_stream()),
      buffer,
      buffer + N,
      order1.begin());

  // Apply a subsequent unique op to get another order buffer:
  //   input2 = 1,1,3,5,5,7,9
  //   order2 = 0,1,2,3,4,5,6
  // Now we have:
  //   output = 1,3,5,7,9
  //   order2 = 0,2,3,5,6
  auto new_last = thrust::unique_by_key(
      thrust::cuda::par.on(context_.cuda_stream()),
      buffer,
      buffer + N,
      order2.begin());
  // Number of unique values: distance from the buffer start to the new end of
  // the key range.
  int K = new_last.first - buffer;

  auto* uniqueTensor = Output(UNIQUE, {K}, at::dtype<T>());
  T* unique = uniqueTensor->template mutable_data<T>();
  context_.CopyItemsSameDevice(thrust_unique_buffer_.meta(), K, buffer, unique);

  // Compute the remapping. For example, for the number 1, if we look at
  // order2[0] and order2[1], we know that input2[0:2) are all 1. They are all
  // remapped to 0 in the final output. And from order1, we know where they
  // come from. The rest is easy.
  if (remapping != nullptr) {
    // record remap
    remap_kernel<<<
        CAFFE_GET_BLOCKS(K),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        order2.data(), order1.data(), remapping, N, K);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
  return true;
}
// Register UniqueOp<CUDAContext> as the CUDA operator named Unique.
REGISTER_CUDA_OPERATOR(Unique, UniqueOp<CUDAContext>);
#endif // THRUST_VERSION >= 100800
} // namespace caffe2