/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "caffe2/operators/unique_ops.h"

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/unique.h>
#include <thrust/version.h>
#include "caffe2/core/context_gpu.h"

namespace caffe2 {

#if THRUST_VERSION >= 100800
namespace {
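// Writes, for every element of the original input, the index of its value in
// the unique output. second_order[i] is the position in the sorted array
// where the run of the i-th unique value begins, and order[idx] is the
// original position of the idx-th sorted element. One thread handles one
// unique value and serially fills in its entire run of duplicates.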
__global__ void remap_kernel(
    thrust::device_ptr<int> second_order,
    thrust::device_ptr<int> order,
    int* output,
    int N,
    int K) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i >= K)
    return;
  int idx = second_order[i];
  output[order[idx]] = i;
  // Advance through the rest of this value's run in the sorted array; the run
  // ends at the start of the next unique value (or at N for the last value).
  // TODO: a 1D grid-stride loop over all N elements might balance work better
  // when runs have very uneven lengths.
  for (idx++; idx < N && (i == K - 1 || idx != second_order[i + 1]); idx++) {
    output[order[idx]] = i;
  }
}

} // namespace

template <>
template <typename T>
bool UniqueOp<CUDAContext>::DoRunWithType() {
  auto& inputTensor = Input(0);
  CAFFE_ENFORCE_EQ(inputTensor.dim(), 1, "Input should be a vector");
  // dim32 also enforces that the size fits in an int, so it is safe for the
  // remapping below to be of type int.
  int N = inputTensor.dim32(0);

  int* remapping = nullptr;
  // The REMAPPING output is optional; produce it only if the caller asked
  // for a second output.
  if (REMAPPING < OutputSize()) {
    auto* remappingTensor =
        Output(REMAPPING, inputTensor.sizes(), at::dtype<int>());
    remapping = remappingTensor->template mutable_data<int>();
  }

  if (N <= 0) {
    // If the input is empty there is nothing to do, not even a kernel launch;
    // just produce an empty UNIQUE output.
    /* auto* uniqueTensor = */ Output(UNIQUE, {0}, at::dtype<T>());
    return true;
  }

  const T* input = inputTensor.template data<T>();
  // Copy the input into a scratch buffer so that sorting does not mutate the
  // operator's input.
  ReinitializeTensor(&thrust_unique_buffer_, {N}, at::dtype<T>().device(CUDA));
  auto* buffer = thrust_unique_buffer_.template mutable_data<T>();
  context_.CopyItemsSameDevice(inputTensor.meta(), N, input, buffer);

  // Create two index vectors {0, 1, ..., N-1} on the CUDA device.
  thrust::device_vector<int> order1(N), order2(N);
  thrust::sequence(
      thrust::cuda::par.on(context_.cuda_stream()),
      order1.begin(),
      order1.end());
  thrust::sequence(
      thrust::cuda::par.on(context_.cuda_stream()),
      order2.begin(),
      order2.end());

  // Sort the input along with the order vector, so that order1 records where
  // each element of the sorted array came from in the original input.
  // For example:
  //    input1 = 1,3,5,1,5,7,9
  //    order1 = 0,1,2,3,4,5,6
  // After sorting we have:
  //    output = 1,1,3,5,5,7,9
  //    order1 = 0,3,1,2,4,5,6
  thrust::sort_by_key(
      thrust::cuda::par.on(context_.cuda_stream()),
      buffer,
      buffer + N,
      order1.begin());

  // Run unique_by_key on the now-sorted buffer to compact duplicate values;
  // the surviving order2 entries mark where each unique value's run begins
  // in the sorted array:
  //    input2 = 1,1,3,5,5,7,9
  //    order2 = 0,1,2,3,4,5,6
  // Now we have:
  //    output = 1,3,5,7,9
  //    order2 = 0,2,3,5,6
  auto new_last = thrust::unique_by_key(
      thrust::cuda::par.on(context_.cuda_stream()),
      buffer,
      buffer + N,
      order2.begin());
  // K is the number of unique elements.
  int K = new_last.first - buffer;

  auto* uniqueTensor = Output(UNIQUE, {K}, at::dtype<T>());
  T* unique = uniqueTensor->template mutable_data<T>();
  context_.CopyItemsSameDevice(thrust_unique_buffer_.meta(), K, buffer, unique);

  // Compute the remapping. For example, for the value 1, order2[0] = 0 and
  // order2[1] = 2 tell us that elements [0, 2) of the sorted array are all 1,
  // so they are all remapped to index 0 in the unique output. order1 gives
  // the original positions of those elements, which is where the remapped
  // index is written.
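  // Continuing the running example, input = 1,3,5,1,5,7,9 yields
  // unique = 1,3,5,7,9 and remapping = 0,1,2,0,2,3,4.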
  if (remapping != nullptr) {
    // Launch one thread per unique value; each thread fills in the remapped
    // index for its entire run of duplicates.
    remap_kernel<<<
        CAFFE_GET_BLOCKS(K),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        order2.data(), order1.data(), remapping, N, K);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
  return true;
}

REGISTER_CUDA_OPERATOR(Unique, UniqueOp<CUDAContext>);

#endif // THRUST_VERSION >= 100800
} // namespace caffe2