| /** |
| * Copyright (c) 2016-present, Facebook, Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <math.h> |
| #include <cfloat> |
| // TODO(jamesreed): I would use <cmath> here but std::isnan |
| // and std::isinf are declared constexpr there and the nvidia |
| // compiler throws an error because of it |
| |
| #include <thrust/device_vector.h> |
| #include <thrust/sequence.h> |
| #include <thrust/sort.h> |
| #include <thrust/system/cuda/execution_policy.h> |
| #include <thrust/unique.h> |
| #include "caffe2/core/context_gpu.h" |
| #include "utility_ops.h" |
| |
| namespace caffe2 { |
| CAFFE_KNOWN_TYPE(const float*); |
| |
| REGISTER_CUDA_OPERATOR(EnsureDense, EnsureDenseOp<CUDAContext>); |
| |
| |
| __global__ void NanCheckKernel(int N, const float* X, bool* result) { |
| bool has_nan = false; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
    // Note: there is no point in returning early. The only case in which we
    // could skip inspecting the remaining elements is the failing case, and
    // there is no need to optimize the path that is already failing.
| has_nan = has_nan || isnan(X[i]) || isinf(X[i]); |
| } |
| __syncthreads(); |
| if (has_nan) { |
| result[0] = true; |
| } |
| } |
| |
| template <> |
| bool NanCheckOp<CUDAContext>::RunOnDevice() { |
| auto& X = Input(0); |
| auto* Y = Output(0); |
| const size_t N = X.size(); |
| const float* data_ptr = X.data<float>(); |
| |
| scratch_.Resize(1); |
| math::Set<bool, CUDAContext>( |
| 1, false, scratch_.mutable_data<bool>(), &context_); |
| NanCheckKernel<<< |
| CAFFE_GET_BLOCKS(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>( |
      N, data_ptr, scratch_.mutable_data<bool>());
| |
| bool result = false; |
| { |
| std::lock_guard<std::mutex> lock(CUDAContext::mutex()); |
| CUDA_ENFORCE(cudaMemcpyAsync( |
| &result, |
| scratch_.raw_data(), |
| 1, |
| cudaMemcpyDefault, |
| context_.cuda_stream())); |
| } |
| // Note: we must synchronize here so we can inspect the result |
| context_.FinishDeviceComputation(); |
| |
| // Print out diagnostic info if we have a NaN or inf |
| if (result) { |
| std::cerr << "Tensor contained NaN or inf: " << this->debug_def().input(0) |
| << std::endl; |
| |
| for (int j = 0; j < InputSize(); j++) { |
| TensorCPU cpu_X; |
| cpu_X.ResizeLike(Input(j)); |
      // Hack to force the allocation to happen here, so it won't happen
      // inside CopyFrom. We need to hold the mutex for the copy because
      // copies between host and GPU appear to be able to deadlock with NCCL.
| cpu_X.mutable_data<float>(); |
| |
| { |
| std::lock_guard<std::mutex> lock(CUDAContext::mutex()); |
| cpu_X.CopyFrom(Input(j), &context_); |
| } |
| context_.FinishDeviceComputation(); |
| std::cerr << "Input tensor: " << j << ": [" << this->debug_def().input(j) |
| << "]" << std::endl; |
| tensorPrinter_.Print<float>(cpu_X); |
| |
| if (j == 0) { |
| std::cerr << "NaN idxs:" << std::endl; |
| auto* cpu_X_data = cpu_X.data<float>(); |
| for (size_t i = 0; i < cpu_X.size(); ++i) { |
| if (isnan(cpu_X_data[i]) || isinf(cpu_X_data[i])) { |
| std::cerr << i << " "; |
| } |
| } |
| } |
| std::cerr << std::endl; |
| } |
| return false; |
| } |
| |
  // This op should act as an identity op if we don't find any NaNs/infs.
  // Copy over the data if we are not running in-place.
| if (&X != Y) { |
| Y->CopyFrom(X, &context_); |
| } |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(NanCheck, NanCheckOp<CUDAContext>); |
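// A minimal host-side sketch of how NanCheck could be exercised from C++ via
// the standard Caffe2 OperatorDef/Workspace API; the blob name "X" and the
// in-place binding are illustrative choices, not requirements:
//
//   OperatorDef def;
//   def.set_type("NanCheck");
//   def.add_input("X");
//   def.add_output("X");  // may run in place
//   def.mutable_device_option()->set_device_type(CUDA);
//   Workspace ws;
//   // ... create and fill the "X" blob with a TensorCUDA ...
//   auto op = CreateOperator(def, &ws);
//   bool ok = op->Run();  // false if a NaN/inf was found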
| |
| __global__ void |
| ElwiseMaxKernel(const float* X, const float* Y, float* maxout, const int N) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| maxout[i] = max(X[i], Y[i]); |
| } |
| } |
| |
| template <> |
| bool MaxOp<float, CUDAContext>::Compute() { |
| float* output_data = Output(0)->mutable_data<float>(); |
| const int N = Input(0).size(); |
| |
  // Run pairwise maxes. Output(0) is expected to already hold a copy of
  // Input(0) (set up before Compute() is called), so the accumulation starts
  // from the second input and reads the running result from Output(0).
  for (int i = 1; i < InputSize(); ++i) {
    ElwiseMaxKernel<<<
        CAFFE_GET_BLOCKS(N),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        Output(0)->data<float>(),
        Input(i).data<float>(),
        output_data,
        N);
| } |
| |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(Max, MaxOp<float, CUDAContext>); |
| REGISTER_CUDA_OPERATOR(MaxGradient, MaxGradientOp<float, CUDAContext>); |
| |
| template <typename T> |
| __global__ void |
| MaxGradKernel(int N, const T* mx, const T* x, const T* go, T* gi) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| gi[i] = go[i] * (mx[i] == x[i]); |
| } |
| } |
| |
| template <> |
| bool MaxGradientOp<float, CUDAContext>::RunOnDevice() { |
| auto& output = Input(0); |
| auto& grad_output = Input(1); |
| const int kInputStartOffset = 2; |
| |
| for (int i = 0; i < OutputSize(); i++) { |
| auto& input = Input(i + kInputStartOffset); |
| auto* grad_input = Output(i); |
| grad_input->ResizeLike(input); |
| MaxGradKernel<<< |
| CAFFE_GET_BLOCKS(input.size()), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>( |
| input.size(), |
| output.data<float>(), |
| input.data<float>(), |
| grad_output.data<float>(), |
| grad_input->mutable_data<float>()); |
| } |
| return true; |
| } |
| |
template <typename T_INDEX>
__global__ void GatherKernel(
    const float* X,
    float* Y,
    const T_INDEX* indices,
    const int N,
    const int block_size) {
| for (int i = blockIdx.x; i < N; i += gridDim.x) { |
| T_INDEX idx = indices[i]; |
| const float* src_offset = X + idx * block_size; |
| float* dst_offset = Y + i * block_size; |
| for (int j = threadIdx.x; j < block_size; j += blockDim.x) { |
| dst_offset[j] = src_offset[j]; |
| } |
| } |
| } |
| |
| template <> |
| bool GatherOp<CUDAContext>::RunOnDevice() { |
  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
| this, OperatorBase::Input<TensorCUDA>(INDICES)); |
| } |
| |
| template <> |
| template <typename Index> |
| bool GatherOp<CUDAContext>::DoRunWithType() { |
| auto& data = Input(DATA); |
| auto& indices = Input(INDICES); |
| auto* output = Output(0); |
| |
| CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); |
| auto shape = indices.dims(); |
| shape.insert(shape.end(), data.dims().begin() + 1, data.dims().end()); |
| output->Resize(shape); |
| |
| int block_size = data.size() / data.dim(0); |
| auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize(); |
| CAFFE_ENFORCE( |
| block_bytesize == data.nbytes() / data.dim(0), |
| "block_bytesize should be consistent with data dim"); |
| int N = indices.size(); |
| |
| auto src_base = static_cast<const float*>(data.raw_data()); |
| const Index* idxs = indices.template data<Index>(); |
| auto out = static_cast<float*>(output->raw_mutable_data(data.meta())); |
| |
  // Return early when the input is empty, since launching the CUDA kernel
  // with an empty grid would fail.
| if (N <= 0) { |
| return true; |
| } |
| |
| GatherKernel<<< |
| std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>( |
| src_base, out, idxs, N, block_size |
| ); |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(Gather, GatherOp<CUDAContext>); |
| |
| /** |
| * @brief Update slices of Y in-place with a batch of weighted X's. |
| * Y[idx] = alpha[b] * X[b][i] + Y[idx] |
| * i=0,...,N-1 |
| * b=0,...,B-1 |
| * idx=Indices[i] |
| */ |
| template <typename T_INDEX> |
| __global__ void AxpySliceKernel( |
| const float* weight0, |
| const TIndex N, |
| const TIndex B, |
| const TIndex slice_size, |
| const float** alpha, |
| const float** X, |
| const T_INDEX* Indices, |
| float* Y, |
| const TIndex M) { |
| // This implementation requires that the first weight is 1.0 |
| CUDA_KERNEL_ASSERT(weight0[0] == 1.0); |
| for (int i = blockIdx.x; i < N; i += gridDim.x) { |
| T_INDEX idx = Indices[i]; |
| float* y_offset = Y + (idx * slice_size); |
| for (int b = 0; b < B; b++) { |
| float a = *alpha[b]; |
| const float* x_offset = X[b] + (i * slice_size); |
| for (int j = threadIdx.x; j < slice_size; j += blockDim.x) { |
| atomicAdd(&y_offset[j], a * x_offset[j]); |
| } |
| } |
| } |
| } |
| |
| template <> |
| bool ScatterWeightedSumOp<float, CUDAContext>::RunOnDevice() { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(2)); |
| } |
| |
| template <> |
| template <typename Index> |
bool ScatterWeightedSumOp<float, CUDAContext>::DoRunWithType() {
| CAFFE_ENFORCE_EQ(InputSize() % 2, 1); |
| auto& X0 = Input(0); |
| auto& weight0 = Input(1); |
| auto& indices = Input(2); |
| auto* output = Output(0); |
| |
| CAFFE_ENFORCE_EQ(&X0, output, "In place operation is required"); |
| CAFFE_ENFORCE_GT(X0.size(), 0); |
  CAFFE_ENFORCE_GT(X0.ndim(), 0, "X0 has to be at least 1-D (a vector)");
| CAFFE_ENFORCE_EQ(weight0.size(), 1); |
| |
| TIndex M = X0.size(); |
| TIndex N = X0.dim(0); |
| TIndex K = indices.size(); |
| TIndex block_size = M / N; |
| |
  float* data = output->template mutable_data<float>();
| |
| // In order to have all device pointers of x_i (and weight_i similarly) |
| // consecutively in device memory, copy pointers to a host vector and then |
| // copy back into a device array. |
| const TIndex B = (InputSize() - 3) / 2; |
| x_data_host_.Resize(B); |
| weights_host_.Resize(B); |
| x_data_device_.Resize(B); |
| weights_device_.Resize(B); |
| |
| const float** x_data_host = x_data_host_.mutable_data<const float*>(); |
| const float** weights_host = weights_host_.mutable_data<const float*>(); |
| const float** x_data_device = x_data_device_.mutable_data<const float*>(); |
| const float** weights_device = weights_device_.mutable_data<const float*>(); |
| |
| for (int inp = 3; inp < InputSize(); inp += 2) { |
| int idx = (inp - 3) / 2; |
| x_data_host[idx] = static_cast<const float*>(Input(inp).raw_data()); |
| weights_host[idx] = static_cast<const float*>(Input(inp + 1).raw_data()); |
| } |
| context_.Copy<const float*, CPUContext, CUDAContext>( |
| B, x_data_host, x_data_device); |
| context_.Copy<const float*, CPUContext, CUDAContext>( |
| B, weights_host, weights_device); |
| |
| AxpySliceKernel<<< |
| std::min<TIndex>(K, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>( |
| weight0.template data<float>(), |
| K, |
| B, |
| block_size, |
| weights_device, |
| x_data_device, |
| indices.template data<Index>(), |
| data, |
| M); |
| |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR( |
| ScatterWeightedSum, |
| ScatterWeightedSumOp<float, CUDAContext>); |
| |
| #if THRUST_VERSION >= 100800 |
| __global__ void remap_kernel( |
| thrust::device_ptr<int> second_order, |
| thrust::device_ptr<int> order, |
| int* output, |
| int N, |
| int K) { |
| int i = blockDim.x * blockIdx.x + threadIdx.x; |
| if (i >= K) |
| return; |
| int idx = second_order[i]; |
| output[order[idx]] = i; |
  // TODO: this per-thread tail loop is serial; it could perhaps be rewritten
  // as a CUDA 1D kernel loop for more parallelism.
| for (idx++; idx < N && (i == K - 1 || idx != second_order[i + 1]); idx++) { |
| output[order[idx]] = i; |
| } |
| return; |
| } |
| |
| template <> |
| template <typename T> |
| void UniqueOp<CUDAContext>::DoRun() { |
| auto& inputTensor = Input(0); |
  CAFFE_ENFORCE_EQ(inputTensor.ndim(), 1, "Input should be a vector");
  // Use dim32 to enforce that the size fits in an int, so the remapping can
  // safely use int indices.
  int N = inputTensor.dim32(0);
| auto* uniqueTensor = Output(UNIQUE); |
| |
| int* remapping = nullptr; |
| if (REMAPPING < OutputSize()) { |
| auto* remappingTensor = Output(REMAPPING); |
| remappingTensor->ResizeLike(inputTensor); |
| remapping = remappingTensor->template mutable_data<int>(); |
| } |
| |
| const T* input = inputTensor.template data<T>(); |
| thrust_unique_buffer_.Resize(N); |
| auto* buffer = thrust_unique_buffer_.template mutable_data<T>(); |
| context_.template CopyItems<CUDAContext, CUDAContext>( |
| inputTensor.meta(), N, input, buffer); |
| |
| // Create two vectors of {0, 1, ..., N-1} on CUDA device |
| thrust::device_vector<int> order1(N), order2(N); |
| thrust::sequence( |
| thrust::cuda::par.on(context_.cuda_stream()), |
| order1.begin(), |
| order1.end()); |
| thrust::sequence( |
| thrust::cuda::par.on(context_.cuda_stream()), |
| order2.begin(), |
| order2.end()); |
| |
  // Sort the input along with the order vector so we know where each element
  // has been permuted to. For example:
| // input1 = 1,3,5,1,5,7,9 |
| // order1 = 0,1,2,3,4,5,6 |
| // Now we have: |
| // output = 1,1,3,5,5,7,9 |
| // order1 = 0,3,1,2,4,5,6 |
| thrust::sort_by_key( |
| thrust::cuda::par.on(context_.cuda_stream()), |
| buffer, |
| buffer + N, |
| order1.begin()); |
| |
  // Run unique_by_key on the sorted buffer to get another order vector.
  // For example:
| // input2 = 1,1,3,5,5,7,9 |
| // order2 = 0,1,2,3,4,5,6 |
| // Now we have: |
| // output = 1,3,5,7,9 |
| // order2 = 0,2,3,5,6 |
| auto new_last = thrust::unique_by_key( |
| thrust::cuda::par.on(context_.cuda_stream()), |
| buffer, |
| buffer + N, |
| order2.begin()); |
| int K = new_last.first - buffer; |
| |
| uniqueTensor->Resize(K); |
| T* unique = uniqueTensor->template mutable_data<T>(); |
| context_.template CopyItems<CUDAContext, CUDAContext>( |
| thrust_unique_buffer_.meta(), K, buffer, unique); |
| |
  // Compute the remapping. For example, for the number 1: from order2[0] and
  // order2[1] we know that input2[0:2) are all 1, so they are all remapped to
  // 0 in the final output. From order1 we know where those elements came from
  // in the original input, i.e. where to write that 0.
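  // Continuing the running example above: unique = 1,3,5,7,9, and the
  // remapping for the original input 1,3,5,1,5,7,9 comes out as 0,1,2,0,2,3,4.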
| if (remapping != nullptr) { |
| // record remap |
| remap_kernel<<< |
| CAFFE_GET_BLOCKS(K), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>( |
| order2.data(), order1.data(), remapping, N, K); |
| } |
| } |
| REGISTER_CUDA_OPERATOR(Unique, UniqueOp<CUDAContext>); |
| #endif // THRUST_VERSION >= 100800 |
| |
| REGISTER_CUDA_OPERATOR(Size, SizeOp<CUDAContext>); |
| |
| template <typename T> |
| __global__ void RangeKernel(const int n, T* Y, T offset, T step) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| Y[index] = index * step + offset; |
| } |
| } |
| |
| template <> |
| template <typename T> |
| bool RangeOp<CUDAContext>::DoRunOnDevice( |
| const T& start, |
| const T& step, |
| Tensor<CUDAContext>* output) { |
| int N = output->size(); |
| RangeKernel<<< |
| CAFFE_GET_BLOCKS(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>(N, output->mutable_data<T>(), start, step); |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(Range, RangeOp<CUDAContext>); |
| } // namespace caffe2 |