#include <math.h>
#include <cfloat>
// TODO(jamesreed): I would use <cmath> here but std::isnan
// and std::isinf are declared constexpr there and the nvidia
// compiler throws an error because of it

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/unique.h>
#include "caffe2/core/context_gpu.h"
#include "flatten_op.h"
#include "minmax_ops.h"
#include "utility_ops.h"

namespace caffe2 {
CAFFE_KNOWN_TYPE(const float*);

REGISTER_CUDA_OPERATOR(EnsureDense, EnsureDenseOp<CUDAContext>);

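// Scans X for NaNs/Infs with a grid-stride loop. Each thread ORs the checks
// for its elements together and writes to result[0] only when it saw a bad
// value, so the caller must initialize result[0] to false beforehand.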
__global__ void NanCheckKernel(int N, const float* X, bool* result) {
  bool has_nan = false;
  CUDA_1D_KERNEL_LOOP(i, N) {
    // Note: there is no need to return early, since an early exit could only
    // help in the case where the check fails, and there is no point in
    // optimizing the failure path.
    has_nan = has_nan || isnan(X[i]) || isinf(X[i]);
  }
  __syncthreads();
  if (has_nan) {
    result[0] = true;
  }
}

template <>
bool NanCheckOp<CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto* Y = Output(0);
  const size_t N = X.size();
  const float* data_ptr = X.data<float>();

  scratch_.Resize(1);
  math::Set<bool, CUDAContext>(
      1, false, scratch_.mutable_data<bool>(), &context_);
  NanCheckKernel<<<
      CAFFE_GET_BLOCKS(N),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      N, data_ptr, scratch_.mutable_data<bool>());

  bool result = false;
  {
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
    CUDA_ENFORCE(cudaMemcpyAsync(
        &result,
        scratch_.raw_data(),
        sizeof(bool),
        cudaMemcpyDefault,
        context_.cuda_stream()));
  }
  // Note: we must synchronize here so we can inspect the result
  context_.FinishDeviceComputation();

  // Print out diagnostic info if we have a NaN or inf
  if (result) {
    std::cerr << "Tensor contained NaN or inf: " << this->debug_def().input(0)
              << std::endl;

    for (int j = 0; j < InputSize(); j++) {
      TensorCPU cpu_X;
      cpu_X.ResizeLike(Input(j));
      // Hack to force the allocation to happen here rather than inside
      // CopyFrom. We need to hold the mutex for the copy, because host->gpu
      // copies seem to be able to deadlock with NCCL.
      cpu_X.mutable_data<float>();

      {
        std::lock_guard<std::mutex> lock(CUDAContext::mutex());
        cpu_X.CopyFrom(Input(j), &context_);
      }
      context_.FinishDeviceComputation();
      std::cerr << "Input tensor: " << j << ": [" << this->debug_def().input(j)
                << "]" << std::endl;
      tensorPrinter_.Print<float>(cpu_X);

      if (j == 0) {
        std::cerr << "NaN idxs:" << std::endl;
        auto* cpu_X_data = cpu_X.data<float>();
        for (size_t i = 0; i < cpu_X.size(); ++i) {
          if (isnan(cpu_X_data[i]) || isinf(cpu_X_data[i])) {
            std::cerr << i << " ";
          }
        }
      }
      std::cerr << std::endl;
    }
    return false;
  }

  // If we didn't find any NaNs/infs, this op acts as an identity, so copy
  // the data over only when we are not running in-place.
  if (&X != Y) {
    Y->CopyFrom(X, &context_);
  }
  return true;
}

REGISTER_CUDA_OPERATOR(NanCheck, NanCheckOp<CUDAContext>);

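// Writes the element-wise maximum of X and Y into maxout with a grid-stride
// loop. In-place use (maxout aliasing X or Y) is safe because each element is
// read before it is written.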
__global__ void
ElwiseMaxKernel(const float* X, const float* Y, float* maxout, const int N) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    maxout[i] = max(X[i], Y[i]);
  }
}

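// Folds all inputs into Output(0) by repeated element-wise maxes. Output(0)
// is expected to already hold a copy of Input(0) (done by the shared
// RunOnDevice), so the loop starts at the second input and accumulates in
// place.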
template <>
bool MaxOp<float, CUDAContext>::Compute() {
  float* output_data = Output(0)->mutable_data<float>();
  const int N = Input(0).size();

  // Run pairwise maxes; i starts at 1, so the first operand is always the
  // running result already stored in Output(0).
  for (int i = 1; i < InputSize(); ++i) {
    ElwiseMaxKernel<<<
        CAFFE_GET_BLOCKS(N),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        Output(0)->data<float>(),
        Input(i).data<float>(),
        output_data,
        N);
  }

  return true;
}

REGISTER_CUDA_OPERATOR(Max, MaxOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(MaxGradient, MaxGradientOp<float, CUDAContext>);

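// Element-wise minimum counterpart of ElwiseMaxKernel; in-place use is safe
// for the same reason.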
__global__ void
ElwiseMinKernel(const float* X, const float* Y, float* minout, const int N) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    minout[i] = min(X[i], Y[i]);
  }
}

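// Same pairwise folding as MaxOp above, but with element-wise mins.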
template <>
bool MinOp<float, CUDAContext>::Compute() {
  float* output_data = Output(0)->mutable_data<float>();
  const int N = Input(0).size();

  // Run pairwise mins; as in MaxOp, Output(0) holds the running result.
  for (int i = 1; i < InputSize(); ++i) {
    ElwiseMinKernel<<<
        CAFFE_GET_BLOCKS(N),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        Output(0)->data<float>(),
        Input(i).data<float>(),
        output_data,
        N);
  }

  return true;
}

REGISTER_CUDA_OPERATOR(Min, MinOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(MinGradient, MinGradientOp<float, CUDAContext>);

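// Routes the incoming gradient to the inputs that attained the extremum:
// gi[i] = go[i] where x[i] equals the element-wise max/min, and 0 elsewhere.
// On ties, every tying input receives the full gradient.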
template <typename T>
__global__ void
MaxMinGradKernel(int N, const T* mx, const T* x, const T* go, T* gi) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    gi[i] = go[i] * (mx[i] == x[i]);
  }
}

template <>
bool SelectGradientOpBase<float, CUDAContext>::RunOnDevice() {
  auto& output = Input(0);
  auto& grad_output = Input(1);
  const int kInputStartOffset = 2;

  const float* data = output.data<float>();

  for (int i = 0; i < OutputSize(); i++) {
    auto& input = Input(i + kInputStartOffset);
    auto* grad_input = Output(i);
    grad_input->ResizeLike(input);
    MaxMinGradKernel<<<
        CAFFE_GET_BLOCKS(input.size()),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        input.size(),
        data,
        input.data<float>(),
        grad_output.data<float>(),
        grad_input->mutable_data<float>());
  }
  return true;
}

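// Gathers rows of X into Y: one block per gathered index, with the threads of
// a block cooperatively copying the block_size contiguous elements of the
// selected row.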
template <typename T_INDEX>
__global__ void GatherKernel(
    const float* X,
    float* Y,
    const T_INDEX* indices,
    const int N,
    const int block_size) {
  for (int i = blockIdx.x; i < N; i += gridDim.x) {
    T_INDEX idx = indices[i];
    const float* src_offset = X + idx * block_size;
    float* dst_offset = Y + i * block_size;
    for (int j = threadIdx.x; j < block_size; j += blockDim.x) {
      dst_offset[j] = src_offset[j];
    }
  }
}

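// Dispatch on the integral type (int32 or int64) of the INDICES input.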
template <>
bool GatherOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
      this, OperatorBase::Input<TensorCUDA>(INDICES));
}

template <>
template <typename Index>
bool GatherOp<CUDAContext>::DoRunWithType() {
  auto& data = Input(DATA);
  auto& indices = Input(INDICES);
  auto* output = Output(0);

  CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D");
  auto shape = indices.dims();
  shape.insert(shape.end(), data.dims().begin() + 1, data.dims().end());
  output->Resize(shape);

  int block_size = data.size() / data.dim(0);
  auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize();
  CAFFE_ENFORCE(
      block_bytesize == data.nbytes() / data.dim(0),
      "block_bytesize should be consistent with data dim");
  int N = indices.size();

  auto src_base = static_cast<const float*>(data.raw_data());
  const Index* idxs = indices.template data<Index>();
  auto out = static_cast<float*>(output->raw_mutable_data(data.meta()));

  // Return early when the input is empty, since launching the CUDA kernel
  // with zero blocks would fail.
  if (N <= 0) {
    return true;
  }

  GatherKernel<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(src_base, out, idxs, N, block_size);
  return true;
}

REGISTER_CUDA_OPERATOR(Gather, GatherOp<CUDAContext>);

/**
 * @brief Update slices of Y in-place with a batch of weighted X's.
 * Y[idx] = alpha[b] * X[b][i] + Y[idx]
 * i=0,...,N-1
 * b=0,...,B-1
 * idx=Indices[i]
 */
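// Indices may contain duplicates, so every update to Y goes through
// atomicAdd: different blocks can be writing to the same output slice
// concurrently.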
template <typename T_INDEX>
__global__ void AxpySliceKernel(
    const float* weight0,
    const TIndex N,
    const TIndex B,
    const TIndex slice_size,
    const float** alpha,
    const float** X,
    const T_INDEX* Indices,
    float* Y,
    const TIndex M) {
  // This implementation requires that the first weight is 1.0
  CUDA_KERNEL_ASSERT(weight0[0] == 1.0);
  for (int i = blockIdx.x; i < N; i += gridDim.x) {
    T_INDEX idx = Indices[i];
    float* y_offset = Y + (idx * slice_size);
    for (int b = 0; b < B; b++) {
      float a = *alpha[b];
      const float* x_offset = X[b] + (i * slice_size);
      for (int j = threadIdx.x; j < slice_size; j += blockDim.x) {
        atomicAdd(&y_offset[j], a * x_offset[j]);
      }
    }
  }
}

template <>
bool ScatterWeightedSumOp<float, CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(2));
}

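// Inputs are laid out as [X0, Weight0, INDICES, X1, Weight1, X2, Weight2,
// ...], hence the odd input count enforced below and the stride-2 walk over
// the trailing (slice, weight) pairs.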
template <>
template <typename Index>
bool ScatterWeightedSumOp<float, CUDAContext>::DoRunWithType() {
  CAFFE_ENFORCE_EQ(InputSize() % 2, 1);
  auto& X0 = Input(0);
  auto& weight0 = Input(1);
  auto& indices = Input(2);
  auto* output = Output(0);

  CAFFE_ENFORCE_EQ(&X0, output, "In place operation is required");
  CAFFE_ENFORCE_GT(X0.size(), 0);
  CAFFE_ENFORCE_GT(X0.ndim(), 0, "X0 has to be at least 1-D");
  CAFFE_ENFORCE_EQ(weight0.size(), 1);

  TIndex M = X0.size();
  TIndex N = X0.dim(0);
  TIndex K = indices.size();
  TIndex block_size = M / N;

  float* data = output->template mutable_data<float>();

  // In order to have the device pointers of all x_i (and weight_i, likewise)
  // laid out consecutively in device memory, gather the pointers in host
  // tensors first and then copy them into device tensors.
  const TIndex B = (InputSize() - 3) / 2;
  x_data_host_.Resize(B);
  weights_host_.Resize(B);
  x_data_device_.Resize(B);
  weights_device_.Resize(B);

  const float** x_data_host = x_data_host_.mutable_data<const float*>();
  const float** weights_host = weights_host_.mutable_data<const float*>();
  const float** x_data_device = x_data_device_.mutable_data<const float*>();
  const float** weights_device = weights_device_.mutable_data<const float*>();

  for (int inp = 3; inp < InputSize(); inp += 2) {
    int idx = (inp - 3) / 2;
    x_data_host[idx] = static_cast<const float*>(Input(inp).raw_data());
    weights_host[idx] = static_cast<const float*>(Input(inp + 1).raw_data());
  }
  context_.Copy<const float*, CPUContext, CUDAContext>(
      B, x_data_host, x_data_device);
  context_.Copy<const float*, CPUContext, CUDAContext>(
      B, weights_host, weights_device);

  AxpySliceKernel<<<
      std::min<TIndex>(K, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      weight0.template data<float>(),
      K,
      B,
      block_size,
      weights_device,
      x_data_device,
      indices.template data<Index>(),
      data,
      M);

  return true;
}

REGISTER_CUDA_OPERATOR(
    ScatterWeightedSum,
    ScatterWeightedSumOp<float, CUDAContext>);

namespace {

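// Assigns slices of slicesData into data at the given indices: one block per
// index, with threads cooperatively copying block_size elements. With
// duplicate indices the last writer wins nondeterministically.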
template <typename Index, typename T>
__global__ void scatter_assign_kernel(
    T* data,
    const Index* idxs,
    const T* slicesData,
    TIndex N,
    TIndex K,
    TIndex block_size) {
  for (TIndex i = blockIdx.x; i < K; i += gridDim.x) {
    Index idx = idxs[i];
    CUDA_KERNEL_ASSERT(0 <= idx && idx < N);
    const T* src = slicesData + block_size * i;
    T* dest = data + block_size * idx;
    for (TIndex j = threadIdx.x; j < block_size; j += blockDim.x) {
      dest[j] = src[j];
    }
  }
}

} // namespace

template <>
template <typename Index, typename T>
void ScatterAssignOp<CUDAContext>::DoScatterAssign(
    T* data,
    const Index* idxs,
    const T* slicesData,
    TIndex N,
    TIndex K,
    TIndex block_size) {
  scatter_assign_kernel<<<
      std::min(K, static_cast<TIndex>(CAFFE_MAXIMUM_NUM_BLOCKS)),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(data, idxs, slicesData, N, K, block_size);
}

REGISTER_CUDA_OPERATOR(ScatterAssign, ScatterAssignOp<CUDAContext>);

REGISTER_CUDA_OPERATOR(Size, SizeOp<CUDAContext>);

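// Fills Y with the arithmetic sequence Y[i] = offset + i * step using a
// grid-stride loop.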
template <typename T>
__global__ void RangeKernel(const int n, T* Y, T offset, T step) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    Y[index] = index * step + offset;
  }
}

template <>
template <typename T>
bool RangeOp<CUDAContext>::DoRunOnDevice(
    const T& start,
    const T& step,
    Tensor<CUDAContext>* output) {
  int N = output->size();
  RangeKernel<<<
      CAFFE_GET_BLOCKS(N),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, output->mutable_data<T>(), start, step);
  return true;
}

REGISTER_CUDA_OPERATOR(Range, RangeOp<CUDAContext>);
} // namespace caffe2