blob: 8abb3376ca87509bdd06e0b2e0cc1a8c69dbcb7a [file] [log] [blame]
#include <algorithm>
#include <cub/block/block_reduce.cuh>
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/sgd/adagrad_op.h"
namespace caffe2 {
// Dense Adagrad update, one element per loop iteration:
//   nh[i] = decay * h[i] + g'^2        (g' = g[i] + weight_decay * w[i])
//   nw[i] = w[i] + lr[0] * g' / (sqrt(nh[i]) + epsilon)
// `lr` is a device pointer to a single scalar learning rate.
__global__ void AdagradUpdate(
    int N,
    const float* w,
    const float* g,
    const float* h,
    float* nw,
    float* nh,
    float epsilon,
    float decay,
    const float* lr,
    float weight_decay = 0.f) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    // Apply weight decay to the raw gradient before accumulating history.
    const float grad_i = g[i] + weight_decay * w[i];
    const float moment_i = decay * h[i] + grad_i * grad_i;
    nh[i] = moment_i;
    nw[i] = w[i] + lr[0] * grad_i / (sqrtf(moment_i) + epsilon);
  }
}
// Host-side launcher for the dense AdagradUpdate kernel on the CUDA context.
// Grid/block sizes come from the standard Caffe2 helpers; the launch is
// enqueued on the context's stream and checked for launch-config errors.
template <>
void adagrad_update<CUDAContext>(
    int N,
    const float* w,
    const float* g,
    const float* h,
    float* nw,
    float* nh,
    float epsilon,
    float decay,
    const float* lr,
    CUDAContext* context,
    float weight_decay) {
  const int num_blocks = CAFFE_GET_BLOCKS(N);
  AdagradUpdate
      <<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
          N, w, g, h, nw, nh, epsilon, decay, lr, weight_decay);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}
// Sparse Adagrad update. Each of the N gradient elements addresses a
// parameter element through `indices`: gradient slice k updates parameter
// slice indices[k], with `grad_slice_sz` elements per slice. `lr` is a
// device pointer to a single scalar learning rate.
template <typename SIndex, typename THalf>
__global__ void SparseAdagradKernel(
    const size_t N,
    const size_t grad_slice_sz,
    const float epsilon,
    THalf* param,
    THalf* param_mom,
    const SIndex* indices,
    const float* grad,
    const float* lr,
    float weight_decay = 0.f) {
  const float lr0 = lr[0];
  CUDA_1D_KERNEL_LOOP(i, N) {
    // Map flat gradient position i to its slot inside the indexed param slice.
    const SIndex slice = indices[i / grad_slice_sz];
    const size_t offset = i % grad_slice_sz;
    const size_t paramIdx = slice * grad_slice_sz + offset;
    const float gi = grad[i] + weight_decay * param[paramIdx];
    // No decay here: the history simply accumulates squared gradients.
    const float mom = gi * gi + param_mom[paramIdx];
    param_mom[paramIdx] = mom;
    param[paramIdx] = lr0 * gi / (sqrtf(mom) + epsilon) + param[paramIdx];
  }
}
/**
 * RowWiseSparseAdagrad: one Adagrad history value per parameter row instead
 * of per element. Each thread block processes one gradient row at a time
 * (grid-stride over rows); within a block, threads stride over the row's
 * columns.
 *
 * M: gradients.dims[0] — number of gradient rows
 * N: gradients.size_from_dim(1) — elements per row
 * grad: pointer to the gradients (M x N, row-major)
 * param: pointer to weights; row `indices[i]` is updated by gradient row i
 * param_mom: pointer to the momentum (one float per parameter row)
 * indices: keys selecting which parameter row each gradient row updates
 * lr: device pointer to a single scalar learning rate
 *
 * NOTE(review): if `indices` contains duplicate rows within one launch, the
 * updates to param/param_mom for that row race across blocks — presumably
 * callers deduplicate or tolerate this; confirm against callers.
 */
template <typename SIndex>
__global__ void RowWiseSparseAdagradKernel(
    const int M,
    const int N,
    const float epsilon,
    float* param,
    float* param_mom,
    const SIndex* indices,
    const float* grad,
    const float* lr,
    float weight_decay = 0.f) {
  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
  __shared__ BlockReduce::TempStorage temp_storage;
  // Number of threads contributing a valid partial sum to the reduction;
  // the launch may use fewer than CAFFE_CUDA_NUM_THREADS threads when N is
  // small (see the host-side launcher).
  int valid = min(N, CAFFE_CUDA_NUM_THREADS);
  // in case gridDim is smaller than M: grid-stride loop over rows
  for (int i = blockIdx.x; i < M; i += gridDim.x) {
    const SIndex index = indices[i];
    float sum_squares = 0.0;
    __shared__ float row_sum_squares_avg;
    // in case N is bigger than block size which is 512 by default:
    // each thread accumulates a partial sum of squared (decayed) gradients
    for (int j = threadIdx.x; j < N; j += blockDim.x) {
      const float x_ij = grad[i * N + j] + weight_decay * param[index * N + j];
      sum_squares += x_ij * x_ij;
    }
    float reduce_result = BlockReduce(temp_storage).Sum(sum_squares, valid);
    // Thread 0 folds the mean squared gradient into the row's history.
    if (threadIdx.x == 0) {
      row_sum_squares_avg = reduce_result / (float)N;
      param_mom[index] += row_sum_squares_avg;
    }
    // Barrier: all threads must observe the updated param_mom[index] before
    // computing `step` below; also keeps temp_storage safe for the next row.
    __syncthreads();
    // update param: every element of the row uses the same step size
    float step = lr[0] / (sqrtf(param_mom[index]) + epsilon);
    for (int j = threadIdx.x; j < N; j += blockDim.x) {
      const float x_ij = grad[i * N + j] + weight_decay * param[index * N + j];
      param[index * N + j] = param[index * N + j] + x_ij * step;
    }
  }
}
// SparseAdagrad on CUDA: applies Adagrad updates only to the PARAM/MOMENT_1
// slices selected by INDICES, using the gradients in GRAD and the scalar
// learning rate in LR. Writes through OUTPUT_PARAM / OUTPUT_MOMENT_1
// (presumably run in-place with the inputs — confirm against operator schema).
template <typename T, class Context>
class CUDASparseAdagradOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  CUDASparseAdagradOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        epsilon_(this->template GetSingleArgument<float>("epsilon", 1e-5f)),
        weight_decay_(
            this->template GetSingleArgument<float>("weight_decay", 0.f)) {
    VLOG(1) << "gradient optimization operator in use: "
            << "CUDASparseAdagradOp"
            << " weight_decay_=" << weight_decay_;
    // The sparse kernel accumulates history without decay, so only the
    // default decay == 1.0 is supported.
    const T decay = this->template GetSingleArgument<T>("decay", 1.0f);
    CAFFE_ENFORCE_EQ(decay, 1.0, "Decay is not supported for SparseAdagradOp");
  }

  bool RunOnDevice() override {
    // Enforce shapes: moment matches param, LR is a scalar, and each gradient
    // slice has the same size as a parameter slice.
    CAFFE_ENFORCE_EQ(Input(PARAM).size(), Input(MOMENT_1).size());
    CAFFE_ENFORCE_EQ(Input(LR).size(), 1);
    CAFFE_ENFORCE_EQ(
        Input(PARAM).size_from_dim(1),
        Input(GRAD).size_from_dim(Input(INDICES).ndim()));
    // Dispatch on the index dtype first.
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
        this, Input(INDICES));
  }

  template <typename IndexType>
  bool DoRunWithType() {
    auto n = Input(INDICES).size();
    if (n == 0) {
      // No indices: nothing to update.
      return true;
    }
    // Second dispatch on the param dtype (float or half).
    return DispatchHelper<TensorTypes2<float, at::Half>, IndexType>::call(
        this, Input(PARAM));
  }

  template <typename IndexType, typename THalf>
  bool DoRunWithType2() {
    // Fetch each pointer exactly once and reuse it in the launch below.
    // (The previous version fetched several of these into locals it never
    // used and then re-fetched every pointer — including a second
    // mutable_data<THalf>() call on each output — inside the launch.)
    const auto* lr = Input(LR).template data<float>();
    const auto* indices = Input(INDICES).template data<IndexType>();
    const auto* gradIn = Input(GRAD).template data<float>();
    auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<THalf>();
    auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data<THalf>();
    auto N = Input(GRAD).size();
    auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim());
    if (N == 0) {
      // empty grad, nothing to do here, not even launching the kernel
      return true;
    }
    SparseAdagradKernel<IndexType, THalf>
        <<<CAFFE_GET_BLOCKS(N),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context_.cuda_stream()>>>(
            N,
            grad_slice_sz,
            epsilon_,
            paramOut,
            momentOut,
            indices,
            gradIn,
            lr,
            weight_decay_);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return true;
  }

 protected:
  T epsilon_;
  T weight_decay_;
  INPUT_TAGS(PARAM, MOMENT_1, INDICES, GRAD, LR);
  OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1);
};
// CUDA implementation of RowWiseSparseAdagrad: one history scalar per
// parameter row. Launches one thread block per gradient row (capped at the
// maximum grid size; the kernel grid-strides over remaining rows).
template <>
template <typename SIndex>
bool RowWiseSparseAdagradOp<CUDAContext>::DoRunWithType() {
  const auto grad_size = Input(GRAD).size();
  if (grad_size == 0) {
    // Empty gradient: nothing to update, skip the launch entirely.
    return true;
  }
  // Rows and columns of the gradient tensor.
  const auto num_rows = Input(GRAD).dim32(0);
  const auto row_width = grad_size / num_rows;
  // Narrow rows get smaller blocks so more blocks can be resident at once.
  int threads_per_block = CAFFE_CUDA_NUM_THREADS;
  if (row_width < threads_per_block) {
    threads_per_block = row_width;
  }
  RowWiseSparseAdagradKernel<<<
      std::min(num_rows, CAFFE_MAXIMUM_NUM_BLOCKS),
      threads_per_block,
      0,
      context_.cuda_stream()>>>(
      num_rows,
      row_width,
      epsilon_,
      Output(OUTPUT_PARAM)->template mutable_data<float>(),
      Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
      Input(INDICES).template data<SIndex>(),
      Input(GRAD).template data<float>(),
      Input(LR).template data<float>(),
      weight_decay_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// Register the CUDA implementations. The dense Adagrad and the row-wise
// sparse variant reuse the operator classes declared in adagrad_op.h; the
// element-wise sparse variant uses the CUDA-specific class defined above
// (float params only at registration time).
REGISTER_CUDA_OPERATOR(Adagrad, AdagradOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(SparseAdagrad, CUDASparseAdagradOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    RowWiseSparseAdagrad,
    RowWiseSparseAdagradOp<CUDAContext>);
} // namespace caffe2