caffe2/sgd/adadelta_op_gpu.cu - platform/external/pytorch - Git at Google

 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/sgd/adadelta_op.h"

 namespace caffe2 {

 namespace {

 __global__ void AdadeltaUpdateKernel(
     int N,
     const float* w,
     const float* g,
     const float* h,
     const float* d,
     const float epsilon,
     const float decay,
     const float* lr,
     float* nw,
     float* nh,
     float* nd) {
   CUDA_1D_KERNEL_LOOP(i, N) {
     float gi = g[i];
     float di = d[i];
     float hi = nh[i] = decay * h[i] + (1.0f - decay) * gi * gi;
     float ng = sqrtf(di + epsilon) * rsqrtf(hi + epsilon) * gi;
     nw[i] = w[i] + lr[0] * ng;
     nd[i] = decay * di + (1.0f - decay) * ng * ng;
   }
 }

 template <>
 void AdadeltaUpdate<CUDAContext>(
     int N,
     const float* w,
     const float* g,
     const float* h,
     const float* d,
     const float epsilon,
     const float decay,
     const float* lr,
     float* nw,
     float* nh,
     float* nd,
     CUDAContext* context) {
   AdadeltaUpdateKernel<<<
       CAFFE_GET_BLOCKS(N),
       CAFFE_CUDA_NUM_THREADS,
       0,
       context->cuda_stream()>>>(N, w, g, h, d, epsilon, decay, lr, nw, nh, nd);
 }

 } // namespace

 template <typename SIndex, typename THalf>
 __global__ void SparseAdadeltaKernel(
     const size_t N,
     const size_t grad_slice_sz,
     const float epsilon,
     const float decay,
     const SIndex* indices,
     const float* grad,
     const float* lr,
     THalf* param,
     THalf* param_mom,
     THalf* param_mom_delta) {
   const float LR = lr[0];
   CUDA_1D_KERNEL_LOOP(i, N) {
     const size_t gradIdx = i;
     const SIndex index = indices[i / grad_slice_sz];
     const size_t paramIdx = index * grad_slice_sz + (i % grad_slice_sz);

     float mom_new = decay * param_mom[paramIdx] +
         (1.0f - decay) * grad[gradIdx] * grad[gradIdx];
     param_mom[paramIdx] = mom_new;
     float grad_new = sqrtf(epsilon + param_mom_delta[paramIdx]) *
         rsqrtf(mom_new + epsilon) * grad[gradIdx];
     float param_new = LR * grad_new + param[paramIdx];
     param[paramIdx] = param_new;
     float mom_delta_new = decay * param_mom_delta[paramIdx] +
         (1.0f - decay) * grad_new * grad_new;
     param_mom_delta[paramIdx] = mom_delta_new;
   }
 }

 template <class Context>
 class CUDASparseAdadeltaOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   CUDASparseAdadeltaOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws),
         OP_SINGLE_ARG(float, "epsilon", epsilon_, 1e-5f),
         OP_SINGLE_ARG(float, "decay", decay_, 0.95f) {}

   bool RunOnDevice() override {
     // Enforce shapes
     CAFFE_ENFORCE_EQ(Input(PARAM).size(), Input(MOMENT_GRAD).size());
     CAFFE_ENFORCE_EQ(Input(PARAM).size(), Input(MOMENT_DELTA).size());
     CAFFE_ENFORCE_EQ(Input(LR).size(), 1);
     CAFFE_ENFORCE_EQ(
         Input(PARAM).size_from_dim(1),
         Input(GRAD).size_from_dim(Input(INDICES).ndim()));

     // Enforce domain constraints on attributes
     CAFFE_ENFORCE_GE(epsilon_, 0.0f);
     CAFFE_ENFORCE_GT(decay_, 0.0f);
     CAFFE_ENFORCE_LT(decay_, 1.0f);

     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
         this, Input(INDICES));
   }

   template <typename IndexType>
   bool DoRunWithType() {
     auto n = Input(INDICES).size();
     if (n == 0) {
       return true;
     }
     return DispatchHelper<TensorTypes2<float, at::Half>, IndexType>::call(
         this, Input(PARAM));
   }

   template <typename IndexType, typename THalf>
   bool DoRunWithType2() {
     const auto* lr = Input(LR).template data<float>();
     const auto* indices = Input(INDICES).template data<IndexType>();
     const auto* gradIn = Input(GRAD).template data<float>();
     const auto* paramIn = Input(PARAM).template data<THalf>();
     const auto* momentIn = Input(MOMENT_GRAD).template data<THalf>();
     const auto* momentDeltaIn = Input(MOMENT_DELTA).template data<THalf>();
     auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<THalf>();
     auto* momentOut =
         Output(OUTPUT_MOMENT_GRAD)->template mutable_data<THalf>();
     auto* momentDeltaOut =
         Output(OUTPUT_MOMENT_DELTA)->template mutable_data<THalf>();

     auto N = Input(GRAD).size();
     auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim());
     if (N == 0) {
       // empty grad, nothing to do here, not even launching the kernel
       return true;
     }
     SparseAdadeltaKernel<IndexType, THalf>
         <<<CAFFE_GET_BLOCKS(N),
            CAFFE_CUDA_NUM_THREADS,
            0,
            context_.cuda_stream()>>>(
             N,
             grad_slice_sz,
             epsilon_,
             decay_,
             indices,
             gradIn,
             lr,
             paramOut,
             momentOut,
             momentDeltaOut);
     return true;
   }

  protected:
   const float epsilon_;
   const float decay_;
   INPUT_TAGS(PARAM, MOMENT_GRAD, MOMENT_DELTA, INDICES, GRAD, LR);
   OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_GRAD, OUTPUT_MOMENT_DELTA);
 };

 REGISTER_CUDA_OPERATOR(Adadelta, AdadeltaOp<CUDAContext>);
 REGISTER_CUDA_OPERATOR(SparseAdadelta, CUDASparseAdadeltaOp<CUDAContext>);
 } // namespace caffe2
	#include "caffe2/core/common_gpu.h"
	#include "caffe2/core/context_gpu.h"
	#include "caffe2/sgd/adadelta_op.h"

	namespace caffe2 {

	namespace {

	__global__ void AdadeltaUpdateKernel(
	int N,
	const float* w,
	const float* g,
	const float* h,
	const float* d,
	const float epsilon,
	const float decay,
	const float* lr,
	float* nw,
	float* nh,
	float* nd) {
	CUDA_1D_KERNEL_LOOP(i, N) {
	float gi = g[i];
	float di = d[i];
	float hi = nh[i] = decay * h[i] + (1.0f - decay) * gi * gi;
	float ng = sqrtf(di + epsilon) * rsqrtf(hi + epsilon) * gi;
	nw[i] = w[i] + lr[0] * ng;
	nd[i] = decay * di + (1.0f - decay) * ng * ng;
	}
	}

	template <>
	void AdadeltaUpdate<CUDAContext>(
	int N,
	const float* w,
	const float* g,
	const float* h,
	const float* d,
	const float epsilon,
	const float decay,
	const float* lr,
	float* nw,
	float* nh,
	float* nd,
	CUDAContext* context) {
	AdadeltaUpdateKernel<<<
	CAFFE_GET_BLOCKS(N),
	CAFFE_CUDA_NUM_THREADS,
	0,
	context->cuda_stream()>>>(N, w, g, h, d, epsilon, decay, lr, nw, nh, nd);
	}

	} // namespace

	template <typename SIndex, typename THalf>
	__global__ void SparseAdadeltaKernel(
	const size_t N,
	const size_t grad_slice_sz,
	const float epsilon,
	const float decay,
	const SIndex* indices,
	const float* grad,
	const float* lr,
	THalf* param,
	THalf* param_mom,
	THalf* param_mom_delta) {
	const float LR = lr[0];
	CUDA_1D_KERNEL_LOOP(i, N) {
	const size_t gradIdx = i;
	const SIndex index = indices[i / grad_slice_sz];
	const size_t paramIdx = index * grad_slice_sz + (i % grad_slice_sz);

	float mom_new = decay * param_mom[paramIdx] +
	(1.0f - decay) * grad[gradIdx] * grad[gradIdx];
	param_mom[paramIdx] = mom_new;
	float grad_new = sqrtf(epsilon + param_mom_delta[paramIdx]) *
	rsqrtf(mom_new + epsilon) * grad[gradIdx];
	float param_new = LR * grad_new + param[paramIdx];
	param[paramIdx] = param_new;
	float mom_delta_new = decay * param_mom_delta[paramIdx] +
	(1.0f - decay) * grad_new * grad_new;
	param_mom_delta[paramIdx] = mom_delta_new;
	}
	}

	template <class Context>
	class CUDASparseAdadeltaOp final : public Operator<Context> {
	public:
	USE_OPERATOR_CONTEXT_FUNCTIONS;
	CUDASparseAdadeltaOp(const OperatorDef& operator_def, Workspace* ws)
	: Operator<Context>(operator_def, ws),
	OP_SINGLE_ARG(float, "epsilon", epsilon_, 1e-5f),
	OP_SINGLE_ARG(float, "decay", decay_, 0.95f) {}

	bool RunOnDevice() override {
	// Enforce shapes
	CAFFE_ENFORCE_EQ(Input(PARAM).size(), Input(MOMENT_GRAD).size());
	CAFFE_ENFORCE_EQ(Input(PARAM).size(), Input(MOMENT_DELTA).size());
	CAFFE_ENFORCE_EQ(Input(LR).size(), 1);
	CAFFE_ENFORCE_EQ(
	Input(PARAM).size_from_dim(1),
	Input(GRAD).size_from_dim(Input(INDICES).ndim()));

	// Enforce domain constraints on attributes
	CAFFE_ENFORCE_GE(epsilon_, 0.0f);
	CAFFE_ENFORCE_GT(decay_, 0.0f);
	CAFFE_ENFORCE_LT(decay_, 1.0f);

	return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
	this, Input(INDICES));
	}

	template <typename IndexType>
	bool DoRunWithType() {
	auto n = Input(INDICES).size();
	if (n == 0) {
	return true;
	}
	return DispatchHelper<TensorTypes2<float, at::Half>, IndexType>::call(
	this, Input(PARAM));
	}

	template <typename IndexType, typename THalf>
	bool DoRunWithType2() {
	const auto* lr = Input(LR).template data<float>();
	const auto* indices = Input(INDICES).template data<IndexType>();
	const auto* gradIn = Input(GRAD).template data<float>();
	const auto* paramIn = Input(PARAM).template data<THalf>();
	const auto* momentIn = Input(MOMENT_GRAD).template data<THalf>();
	const auto* momentDeltaIn = Input(MOMENT_DELTA).template data<THalf>();
	auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<THalf>();
	auto* momentOut =
	Output(OUTPUT_MOMENT_GRAD)->template mutable_data<THalf>();
	auto* momentDeltaOut =
	Output(OUTPUT_MOMENT_DELTA)->template mutable_data<THalf>();

	auto N = Input(GRAD).size();
	auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim());
	if (N == 0) {
	// empty grad, nothing to do here, not even launching the kernel
	return true;
	}
	SparseAdadeltaKernel<IndexType, THalf>
	<<<CAFFE_GET_BLOCKS(N),
	CAFFE_CUDA_NUM_THREADS,
	0,
	context_.cuda_stream()>>>(
	N,
	grad_slice_sz,
	epsilon_,
	decay_,
	indices,
	gradIn,
	lr,
	paramOut,
	momentOut,
	momentDeltaOut);
	return true;
	}

	protected:
	const float epsilon_;
	const float decay_;
	INPUT_TAGS(PARAM, MOMENT_GRAD, MOMENT_DELTA, INDICES, GRAD, LR);
	OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_GRAD, OUTPUT_MOMENT_DELTA);
	};

	REGISTER_CUDA_OPERATOR(Adadelta, AdadeltaOp<CUDAContext>);
	REGISTER_CUDA_OPERATOR(SparseAdadelta, CUDASparseAdadeltaOp<CUDAContext>);
	} // namespace caffe2