caffe2/sgd/yellowfin_op_gpu.cu - platform/external/pytorch - Git at Google

 // YellowFin: An automatic tuner for momentum SGD
 // (https://arxiv.org/abs/1706.03471)

 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/sgd/yellowfin_op.h"

 namespace caffe2 {

 __global__ void GetLrMuKernel(
     const float* g_norm2_max_deb,
     const float* g_norm2_min_deb,
     const float* distance_deb,
     const float* variance,
     float* mu,
     float* lr) {
   const float curv_ratio = sqrtf(*g_norm2_max_deb / *g_norm2_min_deb);
   const float mu_limit = (curv_ratio - 1.0f) / (curv_ratio + 1.0f);
   const float pre_p = *distance_deb * *g_norm2_min_deb;
   const float p = (pre_p * pre_p) / (2.0f * *variance);
   const float w3 = (-sqrtf(p * p + 4.0f / 27.0f * p * p * p) - p) / 2.0f;
   const float w3_sign = copysignf(1.0f, w3);
   const float w = w3_sign * powf(fabsf(w3), 1.0f / 3.0f);
   const float y = w - p / 3.0f / w;
   const float root = y + 1.0f;
   *mu = fmaxf(root * root, mu_limit * mu_limit);
   *lr = powf(1.0f - sqrtf(*mu), 2) / *g_norm2_min_deb;
 }

 template <>
 void YellowFinOp<float, CUDAContext>::GetLrMu() {
   // Finding root of cubic formula for YF's Single Step
   GetLrMuKernel<<<1, 1, 0, context_.cuda_stream()>>>(
       g_norm2_max_deb_, g_norm2_min_deb_, distance_deb_, variance_, mu_, lr_);
   MovingAverage(1, mu_, mu_avg_, mu_avg_out_, mu_deb_);
   MovingAverage(1, lr_, lr_avg_, lr_avg_out_, lr_deb_);
 }

 __global__ void MomentumSgdKernel(
     const int N,
     const float* mu_ptr,
     const float* lr_ptr,
     const float* param,
     const float* grad,
     const float* moment,
     float* param_out,
     float* moment_out,
     bool nesterov) {
   const float mu = *mu_ptr;
   const float lr = *lr_ptr;
   if (!nesterov) {
     CUDA_1D_KERNEL_LOOP(i, N) {
       moment_out[i] = mu * moment[i] * lr * grad[i];
       param_out[i] = param[i] - moment_out[i];
     }
   } else {
     CUDA_1D_KERNEL_LOOP(i, N) {
       const float moment_i = moment[i];
       moment_out[i] = mu * moment_i + lr * grad[i];
       param_out[i] = param[i] - (1 + mu) * moment_out[i] + mu * moment_i;
     }
   }
 }

 template <>
 void YellowFinOp<float, CUDAContext>::MomentumSgdUpdate() {
   MomentumSgdKernel<<<
       CAFFE_GET_BLOCKS(D_),
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
       D_,
       mu_avg_out_,
       lr_avg_out_,
       param_,
       grad_,
       moment_,
       param_out_,
       moment_out_,
       nesterov_);
 }

 REGISTER_CUDA_OPERATOR(YellowFin, YellowFinOp<float, CUDAContext>);

 } // namespace caffe2
	// YellowFin: An automatic tuner for momentum SGD
	// (https://arxiv.org/abs/1706.03471)

	#include "caffe2/core/common_gpu.h"
	#include "caffe2/core/context_gpu.h"
	#include "caffe2/sgd/yellowfin_op.h"

	namespace caffe2 {

	__global__ void GetLrMuKernel(
	const float* g_norm2_max_deb,
	const float* g_norm2_min_deb,
	const float* distance_deb,
	const float* variance,
	float* mu,
	float* lr) {
	const float curv_ratio = sqrtf(g_norm2_max_deb / g_norm2_min_deb);
	const float mu_limit = (curv_ratio - 1.0f) / (curv_ratio + 1.0f);
	const float pre_p = distance_deb *g_norm2_min_deb;
	const float p = (pre_p * pre_p) / (2.0f * *variance);
	const float w3 = (-sqrtf(p * p + 4.0f / 27.0f * p * p * p) - p) / 2.0f;
	const float w3_sign = copysignf(1.0f, w3);
	const float w = w3_sign * powf(fabsf(w3), 1.0f / 3.0f);
	const float y = w - p / 3.0f / w;
	const float root = y + 1.0f;
	mu = fmaxf(root root, mu_limit * mu_limit);
	lr = powf(1.0f - sqrtf(mu), 2) / *g_norm2_min_deb;
	}

	template <>
	void YellowFinOp<float, CUDAContext>::GetLrMu() {
	// Finding root of cubic formula for YF's Single Step
	GetLrMuKernel<<<1, 1, 0, context_.cuda_stream()>>>(
	g_norm2_max_deb_, g_norm2_min_deb_, distance_deb_, variance_, mu_, lr_);
	MovingAverage(1, mu_, mu_avg_, mu_avg_out_, mu_deb_);
	MovingAverage(1, lr_, lr_avg_, lr_avg_out_, lr_deb_);
	}

	__global__ void MomentumSgdKernel(
	const int N,
	const float* mu_ptr,
	const float* lr_ptr,
	const float* param,
	const float* grad,
	const float* moment,
	float* param_out,
	float* moment_out,
	bool nesterov) {
	const float mu = *mu_ptr;
	const float lr = *lr_ptr;
	if (!nesterov) {
	CUDA_1D_KERNEL_LOOP(i, N) {
	moment_out[i] = mu * moment[i] * lr * grad[i];
	param_out[i] = param[i] - moment_out[i];
	}
	} else {
	CUDA_1D_KERNEL_LOOP(i, N) {
	const float moment_i = moment[i];
	moment_out[i] = mu * moment_i + lr * grad[i];
	param_out[i] = param[i] - (1 + mu) * moment_out[i] + mu * moment_i;
	}
	}
	}

	template <>
	void YellowFinOp<float, CUDAContext>::MomentumSgdUpdate() {
	MomentumSgdKernel<<<
	CAFFE_GET_BLOCKS(D_),
	CAFFE_CUDA_NUM_THREADS,
	0,
	context_.cuda_stream()>>>(
	D_,
	mu_avg_out_,
	lr_avg_out_,
	param_,
	grad_,
	moment_,
	param_out_,
	moment_out_,
	nesterov_);
	}

	REGISTER_CUDA_OPERATOR(YellowFin, YellowFinOp<float, CUDAContext>);

	} // namespace caffe2