// caffe2/sgd/yellowfin_op.cc - platform/external/pytorch - Git at Google

 #include "caffe2/sgd/yellowfin_op.h"

 namespace caffe2 {

 // Registers the float/CPU implementation of the YellowFin operator and
 // declares its schema: 10 inputs, 8 outputs, the in-place pairs, the
 // user-facing documentation string, and the operator arguments.
 REGISTER_CPU_OPERATOR(YellowFin, YellowFinOp<float, CPUContext>);
 OPERATOR_SCHEMA(YellowFin)
     .NumInputs(10)
     .NumOutputs(8)
     // Inputs 0-7 may each be updated in place through the output with the
     // same index; inputs 8 (grad) and 9 (iter) have no matching output.
     .AllowInplace(
         {{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}})
     .SetDoc(R"DOC(

 Computes the YellowFin update (https://arxiv.org/abs/1706.03471) and performs
 momentum SGD optimization step. lr and mu are not being shared between
 parameters. curv_win, g_avg, g2_avg and scalars_memory are just auxiliary
 memory for computing moving averages (see the publication). Takes arguments
 beta: coefficient for moving averages,
 curv_win_width: timeframe when average squared gradient is being stored,
 epsilon: for numerical purposes,
 nesterov and zero_debias for debias of moving average.

 )DOC")
     // State that is carried between iterations (indices 0-7), followed by the
     // two per-step inputs (gradient and iteration counter).
     .Input(0, "param", "Parameters to be updated")
     .Input(1, "moment", "Momentum")
     .Input(2, "lr", "Learning rate")
     .Input(3, "mu", "Momentum coefficient")
     .Input(4, "curv_win", "Memory for latest curvature ranges")
     .Input(5, "g_avg", "Moving average of gradient")
     .Input(6, "g2_avg", "Moving average of squared gradient")
     .Input(7, "scalars_memory", "Memory for stateful scalars")
     .Input(8, "grad", "Gradient computed")
     .Input(9, "iter", "Iteration number")
     // Output i is the updated counterpart of input i.
     .Output(0, "output_param", "Parameters to be updated")
     .Output(1, "output_moment", "Momentum")
     .Output(2, "output_lr", "Output learning rate")
     .Output(3, "output_mu", "Output momentum coefficient")
     .Output(4, "output_curv_win", "Output memory for latest curvature ranges")
     .Output(5, "output_g_avg", "Output moving average of gradient")
     .Output(6, "output_g2_avg", "Output moving average of squared gradient")
     .Output(7, "output_scalars_memory", "Output memory for stateful scalars")
     // Operator arguments; the description strings record the defaults.
     .Arg("beta", "Default 0.999")
     .Arg("curv_win_width", "Default 20")
     .Arg("epsilon", "Default 1e-6")
     .Arg("nesterov", "Default false")
     .Arg("zero_debias", "Default true");

 // YellowFin is an optimizer update step; no gradient operator is provided
 // for it.
 SHOULD_NOT_DO_GRADIENT(YellowFin);

 // float/CPU specialization of YellowFinOp::GetLrMu().
 //
 // Computes the momentum (*mu_) and learning rate (*lr_) for the current step
 // from the scalars *g_norm2_max_deb_, *g_norm2_min_deb_, *distance_deb_ and
 // *variance_ (presumably the debiased curvature extremes, distance and
 // gradient variance of the YellowFin paper -- confirm against the header),
 // then hands both values to MovingAverage().
 template <>
 void YellowFinOp<float, CPUContext>::GetLrMu() {
   // Candidate for sqrt(mu) derived from the max/min ratio alone.
   const float range_ratio = std::sqrt(*g_norm2_max_deb_ / *g_norm2_min_deb_);
   const float sqrt_mu_floor = (range_ratio - 1.0f) / (range_ratio + 1.0f);

   // Coefficient p of the depressed cubic y^3 + p*y + p = 0.
   const float dist_times_min = *distance_deb_ * *g_norm2_min_deb_;
   const float p = (dist_times_min * dist_times_min) / (2.0f * *variance_);

   // Cardano's formula: w_cubed is one root of the resolvent, and
   // y = w - p / (3 * w) is the real root of the cubic above.
   const float w_cubed =
       (-std::sqrt(p * p + 4.0f / 27.0f * p * p * p) - p) / 2.0f;
   // std::pow with a negative base and fractional exponent yields NaN, so the
   // cube root is taken on |w_cubed| and the sign is re-applied afterwards.
   float cube_sign = -1.0f;
   if (w_cubed > 0.0f) {
     cube_sign = 1.0f;
   }
   const float w = cube_sign * std::pow(std::abs(w_cubed), 1.0f / 3.0f);
   const float y = w - p / 3.0f / w;
   const float cubic_root = y + 1.0f;

   // mu is the larger of the two squared candidates; lr follows from mu.
   *mu_ = std::max(cubic_root * cubic_root, sqrt_mu_floor * sqrt_mu_floor);
   *lr_ = std::pow(1.0f - std::sqrt(*mu_), 2) / *g_norm2_min_deb_;

   // Fold the fresh values into their running averages (single-element calls).
   MovingAverage(1, mu_, mu_avg_, mu_avg_out_, mu_deb_);
   MovingAverage(1, lr_, lr_avg_, lr_avg_out_, lr_deb_);
 }

 // float/CPU specialization of YellowFinOp::MomentumSgdUpdate().
 //
 // Applies one momentum-SGD step over D_ elements using the averaged momentum
 // (*mu_avg_out_) and learning rate (*lr_avg_out_). Usually the update runs in
 // place, i.e. moment_ == moment_out_ && param_ == param_out_.
 template <>
 void YellowFinOp<float, CPUContext>::MomentumSgdUpdate() {
   const float momentum_coeff = *mu_avg_out_;
   const float step_size = *lr_avg_out_;

   if (nesterov_) {
     for (int idx = 0; idx < D_; ++idx) {
       // Keep the previous moment: moment_out_ may alias moment_, and the
       // parameter update below needs the value from before the overwrite.
       const float prev_moment = moment_[idx];
       moment_out_[idx] = momentum_coeff * prev_moment + step_size * grad_[idx];
       param_out_[idx] = param_[idx] -
           (1 + momentum_coeff) * moment_out_[idx] +
           momentum_coeff * prev_moment;
     }
     return;
   }

   // Plain momentum: refresh the moment, then step the parameter by it.
   for (int idx = 0; idx < D_; ++idx) {
     moment_out_[idx] = momentum_coeff * moment_[idx] + step_size * grad_[idx];
     param_out_[idx] = param_[idx] - moment_out_[idx];
   }
 }

 } // caffe2
	#include "caffe2/sgd/yellowfin_op.h"

	namespace caffe2 {

	// Registers the float/CPU implementation of the YellowFin operator and
	// declares its schema: 10 inputs, 8 outputs, the in-place pairs, the
	// user-facing documentation string, and the operator arguments.
	REGISTER_CPU_OPERATOR(YellowFin, YellowFinOp<float, CPUContext>);
	OPERATOR_SCHEMA(YellowFin)
	.NumInputs(10)
	.NumOutputs(8)
	// Inputs 0-7 may each be updated in place through the output with the
	// same index; inputs 8 (grad) and 9 (iter) have no matching output.
	.AllowInplace(
	{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}})
	.SetDoc(R"DOC(

	Computes the YellowFin update (https://arxiv.org/abs/1706.03471) and performs
	momentum SGD optimization step. lr and mu are not being shared between
	parameters. curv_win, g_avg, g2_avg and scalars_memory are just auxiliary
	memory for computing moving averages (see the publication). Takes arguments
	beta: coefficient for moving averages,
	curv_win_width: timeframe when average squared gradient is being stored,
	epsilon: for numerical purposes,
	nesterov and zero_debias for debias of moving average.

	)DOC")
	// State that is carried between iterations (indices 0-7), followed by the
	// two per-step inputs (gradient and iteration counter).
	.Input(0, "param", "Parameters to be updated")
	.Input(1, "moment", "Momentum")
	.Input(2, "lr", "Learning rate")
	.Input(3, "mu", "Momentum coefficient")
	.Input(4, "curv_win", "Memory for latest curvature ranges")
	.Input(5, "g_avg", "Moving average of gradient")
	.Input(6, "g2_avg", "Moving average of squared gradient")
	.Input(7, "scalars_memory", "Memory for stateful scalars")
	.Input(8, "grad", "Gradient computed")
	.Input(9, "iter", "Iteration number")
	// Output i is the updated counterpart of input i.
	.Output(0, "output_param", "Parameters to be updated")
	.Output(1, "output_moment", "Momentum")
	.Output(2, "output_lr", "Output learning rate")
	.Output(3, "output_mu", "Output momentum coefficient")
	.Output(4, "output_curv_win", "Output memory for latest curvature ranges")
	.Output(5, "output_g_avg", "Output moving average of gradient")
	.Output(6, "output_g2_avg", "Output moving average of squared gradient")
	.Output(7, "output_scalars_memory", "Output memory for stateful scalars")
	// Operator arguments; the description strings record the defaults.
	.Arg("beta", "Default 0.999")
	.Arg("curv_win_width", "Default 20")
	.Arg("epsilon", "Default 1e-6")
	.Arg("nesterov", "Default false")
	.Arg("zero_debias", "Default true");

	// YellowFin is an optimizer update step; no gradient operator is provided
	// for it.
	SHOULD_NOT_DO_GRADIENT(YellowFin);

	// Defines the CPU specialization of YellowFinOp<T, CPUContext>::GetLrMu().
	//
	// The function computes the momentum (*mu_) and learning rate (*lr_) for the
	// current step and then passes both through MovingAverage():
	//   - mu_limit is a candidate for sqrt(mu) built from the ratio
	//     *g_norm2_max_deb_ / *g_norm2_min_deb_;
	//   - p = (*distance_deb_ * *g_norm2_min_deb_)^2 / (2 * *variance_) is the
	//     coefficient of the depressed cubic y^3 + p*y + p = 0, solved with
	//     Cardano's formula (w3, w, y); root = y + 1 is the second candidate;
	//   - *mu_ is the larger squared candidate, and
	//     *lr_ = (1 - sqrt(*mu_))^2 / *g_norm2_min_deb_.
	// All of g_norm2_max_deb_, g_norm2_min_deb_, distance_deb_, variance_, mu_
	// and lr_ are pointers to scalars and must be dereferenced; mu_ and lr_ are
	// passed to MovingAverage() as pointers.
	// The cube root is taken on |w3| with the sign re-applied because std::pow
	// returns NaN for a negative base with a fractional exponent.
	#define CAFFE2_YELLOWFIN_GETLRMU(T) \
	  template <> \
	  void YellowFinOp<T, CPUContext>::GetLrMu() { \
	    const T curv_ratio = std::sqrt(*g_norm2_max_deb_ / *g_norm2_min_deb_); \
	    const T mu_limit = (curv_ratio - 1.0f) / (curv_ratio + 1.0f); \
	    const T pre_p = *distance_deb_ * *g_norm2_min_deb_; \
	    const T p = (pre_p * pre_p) / (2.0f * *variance_); \
	    const T w3 = (-std::sqrt(p * p + 4.0f / 27.0f * p * p * p) - p) / 2.0f; \
	    const T w3_sign = w3 > 0.0f ? 1.0f : -1.0f; \
	    const T w = w3_sign * std::pow(std::abs(w3), 1.0f / 3.0f); \
	    const T y = w - p / 3.0f / w; \
	    const T root = y + 1.0f; \
	    *mu_ = std::max(root * root, mu_limit * mu_limit); \
	    *lr_ = std::pow(1.0f - std::sqrt(*mu_), 2) / *g_norm2_min_deb_; \
	    MovingAverage(1, mu_, mu_avg_, mu_avg_out_, mu_deb_); \
	    MovingAverage(1, lr_, lr_avg_, lr_avg_out_, lr_deb_); \
	  }

	CAFFE2_YELLOWFIN_GETLRMU(float)
	#undef CAFFE2_YELLOWFIN_GETLRMU

	// Defines the CPU specialization of
	// YellowFinOp<T, CPUContext>::MomentumSgdUpdate(): one momentum-SGD step over
	// D_ elements using the averaged momentum (*mu_avg_out_) and learning rate
	// (*lr_avg_out_). Usually the step runs in place, i.e.
	// moment_ == moment_out_ && param_ == param_out_, which is why the Nesterov
	// branch saves the old moment before overwriting it.
	#define CAFFE2_YELLOWFIN_MOMENTUMSGDUPDATE(T) \
	  template <> \
	  void YellowFinOp<T, CPUContext>::MomentumSgdUpdate() { \
	    const T momentum_coeff = *mu_avg_out_; \
	    const T step_size = *lr_avg_out_; \
	    if (nesterov_) { \
	      for (int k = 0; k < D_; ++k) { \
	        const T prev_moment = moment_[k]; \
	        moment_out_[k] = momentum_coeff * prev_moment + step_size * grad_[k]; \
	        param_out_[k] = param_[k] - (1 + momentum_coeff) * moment_out_[k] + \
	            momentum_coeff * prev_moment; \
	      } \
	      return; \
	    } \
	    for (int k = 0; k < D_; ++k) { \
	      moment_out_[k] = momentum_coeff * moment_[k] + step_size * grad_[k]; \
	      param_out_[k] = param_[k] - moment_out_[k]; \
	    } \
	  }

	CAFFE2_YELLOWFIN_MOMENTUMSGDUPDATE(float)
	#undef CAFFE2_YELLOWFIN_MOMENTUMSGDUPDATE

	} // caffe2