Implementation of cyclical learning rate (#23914)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/23914
Implementation of cyclical learning rate, see https://arxiv.org/pdf/1506.01186.pdf
Test Plan: canary: https://fburl.com/fblearner/siqb34md
Reviewed By: chenshouyuan
Differential Revision: D16632831
fbshipit-source-id: 20bd9d7fb61af5a8b594b039c5d434a0cc96fadc
diff --git a/caffe2/sgd/learning_rate_functors.h b/caffe2/sgd/learning_rate_functors.h
index 538c544..aec2435 100644
--- a/caffe2/sgd/learning_rate_functors.h
+++ b/caffe2/sgd/learning_rate_functors.h
@@ -250,6 +250,24 @@
std::map<int64_t, std::unique_ptr<LearningRateFunctor<T>>> sub_policies_;
};
+// Cyclical: return a learning rate scale with period 2 * stepsize and
+// lower bound base_lr, upper bound max_lr (triangular policy).
+// See https://arxiv.org/pdf/1506.01186.pdf
+template <typename T>
+class CyclicalLearningRate : public LearningRateFunctor<T> {
+ public:
+  CyclicalLearningRate(const T base_lr, const T max_lr, const int stepsize)
+      : base_lr_(base_lr), max_lr_(max_lr), stepsize_(stepsize) {}
+  T operator()(const int64_t iter) const override {
+    int cycle = static_cast<int>((iter / (2 * stepsize_)) + 1);
+    T x = std::abs(static_cast<T>(iter) / stepsize_ - 2 * cycle + 1); // std::abs: unqualified abs picks int abs(int) and truncates
+    return (1 + (T(max_lr_) / T(base_lr_) - 1) * std::max(T(0.0), (1 - x)));
+  }
+  T base_lr_;
+  T max_lr_;
+  int stepsize_;
+};
+
} // namespace caffe2
#endif // CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_
diff --git a/caffe2/sgd/learning_rate_op.cc b/caffe2/sgd/learning_rate_op.cc
index 2072b80..aa49715 100644
--- a/caffe2/sgd/learning_rate_op.cc
+++ b/caffe2/sgd/learning_rate_op.cc
@@ -32,6 +32,7 @@
`alter`: uses `active_first`, `active_period`, `inactive_period`
`hill`: uses those in both `linearWarmup` and `inv`, plus `end_multiplier`
`composite`: uses `sub_policy_num_iters` and additional args with format
+    `cyclical`: uses `max_lr`, `stepsize`
sub_policy_{sub_policy_index}_{sub_policy_arg}, for example:
sub_policy_0_policy: "exp", sub_policy_0_gamma: 0.99,
sub_policy_0_lr_scale: 1.2
@@ -40,6 +41,7 @@
Optional:
`stepsize`: defaults to 0
+ `max_lr`: defaults to 0.005
`gamma`: defaults to 0
`power`: defaults to 0
`num_iter`: defaults to 0
@@ -68,6 +70,7 @@
.Arg("power", "(float, default 1.0) used only for inv policy type")
.Arg("gamma", "(float, default 1.0) momentum of change")
.Arg("stepsize", "(float, default 1.0) sampling rate on iterations")
+ .Arg("max_lr", "(float, default 0.005) max learning rate")
.Arg("active_first", "(boolean, default True) in alter policy")
.Arg("active_period", "(int64_t, required) in alter policy")
.Arg("inactive_period", "(int64_t, required) in alter policy")
diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h
index b355ed9..8a57a31 100644
--- a/caffe2/sgd/learning_rate_op.h
+++ b/caffe2/sgd/learning_rate_op.h
@@ -171,6 +171,14 @@
createLearningRateFunctor(sub_policy, sub_policy_arg_prefix_str)));
}
return new CompositeLearningRate<T>(sub_policies);
+ } else if (policy == "cyclical") {
+ T max_lr =
+ this->template GetSingleArgument<float>(arg_prefix + "max_lr", 0.005);
+ int stepsize =
+ this->template GetSingleArgument<int>(arg_prefix + "stepsize", 0);
+ DCHECK_GT(stepsize, 0);
+ DCHECK_GE(max_lr, base_lr_);
+ return new CyclicalLearningRate<T>(base_lr_, max_lr, stepsize);
} else {
CAFFE_THROW("Unknown learning rate policy: ", policy);
return NULL;