#ifndef CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_
#define CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <list>
#include <map>
#include <memory>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
// LearningRateFunctor is a functor that, when fed an iteration number,
// produces the learning rate for the corresponding iteration.
template <typename T>
class LearningRateFunctor {
public:
virtual ~LearningRateFunctor() {}
virtual T operator()(const int64_t iter) const = 0;
};
// Fixed: not changing the learning rate at all.
template <typename T>
class FixedLearningRate : public LearningRateFunctor<T> {
public:
T operator()(const int64_t /*iter*/) const override {
return 1.;
}
};
// Alternate: alternate the learning rate between active_period and
// inactive_period. Update for a duration of active_period and then stop for a
// duration of inactive_period if active_first, and vice versa otherwise.
template <typename T>
class AlternateLearningRate : public LearningRateFunctor<T> {
public:
AlternateLearningRate(
const int64_t active_period,
const int64_t inactive_period,
const bool active_first)
: active_period_(active_period),
inactive_period_(inactive_period),
active_first_(active_first) {}
T operator()(const int64_t iter) const override {
if (iter % (active_period_ + inactive_period_) <
(active_first_ ? active_period_ : inactive_period_)) {
return active_first_ ? 1. : 0.;
} else {
return active_first_ ? 0. : 1.;
}
}
int64_t active_period_;
int64_t inactive_period_;
bool active_first_;
};
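// Usage sketch (hypothetical values): 10 active iterations followed by 5
// inactive ones, starting active:
//   AlternateLearningRate<float> lr(10, 5, true);
//   lr(3);   // 1.0 (inside the active_period)
//   lr(12);  // 0.0 (inside the inactive_period)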
// Step: return gamma ^ (floor(iter / stepsize))
template <typename T>
class StepLearningRate : public LearningRateFunctor<T> {
public:
StepLearningRate(const int stepsize, const T gamma)
: stepsize_(stepsize), gamma_(gamma) {}
T operator()(const int64_t iter) const override {
return std::pow(gamma_, static_cast<T>(iter / stepsize_));
}
int stepsize_;
T gamma_;
};
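// Usage sketch (hypothetical values): decay by 10x every 100 iterations:
//   StepLearningRate<float> lr(100, 0.1f);
//   lr(50);   // 1.0  (0.1^0)
//   lr(250);  // 0.01 (0.1^2, since floor(250 / 100) == 2)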
// Exp: return gamma ^ iter
template <typename T>
class ExpLearningRate : public LearningRateFunctor<T> {
public:
explicit ExpLearningRate(const T gamma) : gamma_(gamma) {}
T operator()(const int64_t iter) const override {
return std::pow(gamma_, static_cast<T>(iter));
}
T gamma_;
};
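// Usage sketch (hypothetical values): multiply by gamma every iteration:
//   ExpLearningRate<float> lr(0.99f);
//   lr(0);    // 1.0
//   lr(100);  // 0.99^100, roughly 0.366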
// Gate: return multiplier_1 if before num_iter, else multiplier_2
template <typename T>
class GateLearningRate : public LearningRateFunctor<T> {
public:
GateLearningRate(
const T multiplier_1,
const T multiplier_2,
const int64_t num_iter)
: multiplier_1_(multiplier_1),
multiplier_2_(multiplier_2),
num_iter_(num_iter) {}
T operator()(const int64_t iter) const override {
if (iter >= int64_t(num_iter_)) {
return T(multiplier_2_);
}
return T(multiplier_1_);
}
T multiplier_1_;
T multiplier_2_;
uint64_t num_iter_;
};
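// Usage sketch (hypothetical values): hold a 0.1x multiplier for the first
// 1000 iterations, then switch to 1x:
//   GateLearningRate<float> lr(0.1f, 1.0f, 1000);
//   lr(500);   // 0.1
//   lr(1000);  // 1.0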
// Inv: return (1 + gamma * iter) ^ (-power)
template <typename T>
class InvLearningRate : public LearningRateFunctor<T> {
public:
InvLearningRate(const T gamma, const T power)
: gamma_(gamma), power_(power) {}
T operator()(const int64_t iter) const override {
return std::pow(T(1) + gamma_ * iter, -power_);
}
T gamma_;
T power_;
};
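// Usage sketch (hypothetical values):
//   InvLearningRate<float> lr(0.0001f, 0.75f);
//   lr(0);      // 1.0
//   lr(10000);  // (1 + 1)^-0.75, roughly 0.59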
// Poly: return (1 - iter/max_iter) ^ (power)
template <typename T>
class PolyLearningRate : public LearningRateFunctor<T> {
public:
PolyLearningRate(const T power, const int64_t max_iter)
: power_(power), max_iter_(max_iter) {}
T operator()(const int64_t iter) const override {
return std::pow(1 - T(iter) / T(max_iter_), power_);
}
T power_;
uint64_t max_iter_;
};
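// Usage sketch (hypothetical values): quadratic decay to 0 over 1000
// iterations (iter should stay within [0, max_iter]):
//   PolyLearningRate<float> lr(2.0f, 1000);
//   lr(500);   // (1 - 0.5)^2 == 0.25
//   lr(1000);  // 0.0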
// LinearWarmup: ramp linearly from start_multiplier to 1 over the first
// num_iter iterations, then return 1.
template <typename T>
class LinearWarmupLearningRate : public LearningRateFunctor<T> {
public:
LinearWarmupLearningRate(const T start_multiplier, const int64_t num_iter)
: start_multiplier_(start_multiplier), num_iter_(num_iter) {}
T operator()(const int64_t iter) const override {
if (iter >= int64_t(num_iter_)) {
return 1.;
}
return start_multiplier_ +
(1. - start_multiplier_) * T(iter) / T(num_iter_);
}
T start_multiplier_;
uint64_t num_iter_;
};
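// Usage sketch (hypothetical values): ramp from 0.1 to 1.0 over 100
// iterations:
//   LinearWarmupLearningRate<float> lr(0.1f, 100);
//   lr(0);    // 0.1
//   lr(50);   // 0.55
//   lr(100);  // 1.0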
// ConstantWarmup: return multiplier when iter < num_iter, and 1 otherwise.
template <typename T>
class ConstantWarmupLearningRate : public LearningRateFunctor<T> {
public:
ConstantWarmupLearningRate(const T multiplier, const int64_t num_iter)
: multiplier_(multiplier), num_iter_(num_iter) {}
T operator()(const int64_t iter) const override {
if (iter >= int64_t(num_iter_)) {
return 1.;
}
return T(multiplier_);
}
T multiplier_;
uint64_t num_iter_;
};
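// Usage sketch (hypothetical values):
//   ConstantWarmupLearningRate<float> lr(0.5f, 100);
//   lr(99);   // 0.5
//   lr(100);  // 1.0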
// PieceWarmup: return m1 when iter < n1, m2 when iter < n2, and m3 otherwise.
template <typename T>
class PieceWarmupLearningRate : public LearningRateFunctor<T> {
public:
PieceWarmupLearningRate(
const T m1,
const int64_t n1,
const T m2,
const int64_t n2,
const T m3)
: m1_(m1), m2_(m2), m3_(m3), n1_(n1), n2_(n2) {}
T operator()(const int64_t iter) const override {
if (iter < int64_t(n1_)) {
return m1_;
} else if (iter < int64_t(n2_)) {
return m2_;
}
return m3_;
}
T m1_, m2_, m3_;
uint64_t n1_, n2_;
};
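// Usage sketch (hypothetical values): 0.1x until iteration 100, 0.5x until
// iteration 200, then 1x:
//   PieceWarmupLearningRate<float> lr(0.1f, 100, 0.5f, 200, 1.0f);
//   lr(50);   // 0.1
//   lr(150);  // 0.5
//   lr(250);  // 1.0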
// Hill: the learning rate changes according to the following 3 stages:
// 1) linear warmup (increasing) from start_multiplier over the first num_iter
//    steps
// 2) inverse shrink (decreasing) afterwards, controlled by (gamma, power)
// 3) lower bounded by end_multiplier
template <typename T>
class HillLearningRate : public LearningRateFunctor<T> {
public:
HillLearningRate(
const int64_t num_iter,
const T start_multiplier,
const T gamma,
const T power,
const T end_multiplier)
: linear_warmup_lr_(start_multiplier, num_iter),
inv_lr_(gamma, power),
num_iter_(num_iter),
end_multiplier_(end_multiplier) {}
T operator()(const int64_t iter) const override {
if (iter < num_iter_) {
return linear_warmup_lr_(iter);
} else {
return std::max(end_multiplier_, inv_lr_(iter - num_iter_));
}
}
LinearWarmupLearningRate<T> linear_warmup_lr_;
InvLearningRate<T> inv_lr_;
int64_t num_iter_;
T end_multiplier_;
};
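// Usage sketch (hypothetical values): warm up from 0.1 over 100 iterations,
// then decay as (1 + 0.01 * (iter - 100))^-0.5, never dropping below 0.2:
//   HillLearningRate<float> lr(100, 0.1f, 0.01f, 0.5f, 0.2f);
//   lr(50);    // 0.55 (still warming up)
//   lr(100);   // 1.0  (peak of the hill)
//   lr(2500);  // max(0.2, (1 + 24)^-0.5) == 0.2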
template <typename T>
class CompositeLearningRateItem {
public:
CompositeLearningRateItem(
int64_t num_iter,
float lr_scale,
LearningRateFunctor<T>* policy)
: num_iter_(num_iter), lr_scale_(lr_scale), policy_(policy) {}
int64_t num_iter_;
float lr_scale_;
LearningRateFunctor<T>* policy_;
};
// Composite: the learning rate policy changes according to the current
// iteration number.
template <typename T>
class CompositeLearningRate : public LearningRateFunctor<T> {
public:
CompositeLearningRate(
const std::list<CompositeLearningRateItem<T>>& sub_policies) {
DCHECK_GT(sub_policies.size(), 0);
int64_t num_iter_start = 1;
for (auto it = sub_policies.begin(); it != sub_policies.end(); ++it) {
DCHECK_GT(it->num_iter_, 0);
sub_policies_[num_iter_start].reset(it->policy_);
sub_policy_lr_scales_[num_iter_start] = it->lr_scale_;
num_iter_start += it->num_iter_;
}
}
T operator()(const int64_t iter) const override {
auto sub_policy = sub_policies_.upper_bound(iter);
DCHECK(sub_policy != sub_policies_.begin());
--sub_policy;
auto sub_policy_lr_scale = sub_policy_lr_scales_.upper_bound(iter);
DCHECK(sub_policy_lr_scale != sub_policy_lr_scales_.begin());
--sub_policy_lr_scale;
return ((*sub_policy->second)(iter)) * (sub_policy_lr_scale->second);
}
private:
std::map<int64_t, std::unique_ptr<LearningRateFunctor<T>>> sub_policies_;
std::map<int64_t, float> sub_policy_lr_scales_;
};
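// Usage sketch (hypothetical policies): 100 warmup iterations followed by
// exponential decay. CompositeLearningRate takes ownership of the raw policy
// pointers. Note that each sub-policy is evaluated at the global iter, not at
// an offset, and iter is expected to start at 1:
//   std::list<CompositeLearningRateItem<float>> items;
//   items.emplace_back(
//       100, 1.0f, new LinearWarmupLearningRate<float>(0.1f, 100));
//   items.emplace_back(900, 0.5f, new ExpLearningRate<float>(0.999f));
//   CompositeLearningRate<float> lr(items);
//   lr(50);   // linear warmup policy, scaled by 1.0
//   lr(150);  // exp policy evaluated at iter = 150, scaled by 0.5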
// Cyclical: return a multiplier that cycles between 1 (at base_lr) and
// max_lr / base_lr with period 2 * stepsize, optionally decayed once per
// cycle. See https://arxiv.org/pdf/1506.01186.pdf
template <typename T>
class CyclicalLearningRate : public LearningRateFunctor<T> {
public:
CyclicalLearningRate(
const T base_lr,
const T max_lr,
const int stepsize,
const T decay)
: base_lr_(base_lr),
max_lr_(max_lr),
stepsize_(stepsize),
decay_(decay) {}
T operator()(const int64_t iter) const override {
const int64_t cycle = (iter / (2 * stepsize_)) + 1;
T x = std::abs(static_cast<T>(iter) / stepsize_ - 2 * cycle + 1);
return 1 +
(T(std::abs(max_lr_)) / T(std::abs(base_lr_)) - 1) *
std::max(T(0.0), (1 - x)) *
std::pow(decay_, static_cast<int>(iter / (2 * stepsize_)));
}
T base_lr_;
T max_lr_;
int stepsize_;
T decay_;
};
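// Usage sketch (hypothetical values): a triangular cycle between base_lr and
// max_lr with period 200 and no decay. The returned value is a multiplier
// relative to base_lr:
//   CyclicalLearningRate<float> lr(0.01f, 0.1f, 100, 1.0f);
//   lr(0);    // 1.0  (at base_lr)
//   lr(50);   // 5.5  (halfway up)
//   lr(100);  // 10.0 (at max_lr, i.e. max_lr / base_lr)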
// ConstantThenLinearWarmup: first use a constant multiplier, then ramp up
// linearly to the global lr.
template <typename T>
class ConstantThenLinearWarmupLearningRate : public LearningRateFunctor<T> {
public:
ConstantThenLinearWarmupLearningRate(
const T start_warmup_multiplier,
const int64_t constant_warmup_num_iter,
const int64_t linear_warmup_num_iter)
: constant_warmup_num_iter_(constant_warmup_num_iter),
linear_warmup_num_iter_(linear_warmup_num_iter),
constant_warmup_lr_(start_warmup_multiplier, constant_warmup_num_iter),
linear_warmup_lr_(start_warmup_multiplier, linear_warmup_num_iter) {}
T operator()(const int64_t iter) const override {
if (iter < constant_warmup_num_iter_) {
return constant_warmup_lr_(iter);
} else if (iter < constant_warmup_num_iter_ + linear_warmup_num_iter_) {
return linear_warmup_lr_(iter - constant_warmup_num_iter_);
} else {
return 1.0;
}
}
int64_t constant_warmup_num_iter_;
int64_t linear_warmup_num_iter_;
ConstantWarmupLearningRate<T> constant_warmup_lr_;
LinearWarmupLearningRate<T> linear_warmup_lr_;
};
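// Usage sketch (hypothetical values): 0.1x for 100 iterations, then a linear
// ramp to 1x over the next 100:
//   ConstantThenLinearWarmupLearningRate<float> lr(0.1f, 100, 100);
//   lr(50);   // 0.1
//   lr(150);  // 0.55
//   lr(250);  // 1.0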
// CompositeCyclicalLearningRate: first use a constant multiplier, then ramp
// up to the global lr, and then switch to a cyclical learning rate.
template <typename T>
class CompositeCyclicalLearningRate : public LearningRateFunctor<T> {
public:
CompositeCyclicalLearningRate(
const T base_lr,
const T start_warmup_multiplier,
const int64_t constant_warmup_num_iter,
const int64_t linear_warmup_num_iter,
const T cyclical_max_lr,
const int cyclical_step_size,
const T cyclical_decay)
: constant_warmup_num_iter_(constant_warmup_num_iter),
linear_warmup_num_iter_(linear_warmup_num_iter),
constant_then_linear_warmup_lr_(
start_warmup_multiplier,
constant_warmup_num_iter,
linear_warmup_num_iter),
cyclical_lr_(
base_lr,
cyclical_max_lr,
cyclical_step_size,
cyclical_decay) {}
T operator()(const int64_t iter) const override {
if (iter < constant_warmup_num_iter_ + linear_warmup_num_iter_) {
return constant_then_linear_warmup_lr_(iter);
}
return cyclical_lr_(
iter - constant_warmup_num_iter_ - linear_warmup_num_iter_);
}
int64_t constant_warmup_num_iter_;
int64_t linear_warmup_num_iter_;
ConstantThenLinearWarmupLearningRate<T> constant_then_linear_warmup_lr_;
CyclicalLearningRate<T> cyclical_lr_;
};
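// Usage sketch (hypothetical values): the warmup from above, then a
// triangular cycle between base_lr 0.01 and max_lr 0.1:
//   CompositeCyclicalLearningRate<float> lr(
//       0.01f, 0.1f, 100, 100, 0.1f, 100, 1.0f);
//   lr(150);  // 0.55 (still warming up)
//   lr(200);  // 1.0  (cyclical phase starts at its base)
//   lr(300);  // 10.0 (peak of the first cycle)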
} // namespace caffe2
#endif // CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_