| #ifndef CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_ | 
 | #define CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_ | 
 |  | 
 | #include <cmath> | 
 | #include <list> | 
 | #include <map> | 
 |  | 
 | #ifdef _MSC_VER | 
 | #ifndef _USE_MATH_DEFINES | 
 | #define _USE_MATH_DEFINES | 
 | #endif | 
 | #include <math.h> | 
 | #endif // _MSC_VER | 
 |  | 
 | #include "caffe2/core/context.h" | 
 | #include "caffe2/core/operator.h" | 
 |  | 
 | namespace caffe2 { | 
 |  | 
 // LearningRateFunctor: abstract interface shared by every policy in this
 // file. A policy is a callable object that, given an iteration number,
 // produces the learning rate value for that iteration.
 template <typename T>
 class LearningRateFunctor {
  public:
   // Virtual so that concrete policies can be destroyed through a pointer to
   // this base class.
   virtual ~LearningRateFunctor() = default;

   // Evaluates the policy at iteration `iter`.
   virtual T operator()(const int64_t iter) const = 0;
 };
 |  | 
 | // Fixed: not changing the learning rate at all. | 
 | template <typename T> | 
 | class FixedLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   T operator()(const int64_t /*iter*/) const override { | 
 |     return 1.; | 
 |   } | 
 | }; | 
 |  | 
 | // Alter: alternatate learning rate with active_period and inactive_period. | 
 | // update for a duration of active_period and then stop for a duration of | 
 | // inactive_period if active_first, and vice versa | 
 | template <typename T> | 
 | class AlternateLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   AlternateLearningRate( | 
 |       const int64_t active_period, | 
 |       const int64_t inactive_period, | 
 |       const bool active_first) | 
 |       : active_period_(active_period), | 
 |         inactive_period_(inactive_period), | 
 |         active_first_(active_first) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     if (iter % (active_period_ + inactive_period_) < | 
 |         (active_first_ ? active_period_ : inactive_period_)) { | 
 |       return active_first_ ? 1. : 0.; | 
 |     } else { | 
 |       return active_first_ ? 0. : 1.; | 
 |     }; | 
 |   }; | 
 |  | 
 |   int64_t active_period_; | 
 |   int64_t inactive_period_; | 
 |   bool active_first_; | 
 | }; | 
 |  | 
 | // Step: return gamma ^ (floor(iter / step)) | 
 | template <typename T> | 
 | class StepLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   StepLearningRate(const int stepsize, const T gamma) | 
 |       : stepsize_(stepsize), gamma_(gamma) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     return std::pow(gamma_, static_cast<T>(iter / stepsize_)); | 
 |   } | 
 |  | 
 |   int stepsize_; | 
 |   T gamma_; | 
 | }; | 
 |  | 
 | // Exp: return gamma ^ iter | 
 | template <typename T> | 
 | class ExpLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   explicit ExpLearningRate(const T gamma) : gamma_(gamma) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     return std::pow(gamma_, static_cast<T>(iter)); | 
 |   } | 
 |  | 
 |   T gamma_; | 
 | }; | 
 |  | 
 | // Gate: return multiplier_1 if before num_iter, else multiplier_2 | 
 | template <typename T> | 
 | class GateLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   GateLearningRate( | 
 |       const T multiplier_1, | 
 |       const T multiplier_2, | 
 |       const int64_t num_iter) | 
 |       : multiplier_1_(multiplier_1), | 
 |         multiplier_2_(multiplier_2), | 
 |         num_iter_(num_iter) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     if (iter >= int64_t(num_iter_)) { | 
 |       return T(multiplier_2_); | 
 |     } | 
 |     return T(multiplier_1_); | 
 |   } | 
 |   T multiplier_1_; | 
 |   T multiplier_2_; | 
 |   uint64_t num_iter_; | 
 | }; | 
 |  | 
 | // Inv: return (1 + gamma * iter) ^ (-power) | 
 | template <typename T> | 
 | class InvLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   InvLearningRate(const T gamma, const T power) | 
 |       : gamma_(gamma), power_(power) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     return std::pow(T(1) + gamma_ * iter, -power_); | 
 |   } | 
 |   T gamma_; | 
 |   T power_; | 
 | }; | 
 |  | 
 | // Poly: return (1 - iter/max_iter) ^ (power) | 
 | template <typename T> | 
 | class PolyLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   PolyLearningRate(const T power, const int64_t max_iter) | 
 |       : power_(power), max_iter_(max_iter) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     return std::pow(1 - T(iter) / T(max_iter_), power_); | 
 |   } | 
 |   T power_; | 
 |   uint64_t max_iter_; | 
 | }; | 
 |  | 
 | // LinearWarmup: return max(iter/num_iter, 1) | 
 | template <typename T> | 
 | class LinearWarmupLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   LinearWarmupLearningRate(const T start_multiplier, const int64_t num_iter) | 
 |       : start_multiplier_(start_multiplier), num_iter_(num_iter) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     if (iter >= int64_t(num_iter_)) { | 
 |       return 1.; | 
 |     } | 
 |     return start_multiplier_ + | 
 |         (1. - start_multiplier_) * T(iter) / T(num_iter_); | 
 |   } | 
 |   T start_multiplier_; | 
 |   uint64_t num_iter_; | 
 | }; | 
 |  | 
 | // ConstantWarmup: return scale when iter < num_iter, and 1 otherwise | 
 | template <typename T> | 
 | class ConstantWarmupLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   ConstantWarmupLearningRate(const T multiplier, const int64_t num_iter) | 
 |       : multiplier_(multiplier), num_iter_(num_iter) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     if (iter >= int64_t(num_iter_)) { | 
 |       return 1.; | 
 |     } | 
 |     return T(multiplier_); | 
 |   } | 
 |   T multiplier_; | 
 |   uint64_t num_iter_; | 
 | }; | 
 |  | 
 | // ConstantWarmup: return scale when iter < num_iter, and 1 otherwise | 
 | template <typename T> | 
 | class PieceWarmupLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   PieceWarmupLearningRate( | 
 |       const T m1, | 
 |       const int64_t n1, | 
 |       const T m2, | 
 |       const int64_t n2, | 
 |       const T m3) | 
 |       : m1_(m1), m2_(m2), m3_(m3), n1_(n1), n2_(n2){}; | 
 |  | 
 |   T operator()(const int64_t iter) const override { | 
 |     if (iter < int64_t(n1_)) { | 
 |       return m1_; | 
 |     } else if (iter < int64_t(n2_)) { | 
 |       return m2_; | 
 |     } | 
 |     return m3_; | 
 |   } | 
 |  | 
 |   T m1_, m2_, m3_; | 
 |   uint64_t n1_, n2_; | 
 | }; | 
 |  | 
 | // hill: the learning rate changes according to following 3 stages | 
 | // 1) linear warmup (increasing) at first num_iter steps from start_multiplier | 
 | // 2) inverse shrink (decreasing) afterwards (gamma, power) | 
 | // 3) lower bounded by end_multiplier | 
 | template <typename T> | 
 | class HillLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   HillLearningRate( | 
 |       const int64_t num_iter, | 
 |       const T start_multiplier, | 
 |       const T gamma, | 
 |       const T power, | 
 |       const T end_multiplier) | 
 |       : linear_warmup_lr_(start_multiplier, num_iter), | 
 |         inv_lr_(gamma, power), | 
 |         num_iter_(num_iter), | 
 |         end_multiplier_(end_multiplier) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     if (iter < num_iter_) { | 
 |       return linear_warmup_lr_(iter); | 
 |     } else { | 
 |       return std::max(end_multiplier_, inv_lr_(iter - num_iter_)); | 
 |     } | 
 |   } | 
 |   LinearWarmupLearningRate<T> linear_warmup_lr_; | 
 |   InvLearningRate<T> inv_lr_; | 
 |   int64_t num_iter_; | 
 |   T end_multiplier_; | 
 | }; | 
 |  | 
 | // slope: the learning rate changes according to 2 stages | 
 | // 1) constantWarmup with multiplier_1 | 
 | // 2) linearly shink to multiplier_2: | 
 | //  max{ | 
 | //     multiplier_1 + (iter - num_iter_1) * (multiplier_2 - multiplier_1) / (num_iter_2 - num_iter_1), | 
 | //     multiplier_2 | 
 | //  } | 
 | template <typename T> | 
 | class SlopeLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   SlopeLearningRate( | 
 |       const int64_t num_iter_1, | 
 |       const T multiplier_1, | 
 |       const T num_iter_2, | 
 |       const T multiplier_2) | 
 |       : num_iter_1_(num_iter_1), | 
 |         multiplier_1_(multiplier_1), | 
 |         num_iter_2_(num_iter_2), | 
 |         multiplier_2_(multiplier_2) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     if (iter < num_iter_1_) { | 
 |       return multiplier_1_; | 
 |     } else { | 
 |       return std::max( | 
 |         multiplier_2_, | 
 |         multiplier_1_ + (iter - num_iter_1_) * (multiplier_2_ - multiplier_1_) / (num_iter_2_ - num_iter_1_) | 
 |       ); | 
 |     } | 
 |   } | 
 |   int64_t num_iter_1_; | 
 |   T multiplier_1_; | 
 |   int64_t num_iter_2_; | 
 |   T multiplier_2_; | 
 | }; | 
 |  | 
 | template <typename T> | 
 | class CompositeLearningRateItem { | 
 |  public: | 
 |   CompositeLearningRateItem( | 
 |       int64_t num_iter, | 
 |       float lr_scale, | 
 |       LearningRateFunctor<T>* policy) | 
 |       : num_iter_(num_iter), lr_scale_(lr_scale), policy_(policy) {} | 
 |   int64_t num_iter_; | 
 |   float lr_scale_; | 
 |   LearningRateFunctor<T>* policy_; | 
 | }; | 
 |  | 
 | // composite: the learning policy changes according to current iteration # | 
 | template <typename T> | 
 | class CompositeLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   CompositeLearningRate( | 
 |       const std::list<CompositeLearningRateItem<T>>& sub_policies) { | 
 |     TORCH_DCHECK_GT(sub_policies.size(), 0); | 
 |     int64_t num_iter_start = 1; | 
 |     for (auto it = sub_policies.begin(); it != sub_policies.end(); ++it) { | 
 |       TORCH_DCHECK_GT(it->num_iter_, 0); | 
 |       sub_policies_[num_iter_start].reset(it->policy_); | 
 |       sub_policy_lr_scales_[num_iter_start] = it->lr_scale_; | 
 |       num_iter_start += it->num_iter_; | 
 |     } | 
 |   } | 
 |   T operator()(const int64_t iter) const override { | 
 |     auto sub_policy = sub_policies_.upper_bound(iter); | 
 |     DCHECK(sub_policy != sub_policies_.begin()); | 
 |     --sub_policy; | 
 |     auto sub_policy_lr_scale = sub_policy_lr_scales_.upper_bound(iter); | 
 |     DCHECK(sub_policy_lr_scale != sub_policy_lr_scales_.begin()); | 
 |     --sub_policy_lr_scale; | 
 |     return ((*sub_policy->second)(iter)) * (sub_policy_lr_scale->second); | 
 |   } | 
 |  | 
 |  private: | 
 |   std::map<int64_t, std::unique_ptr<LearningRateFunctor<T>>> sub_policies_; | 
 |   std::map<int64_t, float> sub_policy_lr_scales_; | 
 | }; | 
 |  | 
 | // Cyclical: return a learning rate with period 2 * stepsize and | 
 | // lower bound base_lr, upper bound max_lr. | 
 | // See https://arxiv.org/pdf/1506.01186.pdf | 
 | template <typename T> | 
 | class CyclicalLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   CyclicalLearningRate( | 
 |       const T base_lr, | 
 |       const T max_lr, | 
 |       const int stepsize, | 
 |       const T decay) | 
 |       : base_lr_(base_lr), | 
 |         max_lr_(max_lr), | 
 |         stepsize_(stepsize), | 
 |         decay_(decay) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     int64_t cycle = static_cast<int>((iter / (2 * stepsize_)) + 1); | 
 |     T x = std::abs(static_cast<T>(iter) / stepsize_ - 2 * cycle + 1); | 
 |     return 1 + | 
 |         (T(std::abs(max_lr_)) / T(std::abs(base_lr_)) - 1) * std::max(T(0.0), (1 - x)) * | 
 |         std::pow(decay_, static_cast<int>(iter / (2 * stepsize_))); | 
 |   } | 
 |   T base_lr_; | 
 |   T max_lr_; | 
 |   int stepsize_; | 
 |   T decay_; | 
 | }; | 
 |  | 
 | // Cosine: return a learning rate with a cosine schedule | 
 | // lower bound min_lr, upper bound max_lr. | 
 | // See https://arxiv.org/pdf/1608.03983.pdf | 
 | template <typename T> | 
 | class CosineLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   CosineLearningRate( | 
 |       const T min_lr, | 
 |       const T max_lr, | 
 |       const int64_t period, | 
 |       const T t_mult, | 
 |       const T lr_shrink) | 
 |       : min_lr_(min_lr), | 
 |         max_lr_(max_lr), | 
 |         period_(period), | 
 |         t_mult_(t_mult), | 
 |         lr_shrink_(lr_shrink) {} | 
 |   T operator()(const int64_t iter) const override { | 
 |     T i, t_i, t_curr; | 
 |     if (t_mult_ != 1.0) { | 
 |       // the period is changed every time | 
 |       i = floor( | 
 |           log(1 - double(iter) / double(period_) * (1.0 - t_mult_)) / | 
 |           log(t_mult_)); | 
 |       t_i = pow(t_mult_, i) * period_; | 
 |       t_curr = iter - (1.0 - pow(t_mult_, i)) / (1.0 - t_mult_) * period_; | 
 |     } else { | 
 |       // fixed period | 
 |       i = floor(double(iter) / double(period_)); | 
 |       t_i = period_; | 
 |       t_curr = iter - t_i * i; | 
 |     } | 
 |     T lr_shrink = pow(lr_shrink_, i); | 
 |     T min_lr = min_lr_ * lr_shrink; | 
 |     T max_lr = max_lr_ * lr_shrink; | 
 |     T final_lr = | 
 |         min_lr + 0.5 * (max_lr - min_lr) * (1 + cos(M_PI * t_curr / t_i)); | 
 |     return final_lr; | 
 |   } | 
 |   T min_lr_; | 
 |   T max_lr_; | 
 |   int64_t period_; | 
 |   T t_mult_; | 
 |   T lr_shrink_; | 
 | }; | 
 |  | 
 | // constantThenLinearWarmup: first use a constant multiplier | 
 | // and then ramp up to the global lr | 
 | template <typename T> | 
 | class ConstantThenLinearWarmupLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   ConstantThenLinearWarmupLearningRate( | 
 |       const T start_warmup_multiplier, | 
 |       const int64_t constant_warmup_num_iter, | 
 |       const int64_t linear_warmup_num_iter) | 
 |       : constant_warmup_num_iter_(constant_warmup_num_iter), | 
 |         linear_warmup_num_iter_(linear_warmup_num_iter), | 
 |         constant_warmup_lr_(start_warmup_multiplier, constant_warmup_num_iter), | 
 |         linear_warmup_lr_(start_warmup_multiplier, linear_warmup_num_iter) {} | 
 |  | 
 |   T operator()(const int64_t iter) const override { | 
 |     if (iter < constant_warmup_num_iter_) { | 
 |       return constant_warmup_lr_(iter); | 
 |     } else if (iter < constant_warmup_num_iter_ + linear_warmup_num_iter_) { | 
 |       return linear_warmup_lr_(iter - constant_warmup_num_iter_); | 
 |     } else { | 
 |       return 1.0; | 
 |     } | 
 |   } | 
 |   int64_t constant_warmup_num_iter_; | 
 |   int64_t linear_warmup_num_iter_; | 
 |   ConstantWarmupLearningRate<T> constant_warmup_lr_; | 
 |   LinearWarmupLearningRate<T> linear_warmup_lr_; | 
 | }; | 
 |  | 
 | // CompositeCosineLearningRate: first use a constant multiplier | 
 | // and then ramp up to the global lr, and then use a cosine learning rate | 
 | template <typename T> | 
 | class CompositeCosineLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   CompositeCosineLearningRate( | 
 |       const T start_warmup_multiplier, | 
 |       const int64_t constant_warmup_num_iter, | 
 |       const int64_t linear_warmup_num_iter, | 
 |       const T cosine_min_lr, | 
 |       const T cosine_max_lr, | 
 |       const int64_t cosine_period, | 
 |       const T consine_t_mult, | 
 |       const T cosine_lr_shrink) | 
 |       : constant_warmup_num_iter_(constant_warmup_num_iter), | 
 |         linear_warmup_num_iter_(linear_warmup_num_iter), | 
 |         constant_then_linear_warmup_lr_( | 
 |             start_warmup_multiplier, | 
 |             constant_warmup_num_iter, | 
 |             linear_warmup_num_iter), | 
 |         cosine_lr_( | 
 |             cosine_min_lr, | 
 |             cosine_max_lr, | 
 |             cosine_period, | 
 |             consine_t_mult, | 
 |             cosine_lr_shrink) {} | 
 |  | 
 |   T operator()(const int64_t iter) const override { | 
 |     if (iter < constant_warmup_num_iter_ + linear_warmup_num_iter_) { | 
 |       return constant_then_linear_warmup_lr_(iter); | 
 |     } | 
 |     return cosine_lr_( | 
 |         iter - constant_warmup_num_iter_ - linear_warmup_num_iter_); | 
 |   } | 
 |  | 
 |   int64_t constant_warmup_num_iter_; | 
 |   int64_t linear_warmup_num_iter_; | 
 |   ConstantThenLinearWarmupLearningRate<T> constant_then_linear_warmup_lr_; | 
 |   CosineLearningRate<T> cosine_lr_; | 
 | }; | 
 |  | 
 | // CompositeCyclicalLearningRate: first use a constant multiplier | 
 | // and then ramp up to the global lr, and then use a cyclical learning rate | 
 | template <typename T> | 
 | class CompositeCyclicalLearningRate : public LearningRateFunctor<T> { | 
 |  public: | 
 |   CompositeCyclicalLearningRate( | 
 |       const T base_lr, | 
 |       const T start_warmup_multiplier, | 
 |       const int64_t constant_warmup_num_iter, | 
 |       const int64_t linear_warmup_num_iter, | 
 |       const T cyclical_max_lr, | 
 |       const int cyclical_step_size, | 
 |       const T cyclical_decay) | 
 |       : constant_warmup_num_iter_(constant_warmup_num_iter), | 
 |         linear_warmup_num_iter_(linear_warmup_num_iter), | 
 |         constant_then_linear_warmup_lr_( | 
 |             start_warmup_multiplier, | 
 |             constant_warmup_num_iter, | 
 |             linear_warmup_num_iter), | 
 |         cyclical_lr_( | 
 |             base_lr, | 
 |             cyclical_max_lr, | 
 |             cyclical_step_size, | 
 |             cyclical_decay) {} | 
 |  | 
 |   T operator()(const int64_t iter) const override { | 
 |     if (iter < constant_warmup_num_iter_ + linear_warmup_num_iter_) { | 
 |       return constant_then_linear_warmup_lr_(iter); | 
 |     } | 
 |     return cyclical_lr_( | 
 |         iter - constant_warmup_num_iter_ - linear_warmup_num_iter_); | 
 |   } | 
 |  | 
 |   int64_t constant_warmup_num_iter_; | 
 |   int64_t linear_warmup_num_iter_; | 
 |   ConstantThenLinearWarmupLearningRate<T> constant_then_linear_warmup_lr_; | 
 |   CyclicalLearningRate<T> cyclical_lr_; | 
 | }; | 
 |  | 
 | } // namespace caffe2 | 
 |  | 
 | #endif // CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_ |