#include "caffe2/sgd/lars_op.h"

namespace caffe2 {

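// Computes the rescaled local LARS learning rate for a single parameter blob.
// Given the precomputed L2 norms of the parameter (*X_norm) and of its
// gradient (*dX_norm), the trust coefficient, the weight decay wd, and the
// numerical offset, the raw value is
//   (*trust) / (*dX_norm / *X_norm + *wd + offset),
// which is then clipped to the interval [lr_min, *lr_max]. If *X_norm is
// zero, the raw value defaults to 1.0 before clipping.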
template <>
void LarsOp<float, CPUContext>::ComputeLearningRate(
    const float* wd,
    const float* trust,
    const float* lr_max,
    float offset,
    float lr_min,
    float* X_norm,
    float* dX_norm,
    float* lr_rescaled) {
  float val = 1.0;

  if (*X_norm > 0) {
    val = (*trust) / (*dX_norm / *X_norm + (*wd) + offset);
  }
  *lr_rescaled = fmaxf(fminf(val, *lr_max), lr_min);
}
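
// Illustrative worked example (the numbers below are assumptions chosen for
// clarity, not values taken from this file or its tests):
//   *trust = 0.02, *wd = 0, offset = 0, *X_norm = 1, *dX_norm = 0.1,
//   *lr_max = 0.05, lr_min = 1e-6
//   val          = 0.02 / (0.1 / 1 + 0 + 0) = 0.2
//   *lr_rescaled = fmaxf(fminf(0.2, 0.05), 1e-6) = 0.05   (clipped to lr_max)
// If *X_norm were 0, val would stay at 1.0 and the output would again be
// clipped down to 0.05.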

REGISTER_CPU_OPERATOR(Lars, LarsOp<float, CPUContext>);

OPERATOR_SCHEMA(Lars)
    .NumInputs(5)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Implements Layer-wise Adaptive Rate Scaling (LARS) with clipping. Before adding weight
decay, given a parameter tensor X and its gradient dX, the local learning rate
for X will be

local_lr = trust * norm(X) / ( norm(dX) + wd * norm(X) + offset * norm(X) )

         = trust / ( norm(dX) / norm(X) + wd + offset ),

where offset is a preset hyper-parameter to avoid numerical issues and trust
indicates how much we trust the layer to change its parameters during one update.
In this implementation, we use the L2 norm, and the computed local learning rate is
clipped based on the upper bound lr_max and the lower bound lr_min:

local_lr = min(local_lr, lr_max) and local_lr = max(local_lr, lr_min)

)DOC")
    .Input(0, "X", "Parameter tensor")
    .Input(1, "dX", "Gradient tensor")
    .Input(2, "wd", "Weight decay")
    .Input(3, "trust", "Trust")
    .Input(4, "lr_max", "Upper bound of learning rate")
    .Output(0, "lr_rescaled", "Rescaled local learning rate")
    .Arg("offset", "Rescaling offset parameter")
    .Arg("lr_min", "Minimum learning rate for clipping");

SHOULD_NOT_DO_GRADIENT(Lars);
} // namespace caffe2