#include "adagrad_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(Adagrad, AdagradOp<float, CPUContext>);
OPERATOR_SCHEMA(Adagrad)
    .NumInputs(4)
    .NumOutputs(2)
    .AllowInplace({{0, 0}, {1, 1}})
    .SetDoc(R"DOC(
Computes the AdaGrad update for an input gradient and accumulated
history. Concretely, given inputs (param, moment, grad, lr), computes

    new_moment = moment + square(grad)
    new_grad = lr * grad / (sqrt(new_moment) + epsilon)
    new_param = param + new_grad

and returns (new_param, new_moment).
)DOC")
    .Input(0, "param", "Parameters to be updated")
    .Input(1, "moment", "Moment history")
    .Input(2, "grad", "Gradient computed")
    .Input(3, "lr", "Learning rate")
    .Output(0, "output_param", "Updated parameters")
    .Output(1, "output_moment", "Updated moment")
    .Arg("epsilon", "Small constant for numerical stability; default 1e-5");
REGISTER_CPU_OPERATOR(SparseAdagrad, SparseAdagradOp<float, CPUContext>);
OPERATOR_SCHEMA(SparseAdagrad)
    .NumInputs(5)
    .NumOutputs(2)
    .AllowInplace({{0, 0}, {1, 1}})
    .SetDoc(R"DOC(
Given inputs (param, moment, indices, grad, lr), runs the dense AdaGrad
update on the slices (param[indices], moment[indices], grad, lr), and
returns (new_param, new_moment) as in the dense case. Only the rows of
param and moment selected by indices are modified.
)DOC")
    .Input(0, "param", "Parameters to be updated")
    .Input(1, "moment", "Moment history")
    .Input(2, "indices", "Sparse indices")
    .Input(3, "grad", "Gradient computed")
    .Input(4, "lr", "Learning rate")
    .Output(0, "output_param", "Updated parameters")
    .Output(1, "output_moment", "Updated moment")
    .Arg("epsilon", "Small constant for numerical stability; default 1e-5");
SHOULD_NOT_DO_GRADIENT(Adagrad);
SHOULD_NOT_DO_GRADIENT(SparseAdagrad);
}  // namespace
}  // namespace caffe2