| from __future__ import absolute_import |
| from __future__ import division |
| from __future__ import print_function |
| |
| from caffe2.python import core |
| from caffe2.proto import caffe2_pb2 |
| |
| |
def _build_lr(model, base_learning_rate, policy="fixed", iter_val=0,
              **other_lr_params):
    """Add an iteration counter ("ITER") and a scheduled learning-rate blob
    ("LR") to the model; returns (LR, ITER)."""
    # Keep the iteration counter and the learning-rate schedule on the CPU,
    # regardless of where the rest of the model runs.
| with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): |
| ITER = model.param_init_net.ConstantFill([], "ITER", shape=[1], |
| value=iter_val, |
| dtype=core.DataType.INT32) |
| |
| model.net.Iter(ITER, ITER) |
| |
        # Since we are minimizing, the update is a "descent": the learning
        # rate is negated here so that the update operators can simply add
        # the LR-scaled gradient terms to the parameters.
| LR = model.net.LearningRate( |
| [ITER], |
| "LR", |
| base_lr=-base_learning_rate, |
| policy=policy, |
| **other_lr_params |
| ) |
| return LR, ITER |
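
# Usage sketch (hypothetical, for illustration only): the builders below call
# _build_lr to obtain the shared counter and learning-rate blobs, e.g.
#
#     LR, ITER = _build_lr(model, base_learning_rate=0.01,
#                          policy="step", stepsize=10000, gamma=0.999)
#
# which adds "ITER" (incremented once per net run) and "LR" (the scheduled,
# already-negated learning rate) to the model's nets.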
| |
| |
def _dedup(model, dedup_indices, grad):
    """Optionally collapse duplicate indices in a sparse GradientSlice."""
    assert isinstance(grad, core.GradientSlice), \
        "_dedup is only meaningful for sparse gradients (GradientSlice)"
| # TODO(dzhulgakov): find a better place to do deduplication |
| if dedup_indices: |
| return model.net.DeduplicateGradientSlices(grad) |
| else: |
| return grad |
| |
| |
def build_sgd(model, base_learning_rate, policy="fixed", **other_lr_params):
    """Attach plain SGD update operators for every (param, grad) pair."""
| LR, _ = _build_lr(model, base_learning_rate, policy, **other_lr_params) |
| |
    # A constant 1.0 used as the weight of the current parameter value in the
    # (Scatter)WeightedSum updates below.
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
| for param, grad in model.GetOptimizationPairs().items(): |
| if isinstance(grad, core.GradientSlice): |
| model.ScatterWeightedSum( |
| [param, ONE, grad.indices, grad.values, LR], param |
| ) |
| else: |
| model.WeightedSum([param, ONE, grad, LR], param) |
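
# Example (hypothetical sketch): assuming `model` is a CNNModelHelper whose
# gradients have already been added (e.g. via model.AddGradientOperators),
# plain SGD could be attached as
#
#     build_sgd(model, base_learning_rate=0.01,
#               policy="step", stepsize=10000, gamma=0.999)
#
# Dense parameters are updated in place by WeightedSum
# (param <- 1.0 * param + LR * grad, with LR already negated); sparse
# gradients touch only the rows named by grad.indices via ScatterWeightedSum.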
| |
| |
def build_ftrl(model, dedup_indices=False, engine="SIMD", **params):
    """Attach FTRL update operators; extra kwargs are forwarded to the
    Ftrl/SparseFtrl ops."""
    if engine == "SIMD":
        # The SIMD-specialized kernels are optional at build time; make sure
        # they are registered before asking for them.
        assert core.IsOperator('Ftrl_ENGINE_SIMD')
        assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
| for param, grad in model.GetOptimizationPairs().items(): |
| # allocate additional args of the same shape as main weights |
| nz = model.param_init_net.ConstantFill( |
| [param], |
| param + "_ftrl_nz", |
| extra_shape=[2], |
| value=0.0 |
| ) |
| if isinstance(grad, core.GradientSlice): |
| g = _dedup(model, dedup_indices, grad) |
| model.SparseFtrl([param, nz, g.indices, g.values], |
| [param, nz], engine=engine, **params) |
| else: |
| model.Ftrl([param, nz, grad], [param, nz], engine=engine, **params) |
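
# Example (hypothetical sketch): FTRL keeps two extra accumulators per weight,
# packed into the "<param>_ftrl_nz" blob allocated above; typical usage might
# look like
#
#     build_ftrl(model, engine="SIMD",
#                alpha=0.01, beta=1.0, lambda1=0.001, lambda2=0.001)
#
# with alpha/beta/lambda1/lambda2 forwarded unchanged to Ftrl/SparseFtrl.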
| |
| |
def build_adagrad(model, base_learning_rate, dedup_indices=False,
                  parameters=None, **params):
    """Attach AdaGrad update operators; if `parameters` is given, only those
    parameters are optimized. Extra kwargs are forwarded to the ops."""
    LR, _ = _build_lr(model, base_learning_rate, policy="fixed")
    param_to_grad = model.GetOptimizationPairs(parameters)
| |
| for param, grad in param_to_grad.items(): |
| # allocate additional args of the same shape as main weights |
| moment = model.param_init_net.ConstantFill( |
| [param], |
| param + "_square_sum", |
| value=0.0 |
| ) |
| if isinstance(grad, core.GradientSlice): |
| g = _dedup(model, dedup_indices, grad) |
            model.SparseAdagrad(
                [param, moment, g.indices, g.values, LR], [param, moment],
                **params
            )
        else:
| model.Adagrad([param, moment, grad, LR], [param, moment], **params) |
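
# Example (hypothetical sketch): AdaGrad with a fixed base learning rate and
# an epsilon forwarded to the Adagrad/SparseAdagrad ops:
#
#     build_adagrad(model, base_learning_rate=0.01, epsilon=1e-4)
#
# The "<param>_square_sum" blob allocated above accumulates the per-weight
# squared gradients that scale the updates.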
| |
| |
def build_adam(model, base_learning_rate, dedup_indices=False, iter_val=0,
               **params):
    """Attach Adam update operators; extra kwargs (e.g. beta1, beta2, epsilon)
    are forwarded to the Adam/SparseAdam ops."""
    LR, ITER = _build_lr(model, base_learning_rate, policy="fixed",
                         iter_val=iter_val)
| for param, grad in model.GetOptimizationPairs().items(): |
| # allocate additional args of the same shape as main weights |
| # TODO(nvivek): Fuse input moments if perf critical. |
| # Currently keeping it separate to keep the math cleaner |
| m1 = model.param_init_net.ConstantFill( |
| [param], |
| param + "_first_moment", |
| value=0.0 |
| ) |
| m2 = model.param_init_net.ConstantFill( |
| [param], |
| param + "_second_moment", |
| value=0.0 |
| ) |
| if isinstance(grad, core.GradientSlice): |
| g = _dedup(model, dedup_indices, grad) |
            model.SparseAdam(
                [param, m1, m2, g.indices, g.values, LR, ITER], [param, m1, m2],
                **params
            )
        else:
| model.Adam([param, m1, m2, grad, LR, ITER], [param, m1, m2], |
| **params) |
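
# Example (hypothetical sketch): Adam with typical moment decay rates and an
# epsilon forwarded to the Adam/SparseAdam ops:
#
#     build_adam(model, base_learning_rate=0.001,
#                beta1=0.9, beta2=0.999, epsilon=1e-8)
#
# "<param>_first_moment" / "<param>_second_moment" hold the running moment
# estimates, and the shared ITER blob drives the bias correction.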