from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from caffe2.python import core
from caffe2.proto import caffe2_pb2


def _build_lr(model, base_learning_rate, policy="fixed", iter_val=0,
**other_lr_params):
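    """Adds an iteration counter blob (ITER) and a learning-rate blob (LR)
    to the model and returns (LR, ITER).

    LR follows the given policy (e.g. "fixed") and is negated so that the
    weighted-sum style updates in the builders below perform descent.
    """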
    # The iteration counter is created and incremented on the CPU regardless
    # of the training device, hence the explicit device scope.
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
ITER = model.param_init_net.ConstantFill([], "ITER", shape=[1],
value=iter_val,
dtype=core.DataType.INT32)
model.net.Iter(ITER, ITER)
    # Since we are minimizing, the update is a "descent" step: the learning
    # rate is negated here so that the weighted-sum updates add -LR * grad.
LR = model.net.LearningRate(
[ITER],
"LR",
base_lr=-base_learning_rate,
policy=policy,
**other_lr_params
)
return LR, ITER


def _dedup(model, dedup_indices, grad):
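    """Collapses duplicate indices in a sparse gradient so that each row is
    updated only once when dedup_indices is True."""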
    assert isinstance(grad, core.GradientSlice)
# TODO(dzhulgakov): find a better place to do deduplication
if dedup_indices:
return model.net.DeduplicateGradientSlices(grad)
else:
return grad


def build_sgd(model, base_learning_rate, policy="fixed", **other_lr_params):
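    """Adds plain SGD update operators for every (param, grad) pair in the
    model.

    Dense gradients use WeightedSum; sparse gradients (GradientSlice) use
    ScatterWeightedSum so that only the touched rows are updated.
    """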
LR, _ = _build_lr(model, base_learning_rate, policy, **other_lr_params)
ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
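    # Update: param <- ONE * param + LR * grad. LR is already negative (see
    # _build_lr), so this is a descent step.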
for param, grad in model.GetOptimizationPairs().items():
if isinstance(grad, core.GradientSlice):
model.ScatterWeightedSum(
[param, ONE, grad.indices, grad.values, LR], param
)
else:
model.WeightedSum([param, ONE, grad, LR], param)


def build_ftrl(model, dedup_indices=False, engine="SIMD", **params):
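    """Adds FTRL update operators for every (param, grad) pair in the model.

    Each parameter gets an auxiliary "<param>_ftrl_nz" blob holding the n and
    z accumulators (hence extra_shape=[2] below). The SIMD engine is used by
    default and its availability is asserted.
    """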
if engine == "SIMD":
assert core.IsOperator('Ftrl_ENGINE_SIMD')
assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
for param, grad in model.GetOptimizationPairs().items():
        # allocate n/z accumulators shaped like the weights plus an extra
        # trailing dimension of 2 (extra_shape=[2])
nz = model.param_init_net.ConstantFill(
[param],
param + "_ftrl_nz",
extra_shape=[2],
value=0.0
)
if isinstance(grad, core.GradientSlice):
g = _dedup(model, dedup_indices, grad)
model.SparseFtrl([param, nz, g.indices, g.values],
[param, nz], engine=engine, **params)
else:
model.Ftrl([param, nz, grad], [param, nz], engine=engine, **params)


def build_adagrad(model, base_learning_rate, dedup_indices=False,
parameters=None, **params):
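    """Adds AdaGrad update operators with a fixed learning-rate policy.

    Each parameter gets a "<param>_square_sum" accumulator of squared
    gradients; sparse gradients go through SparseAdagrad, optionally after
    index deduplication.
    """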
LR, _ = _build_lr(model, base_learning_rate, policy="fixed")
param_to_grad = model.GetOptimizationPairs(parameters)
for param, grad in param_to_grad.items():
# allocate additional args of the same shape as main weights
moment = model.param_init_net.ConstantFill(
[param],
param + "_square_sum",
value=0.0
)
if isinstance(grad, core.GradientSlice):
g = _dedup(model, dedup_indices, grad)
model.SparseAdagrad(
[param, moment, g.indices, g.values, LR], [param, moment],
**params
)
else:
model.Adagrad([param, moment, grad, LR], [param, moment], **params)


def build_adam(model, base_learning_rate, dedup_indices=False, iter_val=0,
**params):
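    """Adds Adam update operators with a fixed learning-rate policy.

    Each parameter gets first- and second-moment accumulator blobs, and the
    iteration counter is passed to the Adam ops for bias correction.
    """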
LR, ITER = _build_lr(model, base_learning_rate, policy="fixed",
iter_val=iter_val)
for param, grad in model.GetOptimizationPairs().items():
# allocate additional args of the same shape as main weights
# TODO(nvivek): Fuse input moments if perf critical.
# Currently keeping it separate to keep the math cleaner
m1 = model.param_init_net.ConstantFill(
[param],
param + "_first_moment",
value=0.0
)
m2 = model.param_init_net.ConstantFill(
[param],
param + "_second_moment",
value=0.0
)
if isinstance(grad, core.GradientSlice):
g = _dedup(model, dedup_indices, grad)
model.SparseAdam(
[param, m1, m2, g.indices, g.values, LR, ITER], [param, m1, m2],
**params
)
else:
model.Adam([param, m1, m2, grad, LR, ITER], [param, m1, m2],
**params)
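

# Example usage (a minimal sketch, not part of this module's API): assumes a
# model helper that exposes GetOptimizationPairs(), e.g. a CNNModelHelper
# whose gradients were added with AddGradientOperators.
#
#     from caffe2.python import cnn
#     model = cnn.CNNModelHelper(name="train")
#     # ... forward pass, loss, model.AddGradientOperators([loss]) ...
#     build_sgd(model, base_learning_rate=0.1, policy="step",
#               stepsize=1, gamma=0.999)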