## @package optimizer
# Module caffe2.python.optimizer
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from collections import namedtuple
from past.builtins import basestring
from caffe2.python import core, scope
from caffe2.python.modeling import parameter_info
from caffe2.proto import caffe2_pb2
_OPTIMIZER_ITERATION_NAME = "optimizer_iteration"
AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"])
class Optimizer(object):
def __init__(self):
self._aux_params = AuxOptimizerParams(local=[], shared=[])
    def __call__(self, net, param_init_net, param, grad=None):
        '''
        Adds optimization operators to the net for the given parameter and
        its gradient. The parameter is specified either by 'param' being a
        ParameterInfo object (in which case param.grad must be set), or by
        'param' being a BlobReference and 'grad' being the BlobReference for
        its gradient.
        '''
if grad is None:
assert isinstance(param, parameter_info.ParameterInfo)
assert param.grad is not None
else:
if isinstance(param, basestring):
param = core.BlobReference(param)
param = parameter_info.ParameterInfo(
param_id=None, param=param, grad=grad)
self._run(net, param_init_net, param)
def _run(self, net, param_init_net, param_info):
        raise NotImplementedError(
            "Optimizer subclasses must implement the _run method")
@staticmethod
def build_lr(net, param_init_net, base_learning_rate,
learning_rate_blob="lr", policy="fixed",
iter_val=0, **kwargs):
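        """Returns an (lr, iteration) pair of blobs for this parameter.

        Creates (or reuses) a single CPU-resident 'optimizer_iteration'
        counter, increments it atomically on every step via AtomicIter, and
        adds a LearningRate op that derives the rate from that counter
        according to 'policy'. The base_learning_rate is negated because the
        optimizers below perform descent (see the comment further down).
        """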
if not param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME):
# Add training operators.
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
iteration = param_init_net.ConstantFill(
[], _OPTIMIZER_ITERATION_NAME, shape=[1],
value=iter_val,
dtype=core.DataType.INT64)
iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"])
net.AtomicIter([iter_mutex, iteration], [iteration])
else:
iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)
        # Since we are minimizing (i.e. doing gradient "descent"), the
        # learning rate is negated here.
lr = net.LearningRate(
[iteration],
learning_rate_blob,
base_lr=-base_learning_rate,
policy=policy,
**kwargs
)
return lr, iteration
@staticmethod
def dedup(net, sparse_dedup_aggregator, grad):
assert (isinstance(grad, core.GradientSlice))
if sparse_dedup_aggregator:
return net.DeduplicateGradientSlices(
grad, aggregator=sparse_dedup_aggregator)
else:
return grad
def get_auxiliary_parameters(self):
"""Returns a list of auxiliary parameters.
Returns:
aux_params: A namedtuple, AuxParams.
aux_params.local stores a list of blobs. Each blob is a local
auxiliary parameter. A local auxiliary parameter is a parameter in
parallel to a learning rate parameter. Take adagrad as an example,
the local auxiliary parameter is the squared sum parameter, because
every learning rate has a squared sum associated with it.
aux_params.shared also stores a list of blobs. Each blob is a shared
auxiliary parameter. A shared auxiliary parameter is a parameter
that is shared across all the learning rate parameters. Take adam as
an example, the iteration parameter is a shared parameter, because
all the learning rates share the same iteration parameter.
"""
return self._aux_params
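    # Example (a hedged sketch; assumes a ModelHelper model and the
    # build_adagrad() helper defined at the bottom of this file):
    #     opt = build_adagrad(model, base_learning_rate=0.01)
    #     aux = opt.get_auxiliary_parameters()
    #     # aux.local holds one "<param>_squared_sum" blob per parameter;
    #     # aux.shared stays empty for adagrad (adam also appends the shared
    #     # optimizer_iteration blob).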
    # TODO(xlwang): In transfer learning, parameters initialized from a
    # pretrained model might require a different learning rate than freshly
    # initialized ones. To this end, we provide a Python solution in which
    # `base_learning_rate` is scaled by `scale` via `scale_learning_rate`;
    # alternatively, the same effect could be achieved by extending the
    # LearningRate operator in C++.
    # Note that it is the responsibility of each specific optimizer to decide
    # what logic `scale_learning_rate` should use.
def scale_learning_rate(self, *args, **kwargs):
        raise NotImplementedError(
            "Optimizers must implement the `scale_learning_rate` method.")
class SgdOptimizer(Optimizer):
def __init__(self, base_learning_rate=0.01, policy='fixed',
momentum=0.0, nesterov=1, **kwargs):
super(SgdOptimizer, self).__init__()
self.base_learning_rate = base_learning_rate
self.policy = policy
self.momentum = momentum
self.nesterov = nesterov
self.init_kwargs = kwargs
def _run(self, net, param_init_net, param_info):
param = param_info.blob
grad = param_info.grad
if self.base_learning_rate == 0:
return
assert self.base_learning_rate > 0
        # build_lr() already negates base_learning_rate, which yields the
        # negative LR needed when the LR blob is used directly as a
        # WeightedSum coefficient below. With momentum we flip the sign here
        # so that the MomentumSGDUpdate path receives a positive LR instead.
        lr_sign = -1 if self.momentum else 1
lr, _ = self.build_lr(
net, param_init_net,
base_learning_rate=self.base_learning_rate * lr_sign,
learning_rate_blob=str(param) + "_lr",
policy=self.policy,
**(self.init_kwargs)
)
dev = scope.CurrentDeviceScope()
if dev is None:
dev = core.DeviceOption(caffe2_pb2.CPU)
# Each GPU/CPU must have its own ONE blob, thus modify the name
# to include device information.
ONE = param_init_net.ConstantFill(
[],
"ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
shape=[1],
value=1.0
)
self._aux_params.shared.append(ONE)
if self.momentum > 0:
momentum_data = param_init_net.ConstantFill(
param, str(param) + "_momentum", value=0.)
self._aux_params.local.append(momentum_data)
if isinstance(grad, core.GradientSlice):
assert self.momentum == 0., "Doesn't support momentum for sparse"
net.ScatterWeightedSum(
[param, ONE, grad.indices, grad.values, lr],
param
)
else:
if self.momentum > 0.:
net.MomentumSGDUpdate(
[grad, momentum_data, lr, param],
[grad, momentum_data, param],
momentum=self.momentum,
nesterov=self.nesterov)
else:
coeff = lr
net.WeightedSum(
[param, ONE, grad, coeff],
param
)
def scale_learning_rate(self, scale):
self.base_learning_rate *= scale
return
class MultiPrecisionSgdOptimizer(SgdOptimizer):
def __init__(self, base_learning_rate=0.1, momentum=0.0,
policy="fixed", nesterov=1, **kwargs):
super(SgdOptimizer, self).__init__()
self.base_learning_rate = base_learning_rate
self.momentum = momentum
self.policy = policy
self.nesterov = nesterov
self.init_kwargs = kwargs
def _run(self, net, param_init_net, param_info):
param = param_info.blob
param_fp32 = param_info.blob_copy[core.DataType.FLOAT] \
if param_info.blob_copy is not None else None
# If we have a straight fp32 parameter, run the base class
        if param_fp32 is None:
return SgdOptimizer._run(self, net, param_init_net, param_info)
grad = param_info.grad
if self.base_learning_rate == 0:
return
assert self.base_learning_rate > 0
lr, _ = self.build_lr(
net, param_init_net,
base_learning_rate=-self.base_learning_rate,
learning_rate_blob=param + "_lr",
policy=self.policy,
**(self.init_kwargs)
)
momentum_data = param_init_net.ConstantFill(
param_fp32, str(param) + "_momentum", value=0.)
self._aux_params.local.append(momentum_data)
assert not isinstance(grad, core.GradientSlice), \
"Doesn't support sparse gradients"
        # Convert the fp16 gradient to an fp32 copy
        grad_fp32 = net.HalfToFloat(grad, grad + "_fp32")
        # Fused momentum update in fp32
net.MomentumSGDUpdate(
[grad_fp32, momentum_data, lr, param_fp32],
[grad_fp32, momentum_data, param_fp32],
momentum=self.momentum,
nesterov=self.nesterov)
# Copy updated param back to fp16
net.FloatToHalf(param_fp32, param)
class WeightDecayBuilder(Optimizer):
    def __init__(self, weight_decay):
        super(WeightDecayBuilder, self).__init__()
        self.weight_decay = weight_decay
def _run(self, net, param_init_net, param_info):
dev = scope.CurrentDeviceScope()
if dev is None:
dev = core.DeviceOption(caffe2_pb2.CPU)
ONE = param_init_net.ConstantFill(
[],
"ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
shape=[1],
value=1.0
)
WD = param_init_net.ConstantFill(
[], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
shape=[1], value=self.weight_decay
)
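        # Weight decay is applied by adding weight_decay * param to the
        # gradient in place, i.e. grad <- 1.0 * grad + wd * param (an L2
        # penalty on the weights).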
if isinstance(param_info.grad, core.GradientSlice):
assert "Weight decay does not yet support sparse gradients"
else:
net.WeightedSum(
[param_info.grad, ONE, param_info.blob, WD],
param_info.grad,
)
class AdagradOptimizer(Optimizer):
def __init__(self, alpha=0.01, epsilon=1e-4, policy="fixed",
sparse_dedup_aggregator=None, engine='', **kwargs):
super(AdagradOptimizer, self).__init__()
self.alpha = alpha
self.epsilon = epsilon
self.policy = policy
self.sparse_dedup_aggregator = sparse_dedup_aggregator
self.engine = engine
self.init_kwargs = kwargs
def _run(self, net, param_init_net, param_info):
param = param_info.blob
grad = param_info.grad
if self.alpha <= 0:
return
lr, _ = self.build_lr(
net, param_init_net,
base_learning_rate=self.alpha,
learning_rate_blob=str(param) + "_lr",
policy=self.policy,
**(self.init_kwargs)
)
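        # Running sum of squared gradients, one entry per parameter element.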
param_squared_sum = param_init_net.ConstantFill(
[param],
str(param) + "_squared_sum",
value=0.0
)
self._aux_params.local.append(param_squared_sum)
if isinstance(grad, core.GradientSlice):
grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
net.SparseAdagrad(
[param, param_squared_sum, grad.indices, grad.values, lr],
[param, param_squared_sum],
epsilon=self.epsilon,
engine=self.engine
)
else:
net.Adagrad(
[param, param_squared_sum, grad, lr],
[param, param_squared_sum],
epsilon=self.epsilon,
engine=self.engine
)
def scale_learning_rate(self, scale):
self.alpha *= scale
return
class FtrlOptimizer(Optimizer):
def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
sparse_dedup_aggregator=None, engine=''):
super(FtrlOptimizer, self).__init__()
self.alpha = alpha
self.beta = beta
self.lambda1 = lambda1
self.lambda2 = lambda2
self.sparse_dedup_aggregator = sparse_dedup_aggregator
self.engine = engine
def _run(self, net, param_init_net, param_info):
param = param_info.blob
grad = param_info.grad
if self.alpha <= 0:
return
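        # FTRL state: two extra values per parameter element (the "n" and "z"
        # accumulators of FTRL-Proximal), hence extra_shape=[2].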
nz = param_init_net.ConstantFill(
[param],
str(param) + "_ftrl_nz",
extra_shape=[2],
value=0.0
)
self._aux_params.local.append(nz)
if isinstance(grad, core.GradientSlice):
grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
net.SparseFtrl(
[param, nz, grad.indices, grad.values],
[param, nz],
engine=self.engine,
alpha=self.alpha,
beta=self.beta,
lambda1=self.lambda1,
lambda2=self.lambda2
)
else:
net.Ftrl(
[param, nz, grad],
[param, nz],
engine=self.engine,
alpha=self.alpha,
beta=self.beta,
lambda1=self.lambda1,
lambda2=self.lambda2
)
def scale_learning_rate(self, scale):
self.alpha *= scale
return
class AdamOptimizer(Optimizer):
def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
policy='fixed', sparse_dedup_aggregator=None,
engine='', **kwargs):
super(AdamOptimizer, self).__init__()
self.alpha = alpha
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.policy = policy
self.sparse_dedup_aggregator = sparse_dedup_aggregator
self.engine = engine
self.init_kwargs = kwargs
def _run(self, net, param_init_net, param_info):
param = param_info.blob
grad = param_info.grad
if self.alpha <= 0:
return
lr, iteration = self.build_lr(
net, param_init_net,
base_learning_rate=self.alpha,
learning_rate_blob=str(param) + "_lr",
policy=self.policy,
**(self.init_kwargs)
)
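        # First and second moment estimates, one entry per parameter element.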
m1 = param_init_net.ConstantFill(
[param],
param + "_first_moment",
value=0.0
)
m2 = param_init_net.ConstantFill(
[param],
param + "_second_moment",
value=0.0
)
self._aux_params.shared.append(iteration)
self._aux_params.local.append(m1)
self._aux_params.local.append(m2)
if isinstance(grad, core.GradientSlice):
grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
net.SparseAdam(
[param, m1, m2, grad.indices, grad.values, lr, iteration],
[param, m1, m2],
beta1=self.beta1,
beta2=self.beta2,
epsilon=self.epsilon
)
else:
net.Adam(
[param, m1, m2, grad, lr, iteration],
[param, m1, m2],
beta1=self.beta1,
beta2=self.beta2,
epsilon=self.epsilon)
def scale_learning_rate(self, scale):
self.alpha *= scale
return
def _get_param_to_device(model):
# Infer blob devices by going through the net and param_init_net
# ops and observing the device used to create or use the blob.
param_to_device = core.InferBlobDevices(model.net)
param_to_device.update(core.InferBlobDevices(model.param_init_net))
return param_to_device
def _build(model, optimizer, weights_only=False):
param_to_device = _get_param_to_device(model)
# Validate there are no duplicate params
model.Validate()
# Call optimizer for each param
for param_info in model.GetOptimizationParamInfo():
if weights_only:
if param_info.blob not in model.weights:
continue
param_name = str(param_info.blob)
        # We first check whether the parameter's device has been inferred. If
        # not, we check the gradient. This can happen if the parameter is not
        # the output of any op but was created directly in the workspace.
device = None
if param_name in param_to_device:
device = param_to_device[param_name]
else:
if isinstance(param_info.grad, core.GradientSlice):
grad = param_info.grad
if str(grad.values) in param_to_device:
device = param_to_device[str(grad.values)]
elif str(grad.indices) in param_to_device:
device = param_to_device[str(grad.indices)]
else:
grad_name = str(param_info.grad)
if grad_name in param_to_device:
device = param_to_device[grad_name]
assert device is not None,\
"Cannot infer device for {}: no op creates it".format(param_name)
with core.DeviceScope(device):
optimizer(model.net, model.param_init_net, param_info)
return optimizer
def add_weight_decay(model, weight_decay):
"""Adds a decay to weights in the model.
This is a form of L2 regularization.
Args:
weight_decay: strength of the regularization
"""
_build(
model,
WeightDecayBuilder(weight_decay=weight_decay),
weights_only=True,
)
def build_sgd(model, base_learning_rate, **kwargs):
sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
return _build(model, sgd_optimizer)
def build_multi_precision_sgd(model, base_learning_rate, **kwargs):
multi_prec_sgd_optimizer = MultiPrecisionSgdOptimizer(
base_learning_rate, **kwargs
)
return _build(model, multi_prec_sgd_optimizer)
def build_ftrl(model, engine="SIMD", **kwargs):
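    # The "SIMD" engine requires the SIMD-specialized Ftrl operators to be
    # registered in this build (checked below); pass engine='' to fall back
    # to the default implementation.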
if engine == "SIMD":
assert core.IsOperator('Ftrl_ENGINE_SIMD')
assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
return _build(model, ftrl_optimizer)
def build_adagrad(model, base_learning_rate, parameters=None, **kwargs):
adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
return _build(model, adagrad_optimizer)
def build_adam(model, base_learning_rate, **kwargs):
adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
return _build(model, adam_optimizer)
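
# Typical usage (a hedged sketch; assumes a caffe2 ModelHelper model with a
# scalar loss and gradients added via model.AddGradientOperators([loss])):
#     from caffe2.python import model_helper, optimizer
#     model = model_helper.ModelHelper(name="example")
#     # ... build the forward net and a scalar loss ...
#     model.AddGradientOperators([loss])
#     optimizer.build_sgd(model, base_learning_rate=0.1,
#                         policy="step", stepsize=1000, gamma=0.9)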