from caffe2.proto import caffe2_pb2
from collections import Counter, defaultdict
from pycaffe2 import utils
def GetGradientName(name):
  """Returns the name of the gradient blob for a given blob name."""
  return name + '_grad'
class BlobReference(object):
  """A wrapper around a blob in a net.

  BlobReference gives us a way to refer to the network that the blob is
  generated from. Note that blobs are, essentially, just strings in the
  current workspace.
  """

  def __init__(self, name, net):
    self._name = name
    self._from_net = net

  def __str__(self):
    return self._name

  def Net(self):
    return self._from_net

  def Grad(self):
    return GetGradientName(self._name)

  def __getattr__(self, op_type):
    """A wrapper allowing one to initiate operators from a blob reference.

    Example: for a blob reference b that comes from network n, doing
        b.Relu(...)
    is equivalent to doing
        net.Relu([b], ...)
    """
    def _CreateAndAddToNet(inputs=None, *args, **kwargs):
      """Internal function that routes the operator generation to the
      network's __getattr__ function.
      """
      if inputs is None:
        inputs = []
      elif not isinstance(inputs, list):
        inputs = [inputs]
      # Prepend self to a copy of the input list, so that we never mutate
      # the caller's list (or a shared default argument).
      inputs = [self] + inputs
      return self._from_net.__getattr__(op_type)(inputs, *args, **kwargs)
    return _CreateAndAddToNet
def CreateOperator(operator_type):
  """A function wrapper that allows one to create operators based on the
  operator type. The type should be a string corresponding to an operator
  registered with Caffe2.
  """
  def ReallyCreate(inputs, outputs, name='', device_option=None,
                   arg=None, **kwargs):
    operator = caffe2_pb2.OperatorDef()
    operator.type = operator_type
    operator.name = name
    if isinstance(inputs, (str, BlobReference)):
      inputs = [inputs]
    elif not isinstance(inputs, list):
      raise ValueError("Unknown input format: %s." % str(inputs))
    if isinstance(outputs, (str, BlobReference)):
      outputs = [outputs]
    elif not isinstance(outputs, list):
      raise ValueError("Unknown output format: %s of type %s."
                       % (str(outputs), type(outputs)))
    operator.input.extend([str(i) for i in inputs])
    operator.output.extend([str(o) for o in outputs])
    if device_option:
      operator.device_option.CopyFrom(device_option)
    # The random seed lives in the device option, so it needs special
    # handling here rather than being treated as a regular argument.
    if 'random_seed' in kwargs:
      operator.device_option.random_seed = kwargs['random_seed']
      del kwargs['random_seed']
    # Add given arguments that do not need parsing.
    if arg:
      operator.arg.extend(arg)
    # Add all other arguments.
    for key, value in kwargs.items():
      operator.arg.add().CopyFrom(utils.MakeArgument(key, value))
    return operator
  return ReallyCreate
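

# A minimal usage sketch (not part of the library): CreateOperator returns a
# curried constructor, so CreateOperator('Relu') yields a function that builds
# an OperatorDef. The blob names 'x' and 'y' and the op name 'my_relu' below
# are hypothetical.
def _ExampleCreateOperatorUsage():
  relu_op = CreateOperator('Relu')(['x'], ['y'], name='my_relu')
  assert relu_op.type == 'Relu'
  assert list(relu_op.input) == ['x'] and list(relu_op.output) == ['y']
  return relu_op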
class GradientRegistry(object):
  """GradientRegistry holds the mapping from operators to their gradients."""
  gradient_registry_ = {}

  @classmethod
  def RegisterGradient(cls, op_type):
    """A decorator for registering gradient mappings."""
    def Wrapper(func):
      cls.gradient_registry_[op_type] = func
      return func
    return Wrapper

  @classmethod
  def GetGradient(cls, op):
    try:
      gradient_ops = cls.gradient_registry_[op.type](op)
    except KeyError:
      raise KeyError('No gradient registered for op: %s' % op.type)
    if gradient_ops is None:
      return []
    if not isinstance(gradient_ops, list):
      gradient_ops = [gradient_ops]
    if op.HasField("device_option"):
      for gradient_op in gradient_ops:
        gradient_op.device_option.CopyFrom(op.device_option)
    return gradient_ops
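

# A minimal registration sketch (illustrative only): a gradient function
# receives the forward OperatorDef and returns the gradient OperatorDef(s).
# 'ReluGradient' is a hypothetical gradient operator type assumed here; the
# registration only takes effect if this function is actually called.
def _ExampleRegisterReluGradient():
  @GradientRegistry.RegisterGradient('Relu')
  def AddReluGradient(op):
    # The gradient op reads the forward output and the gradient of the
    # output, and produces the gradient of the input.
    return CreateOperator('ReluGradient')(
        [op.output[0], GetGradientName(op.output[0])],
        [GetGradientName(op.input[0])])
  return AddReluGradient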
class Net(object):
  operator_registry_ = {}

  def __init__(self, name):
    if type(name) is caffe2_pb2.NetDef:
      # We are initializing a network from a NetDef. In this case, we will
      # initialize our network with the given NetDef.
      self._net = caffe2_pb2.NetDef()
      self._net.CopyFrom(name)
      # Set the next name index properly.
      existing_names = set(
          sum([list(op.input) for op in self._net.op], []) +
          sum([list(op.output) for op in self._net.op], []))
      prefix_len = len(self._net.name + '_blob_')
      autogen_indices = [
          int(blob_name[prefix_len:]) for blob_name in existing_names
          if blob_name.startswith(self._net.name + '_blob_')]
      if len(autogen_indices):
        self._next_name_index = max(autogen_indices) + 1
      else:
        self._next_name_index = 0
    else:
      self._net = caffe2_pb2.NetDef()
      self._net.name = name
      self._next_name_index = 0
  def __str__(self):
    return self._net.name

  def Proto(self):
    return self._net

  def NextName(self):
    """Returns the next name to be used, if you do not want to explicitly
    name your blob."""
    output_name = self._net.name + '_blob_' + str(self._next_name_index)
    self._next_name_index += 1
    return str(output_name)
  def AddGradientOperators(self, skip=0):
    """Add the gradient for operators in the net.

    Inputs:
      skip: skips the first n operators. This is provided mainly because
        many nets use their first few operators for data generation and
        similar tasks that do not need gradients.

    Currently, this is hard-coded for float operators if there are branches
    (i.e. a blob is used as input to multiple operators). This is because
    the inserted SplitOp is hard-coded for float (its gradient, SumOp, is
    float only). Supporting other formats is a todo item.
    """
    # (1) Make sure that the network is "legal" in terms of computing
    # gradients: every blob must be generated by exactly one operator.
    all_outputs = sum([list(op.output) for op in self._net.op], [])
    if len(all_outputs) != len(set(all_outputs)):
      # Some output is produced by multiple operators, which we cannot
      # differentiate through.
      raise RuntimeError("Some blobs are produced multiple times. A count "
                         "is as follows: " + str(Counter(all_outputs)))
    # (2) For cases when a blob is being used by multiple operators, we
    # will need to take special care. Currently, we will ask the operators
    # to compute the gradients, and add aggregation operators to get the
    # final gradient.
    input_counts = Counter(
        sum([list(op.input) for op in self._net.op[skip:]], []))
    multiple_use_blobs = set(
        [key for key in input_counts if input_counts[key] > 1])
    # If there are multiple-use blobs, it means that we are going to have
    # shared parameters, and we want to take special care of them. The
    # conventional strategy in Caffe is to insert a SplitLayer that splits
    # the input into multiple blobs, with the split layer automatically
    # accumulating the gradients. This makes AddGradientOperators stateful,
    # in the sense that it may change existing operators, which is not
    # desired. In this implementation we will keep the original operators,
    # and instead manually rewrite the gradient names.
    num_ops_before_grad = len(self._net.op)
    # Obtain the gradient operators that we need. Note that these gradient
    # operators are going to be refined, as we need to figure out shared
    # parameters.
    gradient_ops = sum(
        [GradientRegistry.GetGradient(self._net.op[i])
         for i in range(num_ops_before_grad - 1, skip - 1, -1)], [])
    if len(multiple_use_blobs) == 0:
      # There is no concern about shared parameters, so we can simply
      # append the gradient operators and return.
      self._net.op.extend(gradient_ops)
      return
    # If there are multiple-use blobs, we will need to figure out any
    # gradients that would overwrite each other. To do this, we do a first
    # pass over the gradients to count the number of occurrences of each
    # gradient blob.
    gradient_occurrences = defaultdict(int)
    for blob_name in multiple_use_blobs:
      blob_gradient_name = GetGradientName(blob_name)
      for op in gradient_ops:
        for output_name in op.output:
          if output_name == blob_gradient_name:
            gradient_occurrences[blob_gradient_name] += 1
    # For anything that only has one gradient blob generated, we don't
    # need to take any special care.
    for key in list(gradient_occurrences.keys()):
      if gradient_occurrences[key] == 1:
        del gradient_occurrences[key]
    # Now, we add the gradient ops back to the network, modifying the
    # gradient names on the fly.
    grad_encountered = defaultdict(int)
    for op in gradient_ops:
      additional_sum_ops = []
      for i, grad_name in enumerate(op.output):
        if grad_name in gradient_occurrences:
          # Rename the gradient to an intermediate name.
          op.output[i] = (
              '_%s_autosplit_%d' % (grad_name, grad_encountered[grad_name]))
          grad_encountered[grad_name] += 1
          if grad_encountered[grad_name] == gradient_occurrences[grad_name]:
            # We have encountered all copies of this gradient; time to add
            # a SumOp that aggregates them into the final gradient.
            additional_sum_ops.append(
                CreateOperator('Sum')(
                    ['_%s_autosplit_%d' % (grad_name, j)
                     for j in range(gradient_occurrences[grad_name])],
                    [grad_name]))
      # After rewriting the outputs, we can safely add the op and any
      # aggregation ops to the network.
      self._net.op.extend([op])
      self._net.op.extend(additional_sum_ops)
    return
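
  # Illustration of the autosplit rewrite above (hypothetical blob names):
  # if blob 'w' feeds two operators, their gradient ops are renamed to
  # produce '_w_grad_autosplit_0' and '_w_grad_autosplit_1', and a Sum op
  #   Sum(['_w_grad_autosplit_0', '_w_grad_autosplit_1'], ['w_grad'])
  # is appended to accumulate them into the final 'w_grad'.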
  def RunAllOnGPU(self, gpu_id=0):
    """A convenient function to run everything on the GPU."""
    device_option = caffe2_pb2.DeviceOption()
    device_option.device_type = caffe2_pb2.CUDA
    device_option.cuda_gpu_id = gpu_id
    self._net.device_option.CopyFrom(device_option)
  def __getattr__(self, operator_type):
    if operator_type in self.__class__.operator_registry_:
      # Not finished. The operator registry allows one to define custom
      # functions, but so far that functionality is not complete.
      return self.__class__.operator_registry_[operator_type]
    def _CreateAndAddToSelf(inputs, outputs=None, **kwargs):
      if outputs is None:
        # If we do not specify an output, we will assume that this
        # operator produces a single output.
        outputs = self.NextName()
      elif isinstance(outputs, int):
        # In this case, we will auto-fill the given number of outputs
        # with auto-generated names.
        outputs = [self.NextName() for i in range(outputs)]
      op = CreateOperator(operator_type)(inputs, outputs, **kwargs)
      self._net.op.extend([op])
      if len(op.output) == 0:
        return
      elif len(op.output) == 1:
        return BlobReference(str(op.output[0]), self)
      else:
        return tuple(BlobReference(str(o), self) for o in op.output)
    return _CreateAndAddToSelf
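

# A minimal usage sketch (illustrative only): the blob names 'x' and 'h'
# are hypothetical, and 'Relu' is assumed to be registered with Caffe2.
def _ExampleNetUsage():
  net = Net('example')
  # net.Relu(...) is routed through Net.__getattr__, appends an OperatorDef
  # to the net, and returns a BlobReference to the output blob.
  hidden = net.Relu('x', 'h')
  # BlobReference.__getattr__ lets us chain ops: this is equivalent to
  # net.Relu([hidden], <auto-generated output name>).
  out = hidden.Relu()
  return net, out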
class ExecutionStep(object):
  def __init__(self, name, nets=None, num_iter=None):
    self._step = caffe2_pb2.ExecutionStep()
    self._step.name = name
    if nets is not None:
      if type(nets) is Net:
        nets = [nets]
      self._step.network.extend([str(n) for n in nets])
    if num_iter is not None:
      self._step.num_iter = num_iter

  def __str__(self):
    return self._step.name

  def Proto(self):
    return self._step

  def SetIter(self, num_iter):
    self._step.num_iter = num_iter

  def AddSubstep(self, substep):
    self._step.substep.add().CopyFrom(substep.Proto())

  def AddNet(self, net):
    # network is a repeated string field, so we append the net's name.
    self._step.network.append(str(net))
class Plan(object):
  def __init__(self, name):
    self._plan = caffe2_pb2.PlanDef()
    self._plan.name = name

  def __str__(self):
    return self._plan.name

  def Proto(self):
    return self._plan

  def AddNets(self, nets):
    for net in nets:
      self._plan.network.add().CopyFrom(net.Proto())

  def AddStep(self, step):
    self._plan.execution_step.add().CopyFrom(step.Proto())
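

# A minimal assembly sketch (illustrative only): the net and step names are
# hypothetical. A Plan bundles the NetDefs together with the ExecutionSteps
# that reference them by name.
def _ExamplePlanUsage():
  init_net = Net('init')
  train_net = Net('train')
  plan = Plan('example_plan')
  plan.AddNets([init_net, train_net])
  plan.AddStep(ExecutionStep('run_init', init_net))
  plan.AddStep(ExecutionStep('run_train', train_net, num_iter=100))
  return plan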