from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from types import FunctionType
from functools import wraps
import six
from caffe2.python import cnn, dyndep, scope, workspace, core
from caffe2.proto import caffe2_pb2
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nccl:nccl_ops")
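# CNNModelHelper operator methods that are patched by the metaclass below:
# a single call to any of these is replayed once per GPU, under the
# matching "gpu_<id>" name scope and CUDA device scope.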
DATAPARALLEL_OPS = [
"Conv",
"ConvTranspose",
"GroupConv",
"FC",
"FC_Decomp",
"FC_Prune",
"FC_Sparse",
"LRN",
"Dropout",
"MaxPool",
"AveragePool",
"Concat",
"DepthConcat",
"Relu",
"Transpose",
"SpatialBN",
"Accuracy",
"Adam",
"AveragedLoss",
"Cast",
"LabelCrossEntropy",
"LearningRate",
"Print",
"Scale",
"Snapshot",
"Softmax",
"StopGradient",
"Summarize",
"Sum",
"Tanh",
"WeightedSum",
"SquaredL2Distance",
]
class _GPUDataParallelMetaClass(type):
"""A meta class to patch method in order to distribute them over multiple
GPUs.
"""
_devices = []
@staticmethod
def _data_parallel_wrapper(op):
@wraps(op)
def wrapped(cls, blob_in, blob_out, *args, **kwargs):
# Helpers to extract a device specific blob or a global blob
def self_or_item(d, key):
if isinstance(d, dict):
assert key in d
return d[key]
return d
def get_input(gpu_id):
if isinstance(blob_in, list):
return [self_or_item(blob, gpu_id) for blob in blob_in]
return self_or_item(blob_in, gpu_id)
def get_output(gpu_id):
return self_or_item(blob_out, gpu_id)
# If we have explicit device scope, we do not parallelize
if cls.explicit_scope():
return op(
cls,
blob_in,
blob_out,
*args,
**kwargs)
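            # Otherwise replay the op once per GPU, under that GPU's name
            # scope and device scope, and collect the per-GPU outputs in a
            # dict keyed by gpu_id.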
devices = _GPUDataParallelMetaClass._devices
results = {}
for gpu_id in devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
result = op(
cls,
get_input(gpu_id),
get_output(gpu_id),
*args,
**kwargs)
results[gpu_id] = result
return results
return wrapped
def __new__(meta, classname, bases, class_dict):
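        """Wraps every DATAPARALLEL_OPS method found on the base class and
        on the new class with _data_parallel_wrapper."""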
assert len(bases) == 1, "Expects only one base class"
base = bases[0]
assert base is cnn.CNNModelHelper, "Base class should be CNNModelHelper"
new_class_dict = {}
for name, attr in base.__dict__.items():
if name not in DATAPARALLEL_OPS:
continue
attr = _GPUDataParallelMetaClass._data_parallel_wrapper(attr)
new_class_dict[name] = attr
for name, attr in class_dict.items():
if name in new_class_dict:
continue
if isinstance(attr, FunctionType):
if name in DATAPARALLEL_OPS:
new_class_dict[name] = \
_GPUDataParallelMetaClass._data_parallel_wrapper(attr)
else:
new_class_dict[name] = attr
return super(_GPUDataParallelMetaClass, meta).__new__(
meta, classname, bases, new_class_dict)
@six.add_metaclass(_GPUDataParallelMetaClass)
class GPUDataParallelModel(cnn.CNNModelHelper):
"""A helper class that extends CNNModelHelper to support multi GPUs
data parallel training.
"""
def __init__(self, devices, *args, **kwargs):
        assert len(devices) >= 1, "Should have at least 1 GPU device"
        assert len(devices) <= workspace.NumCudaDevices(), \
            "Requested # of devices {} is greater than the # of GPUs {}".\
            format(len(devices), workspace.NumCudaDevices())
_GPUDataParallelMetaClass._devices = devices
self._devices = devices
self._explicit_scope = False
self._gradient_reduce_all_added = False
self._mpi_comm = None
super(GPUDataParallelModel, self).__init__(*args, **kwargs)
def explicit_scope(self):
return self._explicit_scope
def _call(self, name, *args, **kwargs):
return super(GPUDataParallelModel, self).__getattr__(
name)(*args, **kwargs)
# TODO(denisy): try out decorators to avoid this code below
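    # These thin forwarders give the metaclass a concrete method to wrap:
    # CNNModelHelper exposes these ops only via __getattr__, which the
    # metaclass cannot patch, so each forwarder is defined here and then
    # wrapped by _data_parallel_wrapper.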
def Accuracy(self, *args, **kwargs):
return self._call("Accuracy", *args, **kwargs)
def Adam(self, *args, **kwargs):
return self._call("Adam", *args, **kwargs)
def AveragedLoss(self, *args, **kwargs):
return self._call("AveragedLoss", *args, **kwargs)
def Cast(self, *args, **kwargs):
return self._call("Cast", *args, **kwargs)
def LabelCrossEntropy(self, *args, **kwargs):
return self._call("LabelCrossEntropy", *args, **kwargs)
def LearningRate(self, *args, **kwargs):
return self._call("LearningRate", *args, **kwargs)
def Print(self, *args, **kwargs):
return self._call("Print", *args, **kwargs)
def Scale(self, *args, **kwargs):
return self._call("Scale", *args, **kwargs)
def Snapshot(self, *args, **kwargs):
return self._call("Snapshot", *args, **kwargs)
def Softmax(self, *args, **kwargs):
return self._call("Softmax", *args, **kwargs)
def StopGradient(self, *args, **kwargs):
return self._call("StopGradient", *args, **kwargs)
def Sum(self, *args, **kwargs):
return self._call("Sum", *args, **kwargs)
def Summarize(self, *args, **kwargs):
return self._call("Summarize", *args, **kwargs)
def Tanh(self, *args, **kwargs):
return self._call("Tanh", *args, **kwargs)
def WeightedSum(self, *args, **kwargs):
return self._call("WeightedSum", *args, **kwargs)
def SquaredL2Distance(self, *args, **kwargs):
return self._call("SquaredL2Distance", *args, **kwargs)
def SetMPIComm(self, mpi_comm):
self._mpi_comm = mpi_comm
def FinalizeSetup(self):
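        """Moves both nets to the GPU and adds the parameter-sync ops."""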
self.param_init_net.RunAllOnGPU()
self.RunAllOnGPU()
# If MPI enabled, broadcast params from master
        if self._mpi_comm is not None:
self._AddMPIParameterSync()
# Setup sync of initial params
self._SyncInitialParams()
def AddGradientOperators(self, params, *args, **kwargs):
def create_grad(param):
return self.ConstantFill(param, str(param) + "_grad", value=1.0)
param_grad = {}
# Explicitly need to create gradients on each GPU
for param in params:
if not isinstance(param, dict):
grad = create_grad(param)
param_grad[str(param)] = str(grad)
else:
for gpu_id in self._devices:
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
assert gpu_id in param
p = param[gpu_id]
g = create_grad(p)
param_grad[str(p)] = str(g)
return super(GPUDataParallelModel, self).AddGradientOperators(
param_grad, *args, **kwargs)
def AddWeightDecay(self, weight_decay):
if weight_decay == 0.0:
return
        assert weight_decay > 0.0
self._explicit_scope = True
assert \
self._gradient_reduce_all_added, \
"Weight decay must be done after gradient sync between gpus"
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
value=weight_decay)
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1],
value=1.0)
# Only update parameters that belong to the current GPU
params = self._CurrentScopeParams()
# Take only params that are weights
print("Adding weigth-decay for gpu {}.".format(gpu_id))
gpu_weights = [p for p in params if p in self.weights]
for w in gpu_weights:
                        # Computes grad = 1.0 * grad + wd * w, i.e.
                        # grad += weight_decay * w
grad = self.param_to_grad[w]
self.net.WeightedSum([grad, ONE, w, wd], grad)
self._explicit_scope = False
def _Broadcast(self, net, param):
# TODO(akyrola): replace with NCCLBroadcast when it's working
# Copy params from gpu_0 to other
for gpu_idx in self._devices[1:]:
device_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu_idx)
with core.DeviceScope(device_opt):
net.Copy(
"gpu_{}/{}".format(self._devices[0], param),
"gpu_{}/{}".format(gpu_idx, param)
)
def _SyncInitialParams(self):
unique_param_names = set(
stripParamName(p)
for p in self.params
)
self._explicit_scope = True
for param in unique_param_names:
self._Broadcast(self.param_init_net, param)
self._explicit_scope = False
def _AddMPIParameterSync(self):
# Sync from master
unique_param_names = set(
stripParamName(p)
for p in self.params
)
self._explicit_scope = True
# Should this be done in GPU 0 scope?
for param_name in unique_param_names:
param = "gpu_{}/{}".format(self._devices[0], param_name)
self.param_init_net.Broadcast(
inputs=[self._mpi_comm, param],
outputs=[param],
engine='MPI'
)
self._explicit_scope = False
def _AllReduceGradients(self):
self._gradient_reduce_all_added = True
if self._mpi_comm is None:
self._AllReduceGradientsSingleHost()
else:
self._AllReduceGradientsWithMPI()
def _AllReduceGradientsWithMPI(self):
self._explicit_scope = True
unique_grads_names = set(
stripParamName(grad)
for grad in self.param_to_grad.values()
)
# Step 1: sum gradients from local GPUs to master GPU
last_out = None
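        # last_out chains the MPI Allreduce ops through control_input so
        # they run one at a time, in the same order on every host.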
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
# Note: sorted order to ensure each host puts the operators in
# same order.
for grad_name in sorted(unique_grads_names):
grads_group = [
grad
for grad in self.param_to_grad.values()
if stripParamName(grad) == grad_name
]
master_grad = "gpu_{}/{}".format(self._devices[0], grad_name)
assert master_grad in grads_group
# Remark: NCCLReduce does not support in-place modifications
# so we need a temporary gradient blob
reduced_grad = "gpu_{}/{}_red".format(
self._devices[0],
grad_name
)
with core.DeviceScope(master_device_opt):
self.ConstantFill(master_grad, reduced_grad, value=0.0)
self.net.NCCLReduce(grads_group, reduced_grad)
# Step 2: allreduce over MPI to all hosts, between master GPUs
self.net.Allreduce(
inputs=[self._mpi_comm, reduced_grad],
outputs=[master_grad],
engine='MPI',
control_input=None if last_out is None else [last_out],
)
last_out = master_grad
# Step 3: broadcast locally
self._Broadcast(self.net, grad_name)
self._explicit_scope = False
def _AllReduceGradientsSingleHost(self):
"""Performs NCCL AllReduce to distribute gradients to all the GPUs."""
if len(self._devices) == 1:
return
# Take only params that have gradient associated with them.
unique_grads_names = set(
stripParamName(grad)
for grad in self.param_to_grad.values()
)
# Now we need to Allreduce gradients on all the GPUs.
# Pick GPU #0 as a master GPU.
self._explicit_scope = True
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
with core.DeviceScope(master_device_opt):
# Group by grads for reduce.
for grad_name in unique_grads_names:
grads_group = [
grad
for grad in self.param_to_grad.values()
if stripParamName(grad) == grad_name
]
assert len(grads_group) == len(self._devices), \
"Each GPU from {}, should have a copy of {}.".format(
self._devices, grad_name)
self.NCCLAllreduce(grads_group, grads_group)
self._explicit_scope = False
def _BuildLR(self, base_lr, policy="fixed", **other_lr_params):
"""A helper to create learning rate."""
ITER = self.Iter("ITER")
        # Note: since we are minimizing, we are doing "descent", so the
        # learning rate is expected to be negative.
LR = self.net.LearningRate(
[ITER],
"LR",
base_lr=base_lr,
policy=policy,
**other_lr_params
)
return LR
def _BuildSGD(self, params, base_lr, policy="fixed", **other_lr_params):
"""A helper to construct gradient update for SGD."""
base_lr = base_lr / len(self._devices)
LR = self._BuildLR(base_lr, policy, **other_lr_params)
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
for param in params:
grad = self.param_to_grad[param]
if isinstance(grad, core.GradientSlice):
self.ScatterWeightedSum(
[param, ONE, grad.indices, grad.values, LR], param
)
else:
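                # param = 1.0 * param + LR * grad; with a negative LR this
                # is a gradient-descent step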
self.WeightedSum([param, ONE, grad, LR], param)
def _CurrentScopeParams(self):
return [
param
for param in self.param_to_grad.keys()
if str(param).startswith(scope.NAMESCOPE)
]
def SGD(self, base_lr, policy="fixed", **other_lr_params):
"""Adds SGD optimizer to the model."""
self._AllReduceGradients()
# Create update params operators.
self._explicit_scope = True
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
# Only update parameters that belong to the current GPU
params = self._CurrentScopeParams()
# Add optimizer update operators
self._BuildSGD(params, base_lr, policy, **other_lr_params)
self._explicit_scope = False
def CustomSGD(
self,
paramup_build_fn,
base_lr,
lr_policy,
weight_decay,
**other_lr_pars
):
"""Custom parameter update function"""
self._AllReduceGradients()
self.AddWeightDecay(weight_decay)
# Run parameter update on each machine
self._explicit_scope = True
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
LR = self._BuildLR(base_lr, lr_policy, **other_lr_pars)
params = self._CurrentScopeParams()
paramup_build_fn(self, params, LR)
self._explicit_scope = False
def ExecOnEachDevice(self, fn, *args, **kwargs):
self._explicit_scope = True
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
fn(self, *args, **kwargs)
self._explicit_scope = False
# A helper function to extract a parameter's name
def stripParamName(param):
# Format is "a/b/c/d" -> d
name = str(param)
sep = scope._NAMESCOPE_SEPARATOR
return name[name.rindex(sep) + 1:]
def SetupMPICluster(num_replicas, role, job_path):
from caffe2.python import mpi
print("Initing library")
dyndep.InitOpsLibrary('@/caffe2/caffe2/mpi:mpi_ops')
print("Setup peers")
mpi.SetupPeers(
replicas=int(num_replicas),
role=role,
job_path=job_path
)
print("Create mpi_init net")
mpi_init_net = core.Net('mpi_init')
print("Create commonworld")
mpi_comm = mpi_init_net.CreateCommonWorld(
inputs=[],
outputs=['comm_world'],
engine='MPI'
)
print("Run mpi_init net")
workspace.RunNetOnce(mpi_init_net)
print("Finished MPI setup")
return mpi_comm