import array
import sys

import torch

from . import invoke_rpc
from . import ProcessGroupAgent

_agent = None


def _collect_worker_names(name, group):
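    """Gather the names of all RPC workers in ``group``.

    Runs two rounds of ``all_gather``: the first exchanges name lengths so
    every rank can allocate a buffer of the maximum size, and the second
    exchanges the UTF-8 encoded name bytes themselves.
    """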
from . import all_gather
from . import get_world_size
# collect name length
ws = get_world_size(group)
name_bytes = name if sys.version_info < (3, 0) else bytes(name, 'utf8')
name_bytes = list(array.array('B', name_bytes))
name_len = len(name_bytes)
len_input = torch.ones(1, dtype=torch.int64) * name_len
len_outputs = [torch.empty(1, dtype=torch.int64) for _ in range(ws)]
all_gather(len_outputs, len_input, group=group)
# collect name value
max_len = torch.stack(len_outputs).max().item()
name_input = torch.empty(max_len, dtype=torch.uint8)
name_input[:name_len] = torch.tensor(name_bytes, dtype=torch.uint8)
name_outputs = [torch.empty(max_len, dtype=torch.uint8) for _ in range(ws)]
all_gather(name_outputs, name_input, group=group)
names = []
for i in range(ws):
name_tensor = name_outputs[i][:len_outputs[i]]
names.append(bytearray(name_tensor.tolist()).decode('utf8'))
return names


def join_rpc():
r"""
    Block until all local and remote RPC processes reach this method, process
    (send and receive) all pending messages, and then destroy the local RPC
    agent. Every RPC process must call this method before exit.
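
    Example::
        A minimal usage sketch, assuming a two-process ``gloo`` group has
        already been initialized:

        >>> import torch.distributed as dist
        >>> dist.init_process_group(backend='gloo', rank=0, world_size=2)
        >>> dist.init_rpc("worker0")
        >>> # ... send and receive RPCs ...
        >>> dist.join_rpc()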
"""
global _agent
if _agent:
_agent.join()
_agent = None


def sync_rpc():
r"""
    Block until all local and remote RPC processes reach this method and
    finish sending all pending RPCs. Because this method synchronizes at the
    process level, at most one thread should call it at a time if multiple
    threads are spawned.
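
    Example::
        A minimal usage sketch, assuming RPC has already been initialized via
        ``init_rpc``:

        >>> import torch.distributed as dist
        >>> fut = dist.rpc("worker1", torch.add, args=(torch.ones(2), 1), async_call=True)
        >>> dist.sync_rpc()  # all pending RPCs have been sent
        >>> ret = fut.wait()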
"""
if _agent is None:
raise RuntimeError("RPC has not been initialized. "
"Call init_rpc(name) first.")
_agent.sync()


# TODO: add a context manager to wrap init_rpc and join_rpc
def init_rpc(name, backend='pg'):
r"""
    Initialize the local RPC agent, which immediately makes the current
    process ready to send and receive RPCs. The caller must ensure that the
    specified backend is properly initialized before calling this method. For
    example, to use the ``pg`` (ProcessGroup) backend, ``init_process_group``
    must be invoked prior to this method.

Arguments:
name (str): a globally unique name of the local RPC agent. (e.g.,
``Trainer3``, ``ParameterServer2``, ``Master``, ``Worker1``)
backend (str): type of RPC backend implementation. Currently,
process group backend ``"pg"`` is the only available
backend implementation. (default: ``"pg"``).
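
    Example::
        A minimal usage sketch for a two-process group, shown here for
        rank 0:

        >>> import torch.distributed as dist
        >>> dist.init_process_group(backend='gloo', rank=0, world_size=2)
        >>> dist.init_rpc("worker0")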
"""
global _agent
if _agent:
raise RuntimeError("RPC is already initialized")
if backend == 'pg':
from .distributed_c10d import _get_default_group
group = _get_default_group()
# TODO: issue #23232
names = _collect_worker_names(name, group)
        # map each worker name to its rank in the group
        name_dict = {name: r for r, name in enumerate(names)}
_agent = ProcessGroupAgent(name, name_dict, group)
else:
raise RuntimeError("Unrecognized RPC backend ", backend)


def rpc(to, func, args=None, kwargs=None, async_call=False):
r"""
Make an RPC call to run function ``func`` on worker ``to``. By default, it
blocks until the return value is locally available. RPC messages are sent
and received in parallel to execution of Python code. This method is
thread-safe.

    Arguments:
to (str): name of the destination worker.
func (callable): a builtin function (e.g., ``torch.add``).
args (tuple): the argument tuple for the ``func`` invocation.
        kwargs (dict): a dictionary of keyword arguments for the ``func``
                       invocation.
async_call (bool): If set to ``True``, this will be an asynchronous RPC,
and returns a ``torch.distributed.FutureMessage``
object immediately. Otherwise, this RPC will block
until the return value is locally available.
(default: ``False``)

    Returns:
If ``async_call`` is ``False``, returns the result of running ``func``
on ``args`` and ``kwargs``. If ``async_call`` is ``True``, returns a
``torch.distributed.FutureMessage`` object that can be waited on. When
completed, the return value of ``func`` on ``args`` and ``kwargs`` can
be retrieved from the ``FutureMessage`` object.

    Example::
Synchronous example:
On worker 0:
>>> import torch.distributed as dist
>>> dist.init_process_group(backend='gloo', rank=0, world_size=2)
>>> dist.init_rpc("worker0")
>>> ret = dist.rpc("worker1", torch.add, args=(torch.ones(2), 3))
>>> dist.join_rpc()
        On worker 1:
>>> import torch.distributed as dist
>>> dist.init_process_group(backend='gloo', rank=1, world_size=2)
>>> dist.init_rpc("worker1")
>>> dist.join_rpc()
Asynchronous example:
On worker 0:
>>> import torch.distributed as dist
>>> dist.init_process_group(backend='gloo', rank=0, world_size=2)
>>> dist.init_rpc("worker0")
>>> fut1 = dist.rpc("worker1", torch.add, args=(torch.ones(2), 3), async_call=True)
>>> fut2 = dist.rpc("worker1", torch.add, args=(torch.ones(2), 2), async_call=True)
>>> result = fut1.wait() + fut2.wait()
>>> dist.join_rpc()
        On worker 1:
>>> import torch.distributed as dist
>>> dist.init_process_group(backend='gloo', rank=1, world_size=2)
>>> dist.init_rpc("worker1")
>>> dist.join_rpc()
"""
if _agent is None:
raise RuntimeError("RPC has not been initialized. "
"Call init_rpc(name) first.")
qualified_name = torch.jit._find_builtin(func)
if qualified_name is None:
raise RuntimeError("unknown builtin function %s." % func)
args = args if args else ()
kwargs = kwargs if kwargs else {}
fut = invoke_rpc(_agent, to, qualified_name, *args, **kwargs)
if async_call:
return fut
else:
return fut.wait()