add elastic zeus handler (#16746)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16746
As titled. We use a special URL scheme, elasticzeus, for elastic zeus so that we don't need to change the public interface of init_process_group.
Reviewed By: aazzolini, soumith
Differential Revision: D13948151
fbshipit-source-id: 88939dcfa0ad93467dabedad6905ec32e6ec60e6
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 3006c61..ace6f82 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -303,6 +303,8 @@
world_size (int, optional): Number of processes participating in
the job.
rank (int, optional): Rank of the current process.
+ store (Store, optional): Rendezvous key/value store as an alternative
+ to other init methods.
timeout (timedelta, optional): Timeout for operations executed against
the process group. Default value equals 30 minutes.
This is only applicable for the ``gloo`` backend.
@@ -329,6 +331,10 @@
world_size = kwargs.pop('world_size', -1)
group_name = kwargs.pop('group_name', '')
rank = kwargs.pop('rank', -1)
+ store = kwargs.pop('store', None)
+ if store is not None:
+ assert world_size > 0, 'world_size needs to be positive'
+ assert rank >= 0, 'rank needs to be non-negative'
assert len(kwargs) == 0, \
"got unexpected keyword arguments: %s" % ",".join(kwargs.keys())
@@ -351,7 +357,8 @@
elif world_size != -1:
url += "?world_size={}".format(world_size)
- store, rank, world_size = next(rendezvous(url))
+ if store is None:
+ store, rank, world_size = next(rendezvous(url))
if backend == Backend.GLOO:
_default_pg = ProcessGroupGloo(
store,