add elastic zeus handler (#16746)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16746

As titled. We use a special URL scheme, elasticzeus, for elastic zeus so that we don't need to change the public interface of init_process_group.

Reviewed By: aazzolini, soumith

Differential Revision: D13948151

fbshipit-source-id: 88939dcfa0ad93467dabedad6905ec32e6ec60e6
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 3006c61..ace6f82 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -303,6 +303,8 @@
         world_size (int, optional): Number of processes participating in
                                     the job.
         rank (int, optional): Rank of the current process.
+        store (Store, optional): Rendezvous key/value store as an alternative
+                                 to other init methods.
         timeout (timedelta, optional): Timeout for operations executed against
             the process group. Default value equals 30 minutes.
             This is only applicable for the ``gloo`` backend.
@@ -329,6 +331,10 @@
     world_size = kwargs.pop('world_size', -1)
     group_name = kwargs.pop('group_name', '')
     rank = kwargs.pop('rank', -1)
+    store = kwargs.pop('store', None)
+    if store is not None:
+        assert world_size > 0, 'world_size needs to be positive'
+        assert rank >= 0, 'rank needs to be non-negative'
     assert len(kwargs) == 0, \
         "got unexpected keyword arguments: %s" % ",".join(kwargs.keys())
 
@@ -351,7 +357,8 @@
         elif world_size != -1:
             url += "?world_size={}".format(world_size)
 
-        store, rank, world_size = next(rendezvous(url))
+        if store is None:
+            store, rank, world_size = next(rendezvous(url))
         if backend == Backend.GLOO:
             _default_pg = ProcessGroupGloo(
                 store,