[torchelastic][rendezvous] Add option to enable libuv for TCPStore based rendezvous backend (#118944)
Summary:
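Expose a `use_libuv` option in the TCPStore-based rendezvous backend so that the store can be created with the libuv backend enabled, which allows better scaling.
Libuv support was added recently and allows scaling to more than 2K nodes. The option defaults to false, so existing behavior is unchanged unless it is explicitly enabled.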
Test Plan: Unit tests
Differential Revision: D53335860
Pull Request resolved: https://github.com/pytorch/pytorch/pull/118944
Approved by: https://github.com/wconstab
diff --git a/test/distributed/elastic/rendezvous/c10d_rendezvous_backend_test.py b/test/distributed/elastic/rendezvous/c10d_rendezvous_backend_test.py
index ebd256a..e31b0f9 100644
--- a/test/distributed/elastic/rendezvous/c10d_rendezvous_backend_test.py
+++ b/test/distributed/elastic/rendezvous/c10d_rendezvous_backend_test.py
@@ -197,6 +197,18 @@
self._assert_create_backend_returns_backend()
+ def test_create_backend_returns_backend_with_libuv(self) -> None:
+
+ self._params.config["use_libuv"] = "true"
+
+ self._assert_create_backend_returns_backend()
+
+ def test_create_backend_returns_backend_without_libuv(self) -> None:
+
+ self._params.config["use_libuv"] = "false"
+
+ self._assert_create_backend_returns_backend()
+
def test_create_backend_raises_error_if_store_is_unreachable(self) -> None:
self._params.config["is_host"] = "false"
self._params.config["read_timeout"] = "2"
diff --git a/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py b/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py
index c73e971..144e691 100644
--- a/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py
+++ b/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py
@@ -143,6 +143,8 @@
else:
is_host = _matches_machine_hostname(host)
+ use_libuv = params.get_as_bool("use_libuv", False)
+
# The timeout
read_timeout = cast(int, params.get_as_int("read_timeout", 60))
if read_timeout <= 0:
@@ -153,7 +155,11 @@
for is_server in [is_host, False]:
try:
store = TCPStore(
- host, port, is_master=is_server, timeout=timedelta(seconds=read_timeout)
+ host,
+ port,
+ is_master=is_server,
+ timeout=timedelta(seconds=read_timeout),
+ use_libuv=use_libuv,
)
if is_server: