Migrate test to internal base class, fixes (#128367)

Summary:

## Remove etcd deps
Converted tests to a non-etcd based rdzv handler so that the tests don't depend on an etcd server.

## Adopt PyTorch test conventions
- test file name follows the `test_TESTS.py` pattern
- test base class is torch.testing._internal.common_utils.TestCase
- include a `__main__` handler

## Reduce test timing (used to take > 300 seconds)
3.05s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_init_method_env_with_torchelastic
2.59s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_init_method_tcp_with_torchelastic
2.33s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_elastic_worker_raise_exception
2.33s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_run_path
2.30s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_nproc_launch_auto_configurations
2.24s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_is_torchelastic_launched_with_logs_spec_defined
2.24s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_is_torchelastic_launched
2.17s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_elastic_multiple_agents
2.12s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_elastic
2.08s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_nproc_gpu_launch_configurations
1.32s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_standalone
1.05s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_nproc_launch_number_configurations
1.05s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_with_env_vars
1.05s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_user_script_python
1.05s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_user_script_python_caffe2_bc
1.04s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_user_script_bash
1.03s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_user_script_default_nproc
0.04s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_logs_logs_spec_entrypoint_must_be_defined
0.01s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_elastic_agent_raise_exception
0.01s call test/distributed/launcher/run_test.py::ElasticLaunchTest::test_launch_shutdown

Test Plan: pytest --durations=0 test/distributed/launcher/run_test.py

Differential Revision: D58388182

Pull Request resolved: https://github.com/pytorch/pytorch/pull/128367
Approved by: https://github.com/d4l3k

commit: 6af4c6acad5a352f2f974c73aba3a03535b20a0b [log] [tgz]
author: Kurman Karabukaev <kurman@meta.com> Wed Jun 12 01:03:40 2024 +0000
committer: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com> Wed Jun 12 01:03:40 2024 +0000
tree: 82bef471cd740cb32cb2f89c5fb2947539823ac7
parent: 786c24a4cd84a085173e72b59e9d0c356f923249 [diff]
diff --git a/test/distributed/launcher/run_test.py b/test/distributed/launcher/test_run.py
similarity index 89%
rename from test/distributed/launcher/run_test.py
rename to test/distributed/launcher/test_run.py
index c816042..ba58aec 100644
--- a/test/distributed/launcher/run_test.py
+++ b/test/distributed/launcher/test_run.py

@@ -13,7 +13,6 @@
 import subprocess
 import sys
 import tempfile
-import unittest
 import uuid
 from contextlib import closing
 from unittest import mock
@@ -23,12 +22,13 @@
 from torch.distributed.elastic.agent.server.api import RunResult, WorkerState
 from torch.distributed.elastic.multiprocessing import DefaultLogsSpecs
 from torch.distributed.elastic.multiprocessing.errors import ChildFailedError
-from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer
 from torch.distributed.elastic.utils import get_socket_with_port
 from torch.distributed.elastic.utils.distributed import get_free_port
 from torch.testing._internal.common_utils import (
+    run_tests,
     skip_but_pass_in_sandcastle_if,
     TEST_WITH_DEV_DBG_ASAN,
+    TestCase,
 )
 
 
@@ -63,19 +63,7 @@
     pass
 
 
-class ElasticLaunchTest(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        # start a standalone, single process etcd server to use for all tests
-        cls._etcd_server = EtcdServer()
-        cls._etcd_server.start()
-        cls._etcd_endpoint = cls._etcd_server.get_endpoint()
-
-    @classmethod
-    def tearDownClass(cls):
-        # stop the standalone etcd server
-        cls._etcd_server.stop()
-
+class ElasticLaunchTest(TestCase):
     def setUp(self):
         self.test_dir = tempfile.mkdtemp()
 
@@ -103,8 +91,6 @@
         args = [
             f"--nnodes={nnodes}",
             f"--nproc-per-node={nproc_per_node}",
-            "--rdzv-backend=etcd",
-            f"--rdzv-endpoint={self._etcd_endpoint}",
             f"--rdzv-id={run_id}",
             "--monitor-interval=1",
             "--start-method=spawn",
@@ -156,8 +142,6 @@
         args = [
             f"--nnodes={nnodes}",
             f"--nproc-per-node={nproc_per_node}",
-            "--rdzv-backend=etcd",
-            f"--rdzv-endpoint={self._etcd_endpoint}",
             f"--rdzv-id={run_id}",
             "--monitor-interval=1",
             "--start-method=spawn",
@@ -187,8 +171,6 @@
         world_size = 1
         args = [
             f"--nnodes={nnodes}",
-            "--rdzv-backend=etcd",
-            f"--rdzv-endpoint={self._etcd_endpoint}",
             f"--rdzv-id={run_id}",
             "--monitor-interval=1",
             "--start-method=spawn",
@@ -220,8 +202,6 @@
 
         os.environ["PET_NNODES"] = str(nnodes)
         os.environ["PET_NPROC_PER_NODE"] = str(nproc_per_node)
-        os.environ["PET_RDZV_BACKEND"] = "etcd"
-        os.environ["PET_RDZV_ENDPOINT"] = self._etcd_endpoint
         os.environ["PET_RDZV_ID"] = run_id
         os.environ["PET_MONITOR_INTERVAL"] = "1"
         os.environ["PET_START_METHOD"] = "spawn"
@@ -250,8 +230,6 @@
         args = [
             f"--nnodes={nnodes}",
             f"--nproc-per-node={nproc_type}",
-            "--rdzv-backend=etcd",
-            f"--rdzv-endpoint={self._etcd_endpoint}",
             f"--rdzv-id={run_id}",
             "--monitor-interval=1",
             "--start-method=spawn",
@@ -272,7 +250,8 @@
     @skip_but_pass_in_sandcastle_if(
         TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
-    def test_nproc_launch_auto_configurations(self):
+    @patch("torch.cuda.is_available", return_value=False)
+    def test_nproc_launch_auto_configurations(self, _mock1):
         self._test_nproc_launch_configuration("auto", os.cpu_count())
 
     @skip_but_pass_in_sandcastle_if(
@@ -310,8 +289,9 @@
         args = [
             f"--nnodes={min_nodes}:{max_nodes}",
             f"--nproc-per-node={nproc_per_node}",
-            "--rdzv-backend=etcd",
-            f"--rdzv-endpoint={self._etcd_endpoint}",
+            "--rdzv-backend=c10d",
+            f"--rdzv-endpoint=localhost:{get_free_port()}",
+            "--rdzv-conf='join_timeout=5,last_call_timeout=1,timeout=5'",
             f"--rdzv-id={run_id}",
             "--monitor-interval=1",
             "--start-method=spawn",
@@ -343,8 +323,9 @@
         args = [
             f"--nnodes={min_nodes}:{max_nodes}",
             f"--nproc-per-node={nproc_per_node}",
-            "--rdzv-backend=etcd",
-            f"--rdzv-endpoint={self._etcd_endpoint}",
+            "--rdzv-backend=c10d",
+            f"--rdzv-endpoint=localhost:{get_free_port()}",
+            "--rdzv-conf='join_timeout=5,last_call_timeout=1,timeout=5'",
             f"--rdzv-id={run_id}",
             "--monitor-interval=1",
             "--max-restarts=0",
@@ -376,8 +357,9 @@
         args = [
             f"--nnodes={min_nodes}:{max_nodes}",
             f"--nproc-per-node={nproc_per_node}",
-            "--rdzv-backend=etcd",
-            f"--rdzv-endpoint={self._etcd_endpoint}",
+            "--rdzv-backend=c10d",
+            f"--rdzv-endpoint=localhost:{get_free_port()}",
+            "--rdzv_conf=timeout=5",
             f"--rdzv-id={run_id}",
             "--monitor-interval=1",
             "--max-restarts=0",
@@ -452,8 +434,9 @@
         args = [
             f"--nnodes={min_nodes}:{max_nodes}",
             f"--nproc-per-node={nproc_per_node}",
-            "--rdzv-backend=etcd",
-            f"--rdzv-endpoint={self._etcd_endpoint}",
+            "--rdzv-backend=c10d",
+            f"--rdzv-endpoint=localhost:{get_free_port()}",
+            "--rdzv_conf=timeout=5",
             f"--rdzv-id={run_id}",
             "--monitor-interval=1",
             "--start-method=spawn",
@@ -608,21 +591,6 @@
                 is_torchelastic_launched = fp.readline()
                 self.assertEqual("False", is_torchelastic_launched)
 
-    def test_init_method_tcp(self):
-        port = get_free_port()
-        with patch.object(
-            sys,
-            "argv",
-            [
-                path("bin/test_script_init_method.py"),
-                f"--init-method=tcp://localhost:{port}",
-                "--rank=0",
-                "--world-size=1",
-            ],
-        ):
-            runpy.run_path(sys.argv[0], run_name="__main__")
-            # nothing to validate, just make sure it runs
-
     @skip_but_pass_in_sandcastle_if(
         TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
@@ -642,27 +610,6 @@
         )
         # nothing to validate, just make sure it runs
 
-    def test_init_method_env(self):
-        port = get_free_port()
-        with patch.dict(
-            os.environ,
-            {
-                "RANK": "0",
-                "WORLD_SIZE": "1",
-                "MASTER_ADDR": "localhost",
-                "MASTER_PORT": str(port),
-            },
-        ), patch.object(
-            sys,
-            "argv",
-            [
-                path("bin/test_script_init_method.py"),
-                "--init-method=env://",
-            ],
-        ):
-            runpy.run_path(sys.argv[0], run_name="__main__")
-            # nothing to validate, just make sure it runs
-
     @skip_but_pass_in_sandcastle_if(
         TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
@@ -681,3 +628,7 @@
             ]
         )
         # nothing to validate, just make sure it runs
+
+
+if __name__ == "__main__":
+    run_tests()
commit	6af4c6acad5a352f2f974c73aba3a03535b20a0b	[log] [tgz]
author	Kurman Karabukaev <kurman@meta.com>	Wed Jun 12 01:03:40 2024 +0000
committer	PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>	Wed Jun 12 01:03:40 2024 +0000
tree	82bef471cd740cb32cb2f89c5fb2947539823ac7
parent	786c24a4cd84a085173e72b59e9d0c356f923249 [diff]