Fix failure of test_dynamo_distributed & test_inductor_collectives (#117741)
When CUDA is not available, `c10d.init_process_group("nccl", ...)` fails with
> RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
Hence, add a corresponding `requires_cuda` skip marker to the classes deriving from `DynamoDistributedSingleProcTestCase`, next to the existing `requires_nccl` marker, so these tests are skipped rather than erroring on CUDA-less machines.
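
For context, a minimal sketch of how the new marker is intended to be used. The test class and body below are hypothetical; the `requires_cuda` one-liner mirrors the definition added to `torch/testing/_internal/common_utils.py` in this diff:

```python
# Sketch only: TestNcclSketch and its test body are made up for illustration.
import unittest
import torch
import torch.distributed as c10d

# Same definition this PR adds to common_utils.py: skip unless a CUDA device is visible.
requires_cuda = unittest.skipUnless(torch.cuda.is_available(), "Requires CUDA")

@requires_cuda
class TestNcclSketch(unittest.TestCase):
    def test_init_nccl(self):
        # Only reached when torch.cuda.is_available() is True, so
        # ProcessGroupNCCL does not raise "no GPUs found!" during init.
        c10d.init_process_group(
            "nccl", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1
        )
        c10d.destroy_process_group()


if __name__ == "__main__":
    unittest.main()
```

Because `unittest.skipUnless` evaluates its condition at import time and applies to the whole class, the decorated tests report as skipped on CPU-only runners instead of failing inside `ProcessGroupNCCL`.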
Pull Request resolved: https://github.com/pytorch/pytorch/pull/117741
Approved by: https://github.com/ezyang, https://github.com/malfet
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 00d85c3..bc3705b 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -30,6 +30,7 @@
requires_nccl,
_dynamo_dist_per_rank_init,
)
+from torch.testing._internal.common_utils import requires_cuda
import torch._dynamo.logging
from torch.testing._internal.common_cuda import (
PLATFORM_SUPPORTS_FLASH_ATTENTION, PLATFORM_SUPPORTS_MEM_EFF_ATTENTION
@@ -546,6 +547,7 @@
@requires_nccl()
+@requires_cuda
class TestSingleProc(DynamoDistributedSingleProcTestCase):
"""
Test harness initializes dist process group.
diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py
index 85c34cc..82a59bf 100644
--- a/test/distributed/test_inductor_collectives.py
+++ b/test/distributed/test_inductor_collectives.py
@@ -19,6 +19,7 @@
requires_nccl,
skip_if_lt_x_gpu,
)
+from torch.testing._internal.common_utils import requires_cuda
from torch._inductor.compile_fx import compile_fx as inductor_compile_fx
from torch.utils._triton import has_triton
from torch._inductor.utils import run_and_get_triton_code
@@ -524,6 +525,7 @@
@requires_nccl()
+@requires_cuda
class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
"""
Prefer single-proc test runner for basic tests as it is easier to work with.
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 98d45d9..e08dcf8 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1295,6 +1295,7 @@
# other libraries take up about 11% of space per process
torch.cuda.set_per_process_memory_fraction(round(1 / num_procs - .11, 2))
+requires_cuda = unittest.skipUnless(torch.cuda.is_available(), "Requires CUDA")
def skipIfCrossRef(fn):
@wraps(fn)