Fix failure of test_dynamo_distributed & test_inductor_collectives (#117741)

When CUDA is not available, `c10d.init_process_group("nccl", ...)` fails with
> RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!

Hence add a `requires_cuda` skip decorator, next to the existing `requires_nccl` marker, to the test classes deriving from `DynamoDistributedSingleProcTestCase`.
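
For illustration, a minimal, self-contained sketch of how such a class-level skip works; `MyDistributedTest` and its test body are hypothetical, while `requires_cuda` mirrors the `unittest.skipUnless` helper added in this PR:

```python
import unittest
import torch

# Skip an entire test class when no CUDA device is present, so that
# NCCL-backed process groups are never initialized on CPU-only machines.
requires_cuda = unittest.skipUnless(torch.cuda.is_available(), "Requires CUDA")

@requires_cuda
class MyDistributedTest(unittest.TestCase):  # hypothetical example class
    def test_something(self):
        # Only runs when torch.cuda.is_available() is True.
        self.assertTrue(torch.cuda.is_available())

if __name__ == "__main__":
    unittest.main()
```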

Pull Request resolved: https://github.com/pytorch/pytorch/pull/117741
Approved by: https://github.com/ezyang, https://github.com/malfet
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 00d85c3..bc3705b 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -30,6 +30,7 @@
     requires_nccl,
     _dynamo_dist_per_rank_init,
 )
+from torch.testing._internal.common_utils import requires_cuda
 import torch._dynamo.logging
 from torch.testing._internal.common_cuda import (
     PLATFORM_SUPPORTS_FLASH_ATTENTION, PLATFORM_SUPPORTS_MEM_EFF_ATTENTION
@@ -546,6 +547,7 @@
 
 
 @requires_nccl()
+@requires_cuda
 class TestSingleProc(DynamoDistributedSingleProcTestCase):
     """
     Test harness initializes dist process group.
diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py
index 85c34cc..82a59bf 100644
--- a/test/distributed/test_inductor_collectives.py
+++ b/test/distributed/test_inductor_collectives.py
@@ -19,6 +19,7 @@
     requires_nccl,
     skip_if_lt_x_gpu,
 )
+from torch.testing._internal.common_utils import requires_cuda
 from torch._inductor.compile_fx import compile_fx as inductor_compile_fx
 from torch.utils._triton import has_triton
 from torch._inductor.utils import run_and_get_triton_code
@@ -524,6 +525,7 @@
 
 
 @requires_nccl()
+@requires_cuda
 class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
     """
     Prefer single-proc test runner for basic tests as it is easier to work with.
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 98d45d9..e08dcf8 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1295,6 +1295,7 @@
     # other libraries take up about 11% of space per process
     torch.cuda.set_per_process_memory_fraction(round(1 / num_procs - .11, 2))
 
+requires_cuda = unittest.skipUnless(torch.cuda.is_available(), "Requires CUDA")
 
 def skipIfCrossRef(fn):
     @wraps(fn)