Don't skip NCCL backend when testing all_reduce_cuda (#48231)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48231

Noticed that these tests were being skipped with the NCCL backend, but
there doesn't appear to be a valid reason to do so. Enabled these tests and verified
that they pass with 500 stress runs.
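
For reference, below is a minimal, self-contained sketch (not taken from the patched
test file; the rendezvous setup, tensor shape, and fill values are assumptions) of the
CUDA allReduce pattern these tests exercise, which with this change now also runs
under the NCCL backend:

    import torch
    import torch.distributed as dist

    def run_all_reduce_sum_cuda():
        # Assumes the standard env:// rendezvous variables (MASTER_ADDR,
        # MASTER_PORT, RANK, WORLD_SIZE) were set by the launcher.
        dist.init_process_group(backend="nccl", init_method="env://")
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        # Each rank contributes a CUDA tensor filled with (rank + 1); the
        # reduced sum is world_size * (world_size + 1) / 2 on every rank.
        device = torch.device("cuda", rank % torch.cuda.device_count())
        tensor = torch.full((10,), float(rank + 1), device=device)
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

        expected = world_size * (world_size + 1) / 2
        assert torch.allclose(tensor, torch.full_like(tensor, expected))
        dist.destroy_process_group()

With the relaxed skip condition, backends other than Gloo or NCCL still report these
tests as skipped rather than failed.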
ghstack-source-id: 117085209

Test Plan: CI

Reviewed By: SciPioneer

Differential Revision: D25079030

fbshipit-source-id: 8204288ffbd387375a1a86fe8c07243cfd855549
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 30f48ac..86f9392 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -1359,8 +1359,8 @@
             )
 
         @unittest.skipIf(
-            BACKEND != "gloo",
-            "Only Gloo backend will have CUDA allReduce tested",
+            BACKEND != "gloo" and BACKEND != "nccl",
+            "Only Gloo and NCCL backends will have CUDA allReduce tested",
         )
         @skip_if_no_gpu
         def test_all_reduce_sum_cuda(self):
@@ -1424,8 +1424,8 @@
                     dist.all_reduce(_build_tensor(1, dtype=torch.cfloat), unsupported_op, group_id)
 
         @unittest.skipIf(
-            BACKEND != "gloo",
-            "Only Gloo backend will have CUDA allReduce tested",
+            BACKEND != "gloo" and BACKEND != "nccl",
+            "Only Gloo and NCCL backends will have CUDA allReduce tested",
         )
         @skip_if_no_gpu
         def test_all_reduce_sum_cuda_complex(self):