Revert "[c10d] Barrier uses stream sync instead of device sync (#117804)"

This reverts commit 0f6bbb1c070c3a9713893659377e20e147c125f6.

Reverted https://github.com/pytorch/pytorch/pull/117804 on behalf of https://github.com/clee2000: sorry, the docs test failure is real; I think it wants the lines after the `.. note::` directive to be indented, see https://github.com/pytorch/pytorch/actions/runs/7616827874/job/20745016788. Marking as nosignal due to bad Dr. CI categorization ([comment](https://github.com/pytorch/pytorch/pull/117804#issuecomment-1904882487))
diff --git a/third_party/pocketfft b/third_party/pocketfft
index ea778e3..9d3ab05 160000
--- a/third_party/pocketfft
+++ b/third_party/pocketfft
@@ -1 +1 @@
-Subproject commit ea778e37710c07723435b1be58235996d1d43a5a
+Subproject commit 9d3ab05a7fffbc71a492bc6a17be034e83e8f0fe
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 5572cce..cae9a9c 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -655,20 +655,10 @@
   // Device synchronize only after we've completed timeout checks.
   if (!barrierTensors_.empty()) {
     // If we use the work to do barrier, we should block here
+    at::cuda::OptionalCUDAGuard gpuGuard;
     for (auto& device : devices_) {
-      // `dist.barrier()` only requires all CPU processes to enter this
-      // function, hence we only need to make sure the dummy all-reduce has
-      // completed. So we would only need to sync the **current stream** back to
-      // host, and do not need to synchronize the entire device (which may have
-      // kernels running on other streams).
-      // Using `cudaStreamSynchronize` instead of `cudaDeviceSynchronize` can:
-      // - lower chance of hang;
-      // - CurrentCUDAStream is usually the context of the next operation in
-      // Python, thus blocking current stream would already block the next
-      // compute kernel;
-      // - achieve better barrier performance.
-      auto currentStream = at::cuda::getCurrentCUDAStream(device.index());
-      AT_CUDA_CHECK(cudaStreamSynchronize(currentStream));
+      gpuGuard.set_index(device.index());
+      AT_CUDA_CHECK(cudaDeviceSynchronize());
     }
   }
 }
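
For context on what this hunk toggles at runtime: the deleted comment argues that `barrier()` only needs to wait on the dummy all-reduce enqueued on the current stream, while the restored code blocks on the entire device. Below is a minimal user-level sketch of the two blocking guarantees, expressed with the public `torch.cuda` APIs rather than the internal `ProcessGroupNCCL.cpp` code path; the helper names are hypothetical and a NCCL process group with one GPU per rank is assumed to be already initialized.

```python
# Sketch only: contrasts the two synchronization strategies the hunk above
# toggles, from the caller's point of view. Assumes dist.init_process_group
# has already been called and each rank owns a single CUDA device.
import torch
import torch.distributed as dist


def barrier_then_stream_sync(device: torch.device) -> None:
    # Reverted (PR #117804) behavior, made explicit: wait only for work on
    # the *current* stream (which carries the dummy all-reduce backing the
    # barrier); kernels on other streams may keep running.
    dist.barrier()
    torch.cuda.current_stream(device).synchronize()


def barrier_then_device_sync(device: torch.device) -> None:
    # Restored behavior, made explicit: block the CPU until *all* work on
    # the device has finished, regardless of which stream launched it.
    dist.barrier()
    torch.cuda.synchronize(device)
```
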
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 82d5d35..458ce0e 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -3530,10 +3530,6 @@
     Returns:
         Async work handle, if async_op is set to True.
         None, if not async_op or if not part of the group
-
-    .. note:: `ProcessGroupNCCL` now relies on stream synchronization instead of
-    device synchronization to block the CPU. Thus, please do not assume that
-    `barrier()` would perform a device synchronization.
     """
     if _rank_not_in_group(group):
         _warn_not_in_group("barrier")
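
The docs failure cited in the revert message comes from the `.. note::` block removed above: RST requires a directive's body to be indented relative to the directive marker. A sketch of how the docstring fragment would need to look before relanding #117804 (the signature is simplified and the wording is copied from the removed lines; only the indentation differs):

```python
# Illustrative only: the removed note re-indented so the docs build accepts it,
# shown as a fragment of the barrier() docstring (signature simplified).
def barrier(group=None, async_op=False, device_ids=None):
    """
    Returns:
        Async work handle, if async_op is set to True.
        None, if not async_op or if not part of the group

    .. note:: `ProcessGroupNCCL` now relies on stream synchronization instead
        of device synchronization to block the CPU. Thus, please do not assume
        that `barrier()` would perform a device synchronization.
    """
```
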
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 1060a10..ccc4f10 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -9983,7 +9983,7 @@
         )
         @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
         @skip_but_pass_in_sandcastle_if(
-            True, "Skipped due to flakiness"
+            BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally"
         )
         def test_ddp_hook_pickling_powerSGD(self):