Revert "[c10d] Barrier uses stream sync instead of device sync (#117804)"
This reverts commit 0f6bbb1c070c3a9713893659377e20e147c125f6.
Reverted https://github.com/pytorch/pytorch/pull/117804 on behalf of https://github.com/clee2000 due to the docs test failure being real: the lines after the `.. note::` directive appear to need indentation (https://github.com/pytorch/pytorch/actions/runs/7616827874/job/20745016788). Marking as nosignal due to bad Dr. CI categorization ([comment](https://github.com/pytorch/pytorch/pull/117804#issuecomment-1904882487))
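For context on the docs failure cited above: reStructuredText requires a directive's continuation lines to be indented under `.. note::`. The snippet below is a minimal, hypothetical sketch of how the note from #117804 could be re-landed with that indentation; the `barrier` signature is abbreviated and the exact spacing is only an assumption about what the Sphinx check expects, not part of this commit.

```python
def barrier(group=None, async_op=False, device_ids=None):  # signature abbreviated
    """
    Synchronizes all processes.

    .. note:: `ProcessGroupNCCL` now relies on stream synchronization instead
        of device synchronization to block the CPU. Thus, please do not assume
        that `barrier()` would perform a device synchronization.
    """
```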
diff --git a/third_party/pocketfft b/third_party/pocketfft
index ea778e3..9d3ab05 160000
--- a/third_party/pocketfft
+++ b/third_party/pocketfft
@@ -1 +1 @@
-Subproject commit ea778e37710c07723435b1be58235996d1d43a5a
+Subproject commit 9d3ab05a7fffbc71a492bc6a17be034e83e8f0fe
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 5572cce..cae9a9c 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -655,20 +655,10 @@
   // Device synchronize only after we've completed timeout checks.
   if (!barrierTensors_.empty()) {
     // If we use the work to do barrier, we should block here
+    at::cuda::OptionalCUDAGuard gpuGuard;
     for (auto& device : devices_) {
-      // `dist.barrier()` only requires all CPU processes to enter this
-      // function, hence we only need to make sure the dummy all-reduce has
-      // completed. So we would only need to sync the **current stream** back to
-      // host, and do not need to synchronize the entire device (which may have
-      // kernels running on other streams).
-      // Using `cudaStreamSynchronize` instead of `cudaDeviceSynchronize` can:
-      // - lower chance of hang;
-      // - CurrentCUDAStream is usually the context of the next operation in
-      //   Python, thus blocking current stream would already block the next
-      //   compute kernel;
-      // - achieve better barrier performance.
-      auto currentStream = at::cuda::getCurrentCUDAStream(device.index());
-      AT_CUDA_CHECK(cudaStreamSynchronize(currentStream));
+      gpuGuard.set_index(device.index());
+      AT_CUDA_CHECK(cudaDeviceSynchronize());
     }
   }
 }
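For reference, the restored `cudaDeviceSynchronize()` waits for work on every stream of the device, whereas the removed code only waited on the current stream. Below is a hedged Python-side sketch, not part of this patch, using standard `torch.distributed` / `torch.cuda` calls to show how a caller can request the stronger device-wide guarantee explicitly regardless of which behavior `barrier()` provides.

```python
import torch
import torch.distributed as dist

def barrier_then_device_sync():
    # Wait until every rank has entered the barrier (the dummy all-reduce
    # issued by ProcessGroupNCCL has completed on the current stream).
    dist.barrier()
    # Additionally wait for kernels on *all* streams of the current device,
    # matching the cudaDeviceSynchronize() behavior this revert restores.
    torch.cuda.synchronize()
```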
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 82d5d35..458ce0e 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -3530,10 +3530,6 @@
     Returns:
         Async work handle, if async_op is set to True.
         None, if not async_op or if not part of the group
-
-    .. note:: `ProcessGroupNCCL` now relies on stream synchronization instead of
-    device synchronization to block the CPU. Thus, please do not assume that
-    `barrier()` would perform a device synchronization.
     """
     if _rank_not_in_group(group):
         _warn_not_in_group("barrier")
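As a usage note for the `Returns:` section kept above, here is a hedged sketch of the async path; it assumes a process group has already been initialized on every rank (e.g. via `dist.init_process_group`).

```python
import torch.distributed as dist

# Assumes dist.init_process_group(...) has already been called on every rank.
work = dist.barrier(async_op=True)
if work is not None:
    # None is returned when async_op is False or this rank is not in the group;
    # otherwise wait() blocks until all ranks have entered the barrier.
    work.wait()
```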
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 1060a10..ccc4f10 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -9983,7 +9983,7 @@
         )
         @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
         @skip_but_pass_in_sandcastle_if(
-            True, "Skipped due to flakiness"
+            BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally"
         )
         def test_ddp_hook_pickling_powerSGD(self):