Revert "[RFC] add per-collective timeout value in flight recorder (#128190)"
This reverts commit 09cccbc1c74c9d1157c1caca5526e79ee9b7ea01.
Reverted https://github.com/pytorch/pytorch/pull/128190 on behalf of https://github.com/atalman due to Sorry need to revert this, in conflict with https://github.com/pytorch/pytorch/pull/127651 that needs reverting ([comment](https://github.com/pytorch/pytorch/pull/128190#issuecomment-2156075318))
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index f45600c..21a8a63 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -3548,7 +3548,7 @@
)
)
ver = t["version"]
- self.assertEqual(ver, "2.2")
+ self.assertEqual(ver, "2.1")
pg_config = t["pg_config"]
self.assertEqual(len(pg_config), 1)
default_pg_info = pg_config["0"]
@@ -3577,7 +3577,6 @@
self.assertEqual(last["output_sizes"], ((3, 4),))
self.assertEqual(last["output_dtypes"], ["Float"])
self.assertEqual(last["collective_seq_id"], 2)
- self.assertEqual(last["timeout_ms"], 600000)
now = datetime.now()
event_created_time = datetime.fromtimestamp(
last["time_created_ns"] / 1000000000
@@ -3662,7 +3661,6 @@
self.assertEqual(last["input_dtypes"], ["Float"])
self.assertEqual(last["output_sizes"], ((3, 4),))
self.assertEqual(last["output_dtypes"], ["Float"])
- self.assertEqual(last["timeout_ms"], 600000)
self.assertEqual(last["collective_seq_id"] - first["collective_seq_id"], 9)
@requires_nccl()
@@ -3867,7 +3865,6 @@
self.assertTrue(0.001 < duration < 10000, duration)
else:
self.assertTrue("duration_ms" not in t["entries"][coalesced_op])
- self.assertEqual(t["entries"][coalesced_op]["timeout_ms"], 600000)
@requires_nccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 07bbcd5..8adf1e0 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -2356,7 +2356,6 @@
outputs,
r->ncclStartEvent_.get(),
r->ncclEndEvent_.get(),
- options_->timeout,
isP2P);
}
return r;
@@ -2967,7 +2966,6 @@
{tensor},
nullptr,
nullptr,
- options_->timeout,
/*isP2P=*/true);
// TODO(whc) if we want to make the per-p2p-op flightrecorder entries get
// their timings/states updated by proxy when the Work obj representing the
@@ -3001,7 +2999,6 @@
{tensor},
work->ncclStartEvent_.get(),
work->ncclEndEvent_.get(),
- options_->timeout,
/*isP2P=*/true);
}
diff --git a/torch/csrc/distributed/c10d/TraceUtils.h b/torch/csrc/distributed/c10d/TraceUtils.h
index de623d7..c3b0464 100644
--- a/torch/csrc/distributed/c10d/TraceUtils.h
+++ b/torch/csrc/distributed/c10d/TraceUtils.h
@@ -8,7 +8,6 @@
#include <torch/csrc/distributed/c10d/Utils.hpp>
#include <torch/csrc/jit/serialization/pickler.h>
#include <torch/csrc/profiler/combined_traceback.h>
-#include <chrono>
#ifdef USE_C10D_NCCL
#include <ATen/cuda/CUDAEvent.h>
@@ -29,7 +28,7 @@
static c10::IValue version_key = "version";
// Update whenever changing contents or formatting of the dump
// (minor when adding fields, major when changing existing fields)
-static c10::IValue version_val = "2.2";
+static c10::IValue version_val = "2.1";
static c10::IValue pg_config_key = "pg_config";
static c10::IValue record_id_key = "record_id";
static c10::IValue pg_id_key = "pg_id";
@@ -45,7 +44,6 @@
static c10::IValue output_dtypes_key = "output_dtypes";
static c10::IValue time_created_key = "time_created_ns";
static c10::IValue duration_key = "duration_ms";
-static c10::IValue timeout_key = "timeout_ms";
static c10::IValue frames_key = "frames";
static c10::IValue state_key = "state";
@@ -463,9 +461,6 @@
// was 'enqueued'- not necessarily started
c10::time_t time_created_;
- // configured timeout for this entry
- c10::time_t timeout_ms_;
-
// Is this a P2P event?
bool isP2P_;
@@ -513,7 +508,6 @@
const std::vector<at::Tensor>& outputs,
Event* start,
Event* end,
- std::chrono::milliseconds timeout_ms,
bool isP2P) {
if (!enabled_) {
return c10::nullopt;
@@ -534,7 +528,6 @@
std::move(start),
std::move(end),
c10::getTime(),
- timeout_ms.count(),
isP2P};
for (const auto& input : inputs) {
@@ -759,7 +752,6 @@
? int64_t(*e.time_discovered_completed_)
: c10::IValue());
dict.insert(retired_key, e.retired_);
- dict.insert(timeout_key, e.timeout_ms_);
dict.insert(is_p2p_key, e.isP2P_);
entries.push_back(dict);