Revert "[RFC] add per-collective timeout value in flight recorder (#128190)" This reverts commit 09cccbc1c74c9d1157c1caca5526e79ee9b7ea01. Reverted https://github.com/pytorch/pytorch/pull/128190 on behalf of https://github.com/atalman due to Sorry need to revert this, in conflict with https://github.com/pytorch/pytorch/pull/127651 that needs reverting ([comment](https://github.com/pytorch/pytorch/pull/128190#issuecomment-2156075318))

commit: 57a24c4fdb58b82724b4f3d55d3af105a660bf39 [log] [tgz]
author: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com> Sat Jun 08 15:25:25 2024 +0000
committer: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com> Sat Jun 08 15:25:27 2024 +0000
tree: 13a6a828ab30437e61beb3943a2c58b884e05c57
parent: 348b181a97abc2e636a6c18e5880a78e5d1dab94 [diff]
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index f45600c..21a8a63 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py

@@ -3548,7 +3548,7 @@
                 )
             )
         ver = t["version"]
-        self.assertEqual(ver, "2.2")
+        self.assertEqual(ver, "2.1")
         pg_config = t["pg_config"]
         self.assertEqual(len(pg_config), 1)
         default_pg_info = pg_config["0"]
@@ -3577,7 +3577,6 @@
             self.assertEqual(last["output_sizes"], ((3, 4),))
             self.assertEqual(last["output_dtypes"], ["Float"])
             self.assertEqual(last["collective_seq_id"], 2)
-            self.assertEqual(last["timeout_ms"], 600000)
             now = datetime.now()
             event_created_time = datetime.fromtimestamp(
                 last["time_created_ns"] / 1000000000
@@ -3662,7 +3661,6 @@
         self.assertEqual(last["input_dtypes"], ["Float"])
         self.assertEqual(last["output_sizes"], ((3, 4),))
         self.assertEqual(last["output_dtypes"], ["Float"])
-        self.assertEqual(last["timeout_ms"], 600000)
         self.assertEqual(last["collective_seq_id"] - first["collective_seq_id"], 9)
 
     @requires_nccl()
@@ -3867,7 +3865,6 @@
                 self.assertTrue(0.001 < duration < 10000, duration)
             else:
                 self.assertTrue("duration_ms" not in t["entries"][coalesced_op])
-            self.assertEqual(t["entries"][coalesced_op]["timeout_ms"], 600000)
 
     @requires_nccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")

diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 07bbcd5..8adf1e0 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

@@ -2356,7 +2356,6 @@
         outputs,
         r->ncclStartEvent_.get(),
         r->ncclEndEvent_.get(),
-        options_->timeout,
         isP2P);
   }
   return r;
@@ -2967,7 +2966,6 @@
         {tensor},
         nullptr,
         nullptr,
-        options_->timeout,
         /*isP2P=*/true);
     // TODO(whc) if we want to make the per-p2p-op flightrecorder entries get
     // their timings/states updated by proxy when the Work obj representing the
@@ -3001,7 +2999,6 @@
         {tensor},
         work->ncclStartEvent_.get(),
         work->ncclEndEvent_.get(),
-        options_->timeout,
         /*isP2P=*/true);
   }
 

diff --git a/torch/csrc/distributed/c10d/TraceUtils.h b/torch/csrc/distributed/c10d/TraceUtils.h
index de623d7..c3b0464 100644
--- a/torch/csrc/distributed/c10d/TraceUtils.h
+++ b/torch/csrc/distributed/c10d/TraceUtils.h

@@ -8,7 +8,6 @@
 #include <torch/csrc/distributed/c10d/Utils.hpp>
 #include <torch/csrc/jit/serialization/pickler.h>
 #include <torch/csrc/profiler/combined_traceback.h>
-#include <chrono>
 
 #ifdef USE_C10D_NCCL
 #include <ATen/cuda/CUDAEvent.h>
@@ -29,7 +28,7 @@
 static c10::IValue version_key = "version";
 // Update whenever changing contents or formatting of the dump
 // (minor when adding fields, major when changing existing fields)
-static c10::IValue version_val = "2.2";
+static c10::IValue version_val = "2.1";
 static c10::IValue pg_config_key = "pg_config";
 static c10::IValue record_id_key = "record_id";
 static c10::IValue pg_id_key = "pg_id";
@@ -45,7 +44,6 @@
 static c10::IValue output_dtypes_key = "output_dtypes";
 static c10::IValue time_created_key = "time_created_ns";
 static c10::IValue duration_key = "duration_ms";
-static c10::IValue timeout_key = "timeout_ms";
 
 static c10::IValue frames_key = "frames";
 static c10::IValue state_key = "state";
@@ -463,9 +461,6 @@
     // was 'enqueued'- not necessarily started
     c10::time_t time_created_;
 
-    // configured timeout for this entry
-    c10::time_t timeout_ms_;
-
     // Is this a P2P event?
     bool isP2P_;
 
@@ -513,7 +508,6 @@
       const std::vector<at::Tensor>& outputs,
       Event* start,
       Event* end,
-      std::chrono::milliseconds timeout_ms,
       bool isP2P) {
     if (!enabled_) {
       return c10::nullopt;
@@ -534,7 +528,6 @@
         std::move(start),
         std::move(end),
         c10::getTime(),
-        timeout_ms.count(),
         isP2P};
 
     for (const auto& input : inputs) {
@@ -759,7 +752,6 @@
               ? int64_t(*e.time_discovered_completed_)
               : c10::IValue());
       dict.insert(retired_key, e.retired_);
-      dict.insert(timeout_key, e.timeout_ms_);
       dict.insert(is_p2p_key, e.isP2P_);
 
       entries.push_back(dict);
commit	57a24c4fdb58b82724b4f3d55d3af105a660bf39	[log] [tgz]
author	PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>	Sat Jun 08 15:25:25 2024 +0000
committer	PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>	Sat Jun 08 15:25:27 2024 +0000
tree	13a6a828ab30437e61beb3943a2c58b884e05c57
parent	348b181a97abc2e636a6c18e5880a78e5d1dab94 [diff]