[c10d] add more fields for periodic logging (#123860) Summary: Added the names of the last enquened, started and completed colletives, in addition to their seq ID Test Plan: CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/123860 Approved by: https://github.com/XilunWu

commit: 22ba180e552aa05b21d07ccceece0db41846267b [log] [tgz]
author: Shuqiang Zhang <sqzhang@meta.com> Thu Apr 11 14:06:21 2024 -0700
committer: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com> Fri Apr 12 00:11:07 2024 +0000
tree: c698e89621f52dec7d0a5330ecd5cdc8a303959e
parent: 78824fd212d236e3a467d81245f53b5ff532c934 [diff]
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index def79cd..8f2d636 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

@@ -1551,11 +1551,17 @@
             lastStatusUpdateTime, std::chrono::steady_clock::now()) >=
             kWorkStatusUpdatePeriodMs) {
       ::c10d::C10dLoggingData data;
+      // logging integers
       data.integers["pg_id"] = uid_;
       data.integers["rank"] = rank_;
       data.integers["global_rank"] = globalRank();
       data.integers["last_enqueued_work"] = lastEnqueuedSeq_;
+      data.integers["last_started_work"] = lastStartedSeq_;
       data.integers["last_completed_work"] = lastCompletedSeq_;
+      // logging strings
+      data.strings["last_enqueued_work_name"] = lastEnqueuedWorkName_;
+      data.strings["last_started_work_name"] = lastStartedWorkName_;
+      data.strings["last_completed_work_name"] = lastCompletedWorkName_;
       logger->log(data);
       lastStatusUpdateTime = std::chrono::steady_clock::now();
     }
@@ -1647,9 +1653,19 @@
         }
       }
 
+      // a work could be started but not completed, so we should not update
+      // lastStartedSeq_ and lastStartedOpName_ if the work state is checked
+      // multiple times after the start
+      if (lastStartedSeq_ < static_cast<int64_t>(work.seq_) &&
+          work.isStarted()) {
+        lastStartedSeq_ = work.seq_;
+        lastStartedWorkName_ = opTypeToString(work.opType_);
+      }
+
       // Clean up completed work
       if (work.isCompleted()) {
         lastCompletedSeq_ = work.seq_;
+        lastCompletedWorkName_ = opTypeToString(work.opType_);
         NCCLTraceBuffer::get()->retire_id(work.trace_id_, true);
         if (onCompletionHook_) {
           // Move Work object to completedWorkList_ to be consumed by the hook
@@ -2259,6 +2275,7 @@
     // get deadlock. Here we enqueue work without outputs_.
     workMetaList_.emplace_back(*work);
     lastEnqueuedSeq_ = work->seq_;
+    lastEnqueuedWorkName_ = opTypeToString(work->opType_);
     lastWorkListUpdateTime_ = std::chrono::steady_clock::now();
   }
 }

diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
index ea90079..18201db 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp

@@ -1072,11 +1072,23 @@
   // initialized to be -1 to indicate no collective has been enqueued
   int64_t lastEnqueuedSeq_{-1};
 
+  // the name of the last collective enqueued into workMetaList_
+  std::string lastEnqueuedWorkName_;
+
+  // the sequential number of the last colletive started as the kernal
+  int64_t lastStartedSeq_{-1};
+
+  // the name of the last collective started as the kernal
+  std::string lastStartedWorkName_;
+
   // the sequential number of the last colletive completed marked by
   // the watchdog thread
   // initialized to be -1 to indicate no collective has been completed
   int64_t lastCompletedSeq_{-1};
 
+  // the name of the last collective completed
+  std::string lastCompletedWorkName_;
+
   std::exception_ptr watchDogException_ = nullptr;
 
   size_t uid_;
commit	22ba180e552aa05b21d07ccceece0db41846267b	[log] [tgz]
author	Shuqiang Zhang <sqzhang@meta.com>	Thu Apr 11 14:06:21 2024 -0700
committer	PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>	Fri Apr 12 00:11:07 2024 +0000
tree	c698e89621f52dec7d0a5330ecd5cdc8a303959e
parent	78824fd212d236e3a467d81245f53b5ff532c934 [diff]