[c10d] add more fields for periodic logging (#123860)
Summary:
Added the names of the last enquened, started and completed colletives,
in addition to their seq ID
Test Plan:
CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123860
Approved by: https://github.com/XilunWu
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index def79cd..8f2d636 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -1551,11 +1551,17 @@
lastStatusUpdateTime, std::chrono::steady_clock::now()) >=
kWorkStatusUpdatePeriodMs) {
::c10d::C10dLoggingData data;
+ // logging integers
data.integers["pg_id"] = uid_;
data.integers["rank"] = rank_;
data.integers["global_rank"] = globalRank();
data.integers["last_enqueued_work"] = lastEnqueuedSeq_;
+ data.integers["last_started_work"] = lastStartedSeq_;
data.integers["last_completed_work"] = lastCompletedSeq_;
+ // logging strings
+ data.strings["last_enqueued_work_name"] = lastEnqueuedWorkName_;
+ data.strings["last_started_work_name"] = lastStartedWorkName_;
+ data.strings["last_completed_work_name"] = lastCompletedWorkName_;
logger->log(data);
lastStatusUpdateTime = std::chrono::steady_clock::now();
}
@@ -1647,9 +1653,19 @@
}
}
+ // a work could be started but not completed, so we should not update
+ // lastStartedSeq_ and lastStartedOpName_ if the work state is checked
+ // multiple times after the start
+ if (lastStartedSeq_ < static_cast<int64_t>(work.seq_) &&
+ work.isStarted()) {
+ lastStartedSeq_ = work.seq_;
+ lastStartedWorkName_ = opTypeToString(work.opType_);
+ }
+
// Clean up completed work
if (work.isCompleted()) {
lastCompletedSeq_ = work.seq_;
+ lastCompletedWorkName_ = opTypeToString(work.opType_);
NCCLTraceBuffer::get()->retire_id(work.trace_id_, true);
if (onCompletionHook_) {
// Move Work object to completedWorkList_ to be consumed by the hook
@@ -2259,6 +2275,7 @@
// get deadlock. Here we enqueue work without outputs_.
workMetaList_.emplace_back(*work);
lastEnqueuedSeq_ = work->seq_;
+ lastEnqueuedWorkName_ = opTypeToString(work->opType_);
lastWorkListUpdateTime_ = std::chrono::steady_clock::now();
}
}
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
index ea90079..18201db 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -1072,11 +1072,23 @@
// initialized to be -1 to indicate no collective has been enqueued
int64_t lastEnqueuedSeq_{-1};
+ // the name of the last collective enqueued into workMetaList_
+ std::string lastEnqueuedWorkName_;
+
+ // the sequential number of the last colletive started as the kernal
+ int64_t lastStartedSeq_{-1};
+
+ // the name of the last collective started as the kernal
+ std::string lastStartedWorkName_;
+
// the sequential number of the last colletive completed marked by
// the watchdog thread
// initialized to be -1 to indicate no collective has been completed
int64_t lastCompletedSeq_{-1};
+ // the name of the last collective completed
+ std::string lastCompletedWorkName_;
+
std::exception_ptr watchDogException_ = nullptr;
size_t uid_;