[C10D] Make all PGNCCL LOG usages use logPrefix() (#116060)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/116060
Approved by: https://github.com/fduwjj
ghstack dependencies: #116059
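
For context, the change applies one mechanical pattern throughout ProcessGroupNCCL.cpp: every LOG statement gains a logPrefix() prepend so that interleaved log lines from multiple ranks and process groups stay attributable. Below is a minimal, self-contained sketch of that pattern; the ProcessGroupSketch class, the "[PG ... Rank ...]" prefix format, and the use of std::cerr in place of glog's LOG macros are illustrative assumptions, not the actual ProcessGroupNCCL implementation.

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Sketch only: stands in for ProcessGroupNCCL to illustrate the logging pattern.
class ProcessGroupSketch {
 public:
  ProcessGroupSketch(int pgId, int rank) {
    // Build the prefix once so every log call can reuse it cheaply.
    std::ostringstream oss;
    oss << "[PG " << pgId << " Rank " << rank << "] ";
    logPrefix_ = oss.str();
  }

  // Every log statement prepends this so logs from different ranks
  // and process groups remain distinguishable when interleaved.
  const std::string& logPrefix() const {
    return logPrefix_;
  }

  void reportError(const std::string& msg) const {
    // Before this PR: LOG(ERROR) << msg;
    // After this PR:  LOG(ERROR) << logPrefix() << msg;
    std::cerr << logPrefix() << msg << std::endl;
  }

 private:
  std::string logPrefix_;
};

int main() {
  ProcessGroupSketch pg(/*pgId=*/0, /*rank=*/3);
  pg.reportError("Some NCCL operations have failed or timed out.");
  // Output: [PG 0 Rank 3] Some NCCL operations have failed or timed out.
  return 0;
}
```

Building the prefix once at construction and reusing it keeps the per-log-call overhead to a single extra stream insertion.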
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 5f460ad..026576a 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -553,13 +553,13 @@
"Some NCCL operations have failed or timed out. Due to the ",
"asynchronous nature of CUDA kernels, subsequent GPU operations ",
"might run on corrupted/incomplete data.");
- LOG(ERROR) << exceptionMsg;
+ LOG(ERROR) << logPrefix() << exceptionMsg;
C10_LOG_API_USAGE_ONCE("ProcessGroupNCCL.WorkNCCL.handleException");
if (SHOULD_TEAR_DOWN(errorHandling)) {
auto tearDownMsg = c10::str(
"To avoid data inconsistency, we are taking the entire process down.");
- LOG(ERROR) << tearDownMsg;
+ LOG(ERROR) << logPrefix() << tearDownMsg;
std::rethrow_exception(exception_);
}
}
@@ -872,7 +872,8 @@
void ProcessGroupNCCL::eagerConnectSingleDevice(at::Device device) {
std::vector<at::Device> rankDevices = {device};
const auto key = getKeyFromDevices(rankDevices);
- LOG(INFO) << "Eagerly connecting nccl backend with device " << device;
+ LOG(INFO) << logPrefix() << "Eagerly connecting nccl backend with device "
+ << device;
getNCCLComm(key, rankDevices, OpType::ALLREDUCE);
}
@@ -883,8 +884,8 @@
#ifdef NCCL_HAS_COMM_SPLIT
std::vector<at::Device> rankDevices = {device};
const auto key = getKeyFromDevices(rankDevices);
- LOG(INFO) << "Performing nocolor split on backend device " << device
- << ", key " << key << ", i am " << this;
+ LOG(INFO) << logPrefix() << "Performing nocolor split on backend device "
+ << device << ", key " << key << ", i am " << this;
auto comm = getNCCLComm(key, rankDevices, OpType::ALLREDUCE);
TORCH_CHECK_WITH(
DistBackendError,
@@ -1175,7 +1176,7 @@
// output file from an earlier call before a later call overwrites it.
static std::mutex writeDebugInfoMutex;
std::lock_guard<std::mutex> lock(writeDebugInfoMutex);
- LOG(ERROR) << "ProcessGroupNCCL preparing to dump debug info.";
+ LOG(ERROR) << logPrefix() << "ProcessGroupNCCL preparing to dump debug info.";
if (ncclTraceBufferSize_ > 0) {
// We dump nccl trace into local disk by default and users can register
// their customized writer by inheriting `DebugInfoWriter` via
@@ -1196,7 +1197,7 @@
void ProcessGroupNCCL::terminateProcess(std::string errMsg) {
// Logging with `FATAL`, after errMsg printed, it calls `std::abort()`
// to terminate the program execution.
- LOG(FATAL) << errMsg;
+ LOG(FATAL) << logPrefix() << errMsg;
}
void ProcessGroupNCCL::heartbeatMonitor() {
@@ -1501,7 +1502,7 @@
if (desyncDebug_) {
auto desyncMsg = getNCCLWatchdogDebugInfo();
- LOG(ERROR) << desyncMsg;
+ LOG(ERROR) << logPrefix() << desyncMsg;
}
if (dumpOnTimeout_) {
@@ -1510,10 +1511,12 @@
}
} catch (const std::exception& e) {
- LOG(ERROR) << "Failed to retrieve TORCH_NCCL_DESYNC_DEBUG report. "
+ LOG(ERROR) << logPrefix()
+ << "Failed to retrieve TORCH_NCCL_DESYNC_DEBUG report. "
<< " Please file an issue. Error: " << e.what();
} catch (...) {
LOG(ERROR)
+ << logPrefix()
<< "Failed to rerieve TORCH_NCCL_DESYNC_DEBUG report with unknown error."
<< " Please file an issue.";
}
@@ -1779,7 +1782,7 @@
if (bound_device_id_) {
for (const auto& device : devices) {
if (*bound_device_id_ != device) {
- LOG(ERROR) << "Tensor found on device " << device
+ LOG(ERROR) << logPrefix() << "Tensor found on device " << device
<< " but backend constrained to " << *bound_device_id_;
C10_THROW_ERROR(
DistBackendError,
@@ -1932,7 +1935,8 @@
// At this point NCCL should have been initialized, hence we can accurately
// get the env value even if NCCL sets it by reading from nccl.conf file
if (getRank() == 0) {
- LOG(INFO) << "NCCL_DEBUG: " << getCvarString({"NCCL_DEBUG"}, "N/A");
+ LOG(INFO) << logPrefix()
+ << "NCCL_DEBUG: " << getCvarString({"NCCL_DEBUG"}, "N/A");
}
// See [Group Start/End Note]
@@ -3521,14 +3525,14 @@
// ensure that each process is on a different GPU
auto numGPUs = at::cuda::getNumGPUs();
int16_t deviceIdx = static_cast<int16_t>(rank_ % numGPUs);
- LOG(INFO) << c10::str(
- "Rank ",
- this->getRank(),
- " using GPU ",
- deviceIdx,
- " to perform barrier as devices used by this process are currently unknown. ",
- "This can potentially cause a hang if this rank to GPU mapping is incorrect.",
- "Specify device_ids in barrier() to force use of a particular device.");
+ LOG(INFO)
+ << logPrefix()
+ << c10::str(
+ " using GPU ",
+ deviceIdx,
+ " to perform barrier as devices used by this process are currently unknown. ",
+ "This can potentially cause a hang if this rank to GPU mapping is incorrect.",
+ "Specify device_ids in barrier() to force use of a particular device.");
devices.emplace_back(guessDeviceForRank());
} else {
for (auto usedDeviceIdx : usedDeviceIdxs_) {