[c10d] Control logging C++ stack traces with a flag (#133490)

Summary:
Logging C++ stack traces on exception occasionally races with the shutdown process. It isn't safe, and we've seen SIGSEGVs in the field; these crashes prevent flight recorder dumps from completing.

For now, default this dumping to `true` and provide a knob, `TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN`, so the behavior can be disabled in production if needed.
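
To make the default-on behavior concrete, here is a minimal, self-contained sketch of how such a knob is read. `getCvarBoolSketch` is a hypothetical stand-in that mirrors the semantics of c10d's `getCvarBool` (first matching environment variable wins, otherwise the default), not the actual implementation:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for c10d's getCvarBool: return the boolean value of
// the first environment variable in `names` that is set, else `def`.
bool getCvarBoolSketch(const std::vector<std::string>& names, bool def) {
  for (const auto& name : names) {
    if (const char* val = std::getenv(name.c_str())) {
      const std::string v(val);
      return v == "1" || v == "y" || v == "Y" || v == "true" || v == "TRUE";
    }
  }
  return def;
}

int main() {
  const std::vector<std::string> knob = {
      "TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN"};
  // Unset: falls back to true, i.e. the pre-existing dumping behavior.
  std::cout << getCvarBoolSketch(knob, /*def=*/true) << '\n'; // prints 1
  // Exported as 0 (e.g. by a production launcher): dumping is disabled.
  setenv("TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN", "0", /*overwrite=*/1);
  std::cout << getCvarBoolSketch(knob, /*def=*/true) << '\n'; // prints 0
}
```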

Test Plan:
Tested locally on a job named `torchx-chirag_test_run` to make sure the JK (JustKnobs) setting was honored by the code.
Dumping was correctly disabled on my test job; see `TORCH_NCCL_LOG_CPP_STACK_ON_EXCEPTION: 0` in the log below. (The variable appears under an earlier name in this log; the final diff uses `TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN`.)

```
] [trainer2]:I0814 11:21:20.152419  3708 ProcessGroupNCCL.cpp:874] [PG ID 0PG GUID 0 Rank 10] ProcessGroupNCCL environments: NCCL version: 2.20.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 1, TORCH_NCCL_DUMP_ON_TIMEOUT: 1, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK: 0, TORCH_NCCL_ENABLE_MONITORING: 0, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 480, TORCH_NCCL_TRACE_BUFFER_SIZE: 2000, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0, TORCH_NCCL_LOG_CPP_STACK_ON_EXCEPTION: 0
```
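
For reference, a simplified, self-contained sketch of how the flag gates the dump path; the real check is the one-line change in ProcessGroupNCCL.cpp shown in the diff below, and `maybeDumpCppStacks` plus the inline dumper here are hypothetical:

```cpp
#include <functional>
#include <iostream>
#include <optional>
#include <string>

// Hypothetical type for the registered trace dumper: it receives a callback
// that is invoked once per stack-trace line.
using LineSink = std::function<void(const std::string&)>;
using DumpFn = std::function<void(LineSink)>;

// When the knob is off, the dumper is never invoked, so shutdown cannot
// race with C++ trace collection (the SIGSEGVs described in the summary).
void maybeDumpCppStacks(bool logCppStackOnUncleanShutdown,
                        const std::optional<DumpFn>& cpp_dumper) {
  if (logCppStackOnUncleanShutdown && cpp_dumper.has_value()) {
    std::cout << "Dumping c++ stacktraces:\n";
    (*cpp_dumper)([](const std::string& line) { std::cout << line << '\n'; });
  }
}

int main() {
  std::optional<DumpFn> dumper = [](LineSink emit) { emit("frame #0: ..."); };
  maybeDumpCppStacks(false, dumper); // knob off: no-op, no race
  maybeDumpCppStacks(true, dumper);  // knob on (default): traces are logged
}
```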

Differential Revision: D61283335

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133490
Approved by: https://github.com/fduwjj
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 7821c2e..56c0d37 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -795,6 +795,9 @@
   // both timeout and other errors.
   dumpOnException_ = getCvarBool(TORCH_NCCL_DUMP_ON_TIMEOUT, false) ||
       (dist_debug_level_ >= DebugLevel::Detail);
+  // logging C++ stack isn't safe. Introduce a variable to control it.
+  logCppStackOnUncleanShutdown_ =
+      getCvarBool(TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN, true);
   enableNanCheck_ = getCvarBool(TORCH_NCCL_NAN_CHECK, false);
   heartbeat_ = 1ULL;
   monitorThreadEnabled_.store(getCvarBool(TORCH_NCCL_ENABLE_MONITORING, true));
@@ -887,7 +890,9 @@
             << ", TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: " << heartbeatTimeoutInSec_
             << ", TORCH_NCCL_TRACE_BUFFER_SIZE: " << ncclTraceBufferSize_
             << ", TORCH_NCCL_COORD_CHECK_MILSEC: " << coordCheckIntervalMilSec_
-            << ", TORCH_NCCL_NAN_CHECK: " << enableNanCheck_;
+            << ", TORCH_NCCL_NAN_CHECK: " << enableNanCheck_
+            << ", TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN: "
+            << logCppStackOnUncleanShutdown_;
 
   if (options_->global_ranks_in_group.empty()) {
     this->globalRankStart = 0;
@@ -1426,7 +1431,7 @@
   LOG(ERROR) << errorMsg;
 
   auto& cpp_dumper = get_cpp_trace_dumper();
-  if (cpp_dumper.has_value()) {
+  if (logCppStackOnUncleanShutdown_ && cpp_dumper.has_value()) {
     LOG(INFO) << "Dumping c++ stacktraces:";
     cpp_dumper.value()([](const std::string& line) { LOG(INFO) << line; });
   }
@@ -1455,7 +1460,6 @@
       LOG(ERROR)
           << "Could not acquire GIL within 300 ms on exit, possible GIL induced hang";
     }
-    LOG(INFO) << "Could acquire GIL on exit";
   } else {
     LOG(INFO)
         << "GIL checker was not registered, perhaps this is a no-python build?";
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
index 315c8d8..149a44e 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -106,6 +106,10 @@
 static std::vector<std::string> TORCH_NCCL_COORD_CHECK_MILSEC = {
     "TORCH_NCCL_COORD_CHECK_MILSEC"};
 
+// Whether to log C++ stack traces on unclean shutdown (default true)
+static std::vector<std::string> TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN = {
+    "TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN"};
+
 static std::vector<std::string> TORCH_NCCL_NAN_CHECK = {"TORCH_NCCL_NAN_CHECK"};
 
 constexpr const char* NCCL_BACKEND_NAME = "nccl";
@@ -1082,6 +1086,9 @@
   // Whether or not to enable nan check for input tensors to collectives.
   bool enableNanCheck_;
 
+  // Whether or not to print C++ stack traces to logs on unclean shutdown.
+  bool logCppStackOnUncleanShutdown_;
+
   // Whether or not to create start CUDAEvent and enable timing for start
   // and end events. Note that enableTiming_ is always true if desyncDebug_
   // is set to true.