[c10d] Control logging C++ stack traces with a flag (#133490)

Summary:
Logging C++ stack traces on exception occasionally races with the shutdown process. It isn't safe, and we've seen SIGSEGVs in the field; these crashes prevent flight recorder dumps from completing.

For now, default this dumping to `true` and provide a knob, `TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN`, so the behavior can be disabled in production if needed.
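
To make the default-on behavior concrete, here is a minimal, self-contained sketch of how such a knob is read. `getCvarBoolSketch` is a hypothetical stand-in that mirrors the semantics of c10d's `getCvarBool` (first matching environment variable wins, otherwise the default), not the actual implementation:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for c10d's getCvarBool: return the boolean value of
// the first environment variable in `names` that is set, else `def`.
bool getCvarBoolSketch(const std::vector<std::string>& names, bool def) {
  for (const auto& name : names) {
    if (const char* val = std::getenv(name.c_str())) {
      const std::string v(val);
      return v == "1" || v == "y" || v == "Y" || v == "true" || v == "TRUE";
    }
  }
  return def;
}

int main() {
  const std::vector<std::string> knob = {
      "TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN"};
  // Unset: falls back to true, i.e. the pre-existing dumping behavior.
  std::cout << getCvarBoolSketch(knob, /*def=*/true) << '\n'; // prints 1
  // Exported as 0 (e.g. by a production launcher): dumping is disabled.
  setenv("TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN", "0", /*overwrite=*/1);
  std::cout << getCvarBoolSketch(knob, /*def=*/true) << '\n'; // prints 0
}
```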

Test Plan:
Tested locally on a job named `torchx-chirag_test_run` to make sure the JK (JustKnobs) setting was honored by the code.
Dumping was correctly disabled on my test job; see `TORCH_NCCL_LOG_CPP_STACK_ON_EXCEPTION: 0` in the log below. (The variable appears under an earlier name in this log; the final diff uses `TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN`.)

```
] [trainer2]:I0814 11:21:20.152419  3708 ProcessGroupNCCL.cpp:874] [PG ID 0PG GUID 0 Rank 10] ProcessGroupNCCL environments: NCCL version: 2.20.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 1, TORCH_NCCL_DUMP_ON_TIMEOUT: 1, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK: 0, TORCH_NCCL_ENABLE_MONITORING: 0, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 480, TORCH_NCCL_TRACE_BUFFER_SIZE: 2000, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0, TORCH_NCCL_LOG_CPP_STACK_ON_EXCEPTION: 0
```
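
For reference, a simplified, self-contained sketch of how the flag gates the dump path; the real check is the one-line change in ProcessGroupNCCL.cpp shown in the diff below, and `maybeDumpCppStacks` plus the inline dumper here are hypothetical:

```cpp
#include <functional>
#include <iostream>
#include <optional>
#include <string>

// Hypothetical type for the registered trace dumper: it receives a callback
// that is invoked once per stack-trace line.
using LineSink = std::function<void(const std::string&)>;
using DumpFn = std::function<void(LineSink)>;

// When the knob is off, the dumper is never invoked, so shutdown cannot
// race with C++ trace collection (the SIGSEGVs described in the summary).
void maybeDumpCppStacks(bool logCppStackOnUncleanShutdown,
                        const std::optional<DumpFn>& cpp_dumper) {
  if (logCppStackOnUncleanShutdown && cpp_dumper.has_value()) {
    std::cout << "Dumping c++ stacktraces:\n";
    (*cpp_dumper)([](const std::string& line) { std::cout << line << '\n'; });
  }
}

int main() {
  std::optional<DumpFn> dumper = [](LineSink emit) { emit("frame #0: ..."); };
  maybeDumpCppStacks(false, dumper); // knob off: no-op, no race
  maybeDumpCppStacks(true, dumper);  // knob on (default): traces are logged
}
```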

Differential Revision: D61283335

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133490
Approved by: https://github.com/fduwjj
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 7821c2e..56c0d37 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -795,6 +795,9 @@
   // both timeout and other errors.
   dumpOnException_ = getCvarBool(TORCH_NCCL_DUMP_ON_TIMEOUT, false) ||
       (dist_debug_level_ >= DebugLevel::Detail);
+  // logging C++ stack isn't safe. Introduce a variable to control it.
+  logCppStackOnUncleanShutdown_ =
+      getCvarBool(TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN, true);
   enableNanCheck_ = getCvarBool(TORCH_NCCL_NAN_CHECK, false);
   heartbeat_ = 1ULL;
   monitorThreadEnabled_.store(getCvarBool(TORCH_NCCL_ENABLE_MONITORING, true));
@@ -887,7 +890,9 @@
             << ", TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: " << heartbeatTimeoutInSec_
             << ", TORCH_NCCL_TRACE_BUFFER_SIZE: " << ncclTraceBufferSize_
             << ", TORCH_NCCL_COORD_CHECK_MILSEC: " << coordCheckIntervalMilSec_
-            << ", TORCH_NCCL_NAN_CHECK: " << enableNanCheck_;
+            << ", TORCH_NCCL_NAN_CHECK: " << enableNanCheck_
+            << ", TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN: "
+            << logCppStackOnUncleanShutdown_;
 
   if (options_->global_ranks_in_group.empty()) {
     this->globalRankStart = 0;
@@ -1426,7 +1431,7 @@
   LOG(ERROR) << errorMsg;
 
   auto& cpp_dumper = get_cpp_trace_dumper();
-  if (cpp_dumper.has_value()) {
+  if (logCppStackOnUncleanShutdown_ && cpp_dumper.has_value()) {
     LOG(INFO) << "Dumping c++ stacktraces:";
     cpp_dumper.value()([](const std::string& line) { LOG(INFO) << line; });
   }
@@ -1455,7 +1460,6 @@
       LOG(ERROR)
           << "Could not acquire GIL within 300 ms on exit, possible GIL induced hang";
     }
-    LOG(INFO) << "Could acquire GIL on exit";
   } else {
     LOG(INFO)
         << "GIL checker was not registered, perhaps this is a no-python build?";
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
index 315c8d8..149a44e 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -106,6 +106,10 @@
 static std::vector<std::string> TORCH_NCCL_COORD_CHECK_MILSEC = {
     "TORCH_NCCL_COORD_CHECK_MILSEC"};
 
+// Whether to log C++ stack traces on unclean shutdown (default true)
+static std::vector<std::string> TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN = {
+    "TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN"};
+
 static std::vector<std::string> TORCH_NCCL_NAN_CHECK = {"TORCH_NCCL_NAN_CHECK"};
 
 constexpr const char* NCCL_BACKEND_NAME = "nccl";
@@ -1082,6 +1086,9 @@
   // Whether or not to enable nan check for input tensors to collectives.
   bool enableNanCheck_;
 
+  // Whether or not to print C++ stack traces to logs on unclean shutdown.
+  bool logCppStackOnUncleanShutdown_;
+
   // Whether or not to create start CUDAEvent and enable timing for start
   // and end events. Note that enableTiming_ is always true if desyncDebug_
   // is set to true.