log newly added construction and runtime stats at selected iterations (#51394)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51394

Log the newly added construction and runtime stats at a fixed set of selected iterations (10, 20, 100, and 1000).
ghstack-source-id: 121934040

Test Plan: unit tests

Reviewed By: SciPioneer

Differential Revision: D26161885

fbshipit-source-id: add6e02c1a03e6f74f08b9a9aecf90fa81631d60
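
For reference, below is a minimal standalone sketch of the fixed-iteration sampling check that the logger.cpp change introduces; the helper name shouldLogAtIteration and the main() driver are illustrative stand-ins and are not part of this change:

    // Sketch only: mirrors the std::find membership check over a fixed
    // array of logging iterations used by Logger::set_runtime_stats_and_log.
    #include <algorithm>
    #include <cstdio>
    #include <iterator>

    namespace {
    // Same fixed sampling points as LoggingIterations in logger.cpp.
    const int kLoggingIterations[] = {10, 20, 100, 1000};

    // Returns true when stats should be logged at this iteration count.
    bool shouldLogAtIteration(long num_iterations) {
      return std::find(
                 std::begin(kLoggingIterations),
                 std::end(kLoggingIterations),
                 num_iterations) != std::end(kLoggingIterations);
    }
    } // namespace

    int main() {
      for (long iter = 1; iter <= 1000; ++iter) {
        if (shouldLogAtIteration(iter)) {
          // In the real Logger, LogPyTorchDDPUsage(*ddp_logging_data_) is
          // called here with the accumulated construction and runtime stats.
          std::printf("would log DDP stats at iteration %ld\n", iter);
        }
      }
      return 0;
    }

The actual change keeps this check inline in set_runtime_stats_and_log rather than factoring it into a helper.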
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 445a945..872d1df 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -302,8 +302,8 @@
           py::arg("broadcast_buffers"),
           py::call_guard<py::gil_scoped_release>())
         .def(
-          "set_runtime_stats",
-          &::c10d::Logger::set_runtime_stats,
+          "set_runtime_stats_and_log",
+          &::c10d::Logger::set_runtime_stats_and_log,
           py::call_guard<py::gil_scoped_release>())
         .def(
           "get_ddp_logging_data",
diff --git a/torch/lib/c10d/logger.cpp b/torch/lib/c10d/logger.cpp
index 886aff4..79528d7 100644
--- a/torch/lib/c10d/logger.cpp
+++ b/torch/lib/c10d/logger.cpp
@@ -2,6 +2,10 @@
 
 namespace c10d {
 
+// When training runs at these iterations, log the runtime
+// stats.
+const int LoggingIterations[] = {10, 20, 100, 1000};
+
 namespace {
 
 const int kMilliSecondToNanosSecond = 1000000;
@@ -132,7 +136,7 @@
 }
 #endif
 
-void Logger::set_runtime_stats() {
+void Logger::set_runtime_stats_and_log() {
   // Sync with reducer's data
   std::lock_guard<std::mutex> lock(reducer_->mutex_);
   // Set runtime stats at the sampling iterations.
@@ -183,7 +187,7 @@
       return;
     }
 
-    // set_runtime_stats is called at the beginning of forward call,
+    // set_runtime_stats_and_log is called at the beginning of forward call,
     // when it is cheap to synchronize the cuda events of previous iteration,
     // as mostly all cuda operations are finished in previous iteration.
     reducer_->gpu_timer_.forward_start.synchronize();
@@ -229,6 +233,17 @@
         reducer_->cpu_timer_.backward_comm_start_time,
         reducer_->cpu_timer_.backward_compute_end_time);
   }
+  // Log runtime (e.g. avg performance) stats early in training and again
+  // after larger numbers of iterations. The choice of 10/20/100/1000 is
+  // not scientific; it assumes most applications run for at least 10
+  // iterations. The stats have smaller variance when they are sampled
+  // at a larger num_iterations_stats_recorded_.
+  if (std::find(
+          std::begin(LoggingIterations),
+          std::end(LoggingIterations),
+          num_iterations_stats_recorded_) != std::end(LoggingIterations)) {
+    LogPyTorchDDPUsage(*ddp_logging_data_);
+  }
 }
 
 c10::DDPLoggingData Logger::get_ddp_logging_data() {
diff --git a/torch/lib/c10d/logger.hpp b/torch/lib/c10d/logger.hpp
index dae4c55..50c28be 100644
--- a/torch/lib/c10d/logger.hpp
+++ b/torch/lib/c10d/logger.hpp
@@ -44,7 +44,7 @@
   // single device program and single device module right now.
   // TODO to support single process multiple devices and multi device modules,
   // events need to be created and recorded on multiple devices.
-  void set_runtime_stats();
+  void set_runtime_stats_and_log();
 
  private:
   // ddp_logging_data_ is used to hold all the ddp related logging
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index a865be3..7177b75 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -678,7 +678,7 @@
     def forward(self, *inputs, **kwargs):
         self.reducer.save_thread_local_state()
         if torch.is_grad_enabled() and self.require_backward_grad_sync:
-            self.logger.set_runtime_stats()
+            self.logger.set_runtime_stats_and_log()
             self.reducer.prepare_for_forward()
         if self.ddp_uneven_inputs_config.ddp_join_enabled:
             ones = torch.ones(