log newly added construction and runtime stats at randomly selected iterations (#51394)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51394
log newly added construction and runtime stats at randomly selected iterations
ghstack-source-id: 121934040
Test Plan: unit tests
Reviewed By: SciPioneer
Differential Revision: D26161885
fbshipit-source-id: add6e02c1a03e6f74f08b9a9aecf90fa81631d60
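For reference, below is a minimal Python sketch (hypothetical names, not the actual c10d::Logger internals) of the milestone-based logging pattern this patch introduces in logger.cpp: runtime stats are recorded at the start of each sampled forward pass, and the accumulated record is pushed to the usage logger only when the number of recorded iterations hits one of a fixed set of milestones.

# Sketch only: RuntimeStatsLogger and its fields are illustrative,
# not the real c10d::Logger implementation.

# Iterations at which the accumulated stats are logged, mirroring
# LoggingIterations = {10, 20, 100, 1000} in torch/lib/c10d/logger.cpp.
LOGGING_ITERATIONS = (10, 20, 100, 1000)

class RuntimeStatsLogger:
    def __init__(self):
        self.num_iterations_stats_recorded = 0
        self.ddp_logging_data = {}

    def set_runtime_stats_and_log(self, iteration_stats):
        # Called at the beginning of each forward pass, like
        # self.logger.set_runtime_stats_and_log() in distributed.py below.
        self.num_iterations_stats_recorded += 1
        self.ddp_logging_data.update(iteration_stats)
        # Python equivalent of the std::find over LoggingIterations.
        if self.num_iterations_stats_recorded in LOGGING_ITERATIONS:
            self.log_usage()

    def log_usage(self):
        # Stand-in for LogPyTorchDDPUsage(*ddp_logging_data_).
        print(f"iteration {self.num_iterations_stats_recorded}: "
              f"{self.ddp_logging_data}")

# Example: over 100 training steps, stats are logged at steps 10, 20, and 100.
logger = RuntimeStatsLogger()
for step in range(1, 101):
    logger.set_runtime_stats_and_log({"avg_forward_time_ms": 12.3})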
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 445a945..872d1df 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -302,8 +302,8 @@
py::arg("broadcast_buffers"),
py::call_guard<py::gil_scoped_release>())
.def(
- "set_runtime_stats",
- &::c10d::Logger::set_runtime_stats,
+ "set_runtime_stats_and_log",
+ &::c10d::Logger::set_runtime_stats_and_log,
py::call_guard<py::gil_scoped_release>())
.def(
"get_ddp_logging_data",
diff --git a/torch/lib/c10d/logger.cpp b/torch/lib/c10d/logger.cpp
index 886aff4..79528d7 100644
--- a/torch/lib/c10d/logger.cpp
+++ b/torch/lib/c10d/logger.cpp
@@ -2,6 +2,10 @@
namespace c10d {
+// Log the runtime stats when training reaches one of these
+// iteration counts.
+const int LoggingIterations[] = {10, 20, 100, 1000};
+
namespace {
const int kMilliSecondToNanosSecond = 1000000;
@@ -132,7 +136,7 @@
}
#endif
-void Logger::set_runtime_stats() {
+void Logger::set_runtime_stats_and_log() {
// Sync with reducer's data
std::lock_guard<std::mutex> lock(reducer_->mutex_);
// Set runtime stats at the sampling iterations.
@@ -183,7 +187,7 @@
return;
}
- // set_runtime_stats is called at the beginning of forward call,
+ // set_runtime_stats_and_log is called at the beginning of the forward call,
// when it is cheap to synchronize the cuda events of previous iteration,
// as mostly all cuda operations are finished in previous iteration.
reducer_->gpu_timer_.forward_start.synchronize();
@@ -229,6 +233,17 @@
reducer_->cpu_timer_.backward_comm_start_time,
reducer_->cpu_timer_.backward_compute_end_time);
}
+ // Log runtime (e.g. avg performance) stats at the beginning and also
+ // after a larger number of iterations. The milestones in LoggingIterations
+ // (10/20/100/1000) are not chosen scientifically; they assume most
+ // applications run for at least 10 iterations. The stats have smaller
+ // variance when the selected num_iterations_ is larger.
+ if (std::find(
+ std::begin(LoggingIterations),
+ std::end(LoggingIterations),
+ num_iterations_stats_recorded_) != std::end(LoggingIterations)) {
+ LogPyTorchDDPUsage(*ddp_logging_data_);
+ }
}
c10::DDPLoggingData Logger::get_ddp_logging_data() {
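(Side note, a sketch rather than a documented API: the Logger binding in init.cpp also exposes get_ddp_logging_data, so the accumulated record can be inspected from Python through the DDP module's internal logger handle, the same self.logger used in the distributed.py hunk further below. The fields of the returned DDPLoggingData object are not part of this diff, so the helper below just returns it as-is.)

# Sketch only: ddp_model.logger is a DDP internal, not a public API.
def dump_ddp_logging_data(ddp_model):
    # get_ddp_logging_data is the pybind method registered in init.cpp above;
    # the returned object mirrors c10::DDPLoggingData.
    return ddp_model.logger.get_ddp_logging_data()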
diff --git a/torch/lib/c10d/logger.hpp b/torch/lib/c10d/logger.hpp
index dae4c55..50c28be 100644
--- a/torch/lib/c10d/logger.hpp
+++ b/torch/lib/c10d/logger.hpp
@@ -44,7 +44,7 @@
// single device program and single device module right now.
// TODO to support single process multiple devices and multi device modules,
// events need to be created and recorded on multiple devices.
- void set_runtime_stats();
+ void set_runtime_stats_and_log();
private:
// ddp_logging_data_ is used to hold all the ddp related logging
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index a865be3..7177b75 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -678,7 +678,7 @@
def forward(self, *inputs, **kwargs):
self.reducer.save_thread_local_state()
if torch.is_grad_enabled() and self.require_backward_grad_sync:
- self.logger.set_runtime_stats()
+ self.logger.set_runtime_stats_and_log()
self.reducer.prepare_for_forward()
if self.ddp_uneven_inputs_config.ddp_join_enabled:
ones = torch.ones(