[Tensorpipe Agent] Implementing getMetrics with currently available metrics (#37980)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/37980
This implements `TensorPipeAgent::getMetrics` with the metrics that are currently available. Other metrics, such as Client/Server Active Calls, will be added once timeouts are implemented.
ghstack-source-id: 103624005
Test Plan: CI
Differential Revision: D21439184
fbshipit-source-id: 8a15df58cc23cdf954e604c0f806877ba111e0a6
diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
index b5c0b0a..33b00e9 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -20,6 +20,8 @@
constexpr long kToMilliseconds = 1000;
const std::string kGilAverageWaitTime = "agent.gil_average_wait_time_us";
+// Metric keys under which TensorPipeAgent::getMetrics() reports
+// threadPool_.size() and threadPool_.numAvailable(), respectively.
+const std::string kThreadPoolSize = "agent.thread_pool_size";
+const std::string kNumIdleThreads = "agent.num_idle_threads";
////////////////////////// MetricsTracker /////////////////////////////////
@@ -420,6 +422,25 @@
}
#endif
+// Builds a snapshot of the agent's metrics as metric-name -> stringified value.
+//
+// Always reported:
+//   - kThreadPoolSize: threadPool_.size()
+//   - kNumIdleThreads: threadPool_.numAvailable()
+// Reported only when isGILProfilingEnabled() is true:
+//   - kGilAverageWaitTime: average computed by the corresponding entry of
+//     timeSeriesMetrics_.
+std::unordered_map<std::string, std::string> TensorPipeAgent::getMetrics() {
+  std::unordered_map<std::string, std::string> result;
+  result[kThreadPoolSize] = c10::to_string(threadPool_.size());
+  result[kNumIdleThreads] = c10::to_string(threadPool_.numAvailable());
+
+  // Without GIL profiling there are no time series metrics to report.
+  if (!isGILProfilingEnabled()) {
+    return result;
+  }
+
+  // The GIL wait time is currently the only time series metric. Hold
+  // metricsMutex_ just long enough to read its average; the string
+  // conversion and map insertion happen after the lock is released.
+  const auto gilWaitAverage = [this] {
+    std::lock_guard<std::mutex> guard(metricsMutex_);
+    return timeSeriesMetrics_[kGilAverageWaitTime]->computeAverage();
+  }();
+  result[kGilAverageWaitTime] = c10::to_string(gilWaitAverage);
+
+  return result;
+}
+
void TensorPipeAgent::addGilWaitTime(
const std::chrono::microseconds gilWaitTime) {
std::lock_guard<std::mutex> lock(metricsMutex_);
diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.h b/torch/csrc/distributed/rpc/tensorpipe_agent.h
index bb2fb68..b979b37 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.h
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.h
@@ -72,10 +72,7 @@
const WorkerInfo& getWorkerInfo(worker_id_t workerId) const override;
std::vector<WorkerInfo> getWorkerInfos() const override;
- std::unordered_map<std::string, std::string> getMetrics() override {
- std::unordered_map<std::string, std::string> metrics;
- return metrics;
- }
+ // Returns the agent's metrics as a map from metric name to stringified
+ // value. Declared here only; the definition is in tensorpipe_agent.cpp.
+ std::unordered_map<std::string, std::string> getMetrics() override;
void addGilWaitTime(const std::chrono::microseconds gilWaitTime) override;