PyTorch ThroughputBenchmark: fix inaccuracy in number of iterations reporting (#22292)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/22292
Because we use an atomic fetch_add to decide whether a thread should
finish, the last (failed) attempt on each thread must not be counted as
an iteration. As a result, the total number of iterations is now exactly
the value the user sets via config.num_iters.
When running a unit test, the exact number of iterations is now reported.
Differential Revision: D16023963
fbshipit-source-id: 3b12ee17276628ecd7b0979f28cd6deb777a1543
diff --git a/torch/csrc/utils/throughput_benchmark-inl.h b/torch/csrc/utils/throughput_benchmark-inl.h
index b07242b..e14c041 100644
--- a/torch/csrc/utils/throughput_benchmark-inl.h
+++ b/torch/csrc/utils/throughput_benchmark-inl.h
@@ -50,7 +50,7 @@
int64_t initialized{0};
int64_t finished{0};
bool start{false};
- std::atomic<int64_t> num_forwards{0};
+ std::atomic<int64_t> num_attempted_iters{0};
std::vector<std::thread> callers;
for (auto thread_id = 0; thread_id < config.num_calling_threads;
@@ -71,7 +71,7 @@
}
}
LOG(INFO) << "Starting forward thread " << thread_id;
- while (num_forwards.fetch_add(1) < config.num_iters) {
+ while (num_attempted_iters.fetch_add(1) < config.num_iters) {
runOnce(std::move(thread_inputs[thread_id][input_iters[thread_id]]));
++input_iters[thread_id];
}
@@ -115,9 +115,12 @@
end_time - start_time)
.count() /
1000.0 / 1000.0;
+ // We use config.num_iters instead of num_attempted_iters as it is
+ // representative of the real work done. The last attempted iteration on each
+ // calling thread doesn't represent real work (i.e. running the model)
stats.latency_avg_ms =
- total_time_ms * config.num_calling_threads / num_forwards;
- stats.num_iters = num_forwards;
+ total_time_ms * config.num_calling_threads / config.num_iters;
+ stats.num_iters = config.num_iters;
for (auto& t : callers) {
t.join();