PyTorch ThroughputBenchmark: fix inaccuracy in number of iterations reporting (#22292)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/22292

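Each calling thread uses an atomic fetch_add to decide whether it should
finish, so the final fetch_add on every thread is only a termination check
and does not correspond to a real iteration. It should not be counted. As a
result, the total number of iterations reported is now exactly what the user
sets via config.num_iters.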

Now, when running the unit test, the exact number of iterations is reported.

Differential Revision: D16023963

fbshipit-source-id: 3b12ee17276628ecd7b0979f28cd6deb777a1543
diff --git a/torch/csrc/utils/throughput_benchmark-inl.h b/torch/csrc/utils/throughput_benchmark-inl.h
index b07242b..e14c041 100644
--- a/torch/csrc/utils/throughput_benchmark-inl.h
+++ b/torch/csrc/utils/throughput_benchmark-inl.h
@@ -50,7 +50,7 @@
   int64_t initialized{0};
   int64_t finished{0};
   bool start{false};
-  std::atomic<int64_t> num_forwards{0};
+  std::atomic<int64_t> num_attempted_iters{0};
   std::vector<std::thread> callers;
 
   for (auto thread_id = 0; thread_id < config.num_calling_threads;
@@ -71,7 +71,7 @@
         }
       }
       LOG(INFO) << "Starting forward thread " << thread_id;
-      while (num_forwards.fetch_add(1) < config.num_iters) {
+      while (num_attempted_iters.fetch_add(1) < config.num_iters) {
         runOnce(std::move(thread_inputs[thread_id][input_iters[thread_id]]));
         ++input_iters[thread_id];
       }
@@ -115,9 +115,12 @@
                             end_time - start_time)
                             .count() /
       1000.0 / 1000.0;
+  // We use config.num_iters instead of num_attempted_iters as it is
+  // representative of the real work done. The last attempted iteration on each
+  // calling thread doesn't represent real work (i.e. running the model).
   stats.latency_avg_ms =
-      total_time_ms * config.num_calling_threads / num_forwards;
-  stats.num_iters = num_forwards;
+      total_time_ms * config.num_calling_threads / config.num_iters;
+  stats.num_iters = config.num_iters;
 
   for (auto& t : callers) {
     t.join();