report per-iteration execution time (#27923)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27923

As title: when `--ai_pep_format` is set, report the execution time of every iteration (one `PyTorchObserver` JSON line per run) instead of only the averaged time.

Test Plan:
```
buck run caffe2/benchmarks/operator_benchmark/pt:add_test -- --iterations 3 --ai_pep_format true

# ----------------------------------------
# PyTorch/Caffe2 Operator Micro-benchmarks
# ----------------------------------------
# Tag : short

# Benchmarking PyTorch: add
PyTorchObserver {"type": "add_M64_N64_K64_cpu", "metric": "latency", "unit": "us", "value": "0.027768373489379883"}
PyTorchObserver {"type": "add_M64_N64_K64_cpu", "metric": "latency", "unit": "us", "value": "0.02661752700805664"}
PyTorchObserver {"type": "add_M64_N64_K64_cpu", "metric": "latency", "unit": "us", "value": "0.026746749877929688"}
...
```
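
Each `PyTorchObserver` line is a fixed prefix followed by a JSON payload, so downstream tooling can consume it with a few lines of Python. Note that, as of this diff, the `value` field carries the raw `time.time()` delta (i.e. seconds) even though `unit` is hard-coded to `"us"`. A minimal parsing sketch, assuming only the format shown above (`parse_observer_lines` is a hypothetical helper, not part of this PR):

```
import json

def parse_observer_lines(lines, prefix="PyTorchObserver "):
    """Yield the JSON payload of each observer line (hypothetical helper)."""
    for line in lines:
        if line.startswith(prefix):
            yield json.loads(line[len(prefix):])

sample = ('PyTorchObserver {"type": "add_M64_N64_K64_cpu", "metric": "latency", '
          '"unit": "us", "value": "0.027768373489379883"}')
for record in parse_observer_lines([sample]):
    print(record["type"], float(record["value"]))
```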

Reviewed By: hl475

Differential Revision: D17911718

fbshipit-source-id: 6fe28f2ab9ce1e0feabb5b822f04ff32dac977a9
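
The core of the change is the timing loop added to `PyTorchOperatorTestCase.run_forward` in the diff below. Stripped of the harness, the pattern is roughly the following (a standalone sketch; `op` and `test_name` stand in for the benchmark's forward callable and its generated test name):

```
import json
import time

def run_forward(op, num_runs, test_name, print_per_iter=False):
    """Run `op` num_runs times; optionally emit one observer line per run."""
    if not print_per_iter:
        for _ in range(num_runs):
            op()
        return
    time_series = []
    for _ in range(num_runs):
        start = time.time()
        op()
        time_series.append(time.time() - start)  # wall-clock seconds
    for iter_time in time_series:
        # One JSON record per iteration, in the AI-PEP observer format
        print("PyTorchObserver " + json.dumps({
            "type": test_name,
            "metric": "latency",
            "unit": "us",
            "value": str(iter_time),
        }))
```
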
diff --git a/benchmarks/operator_benchmark/benchmark_caffe2.py b/benchmarks/operator_benchmark/benchmark_caffe2.py
index 7d9254b..52bd3d4 100644
--- a/benchmarks/operator_benchmark/benchmark_caffe2.py
+++ b/benchmarks/operator_benchmark/benchmark_caffe2.py
@@ -4,7 +4,7 @@
 from __future__ import unicode_literals
 
 from caffe2.python import workspace
-from caffe2.python import core 
+from caffe2.python import core
 from caffe2.proto import caffe2_pb2
 import benchmark_core
 import benchmark_utils
@@ -37,22 +37,22 @@
         if device not in ['cuda', 'cpu']:
             raise ValueError("Missing attrs in configs")
 
-        if 'cuda' in device: 
+        if 'cuda' in device:
             self.dev = core.DeviceOption(caffe2_pb2.CUDA, 0)
-        else: 
+        else:
             self.dev = core.DeviceOption(caffe2_pb2.CPU)
         return self.dev
 
     def tensor(self, shapes, dtype='float32', device='cpu'):
         """ A wapper function to create C2 tensor filled with random data.
-            The name/label of the tensor is returned and it is available 
-            throughout the benchmark execution phase. 
-            Args: 
+            The name/label of the tensor is returned and it is available
+            throughout the benchmark execution phase.
+            Args:
                 shapes: an int or a sequence of ints defining the shape of the tensor
-                dtype: use the dtypes from numpy 
+                dtype: use the dtypes from numpy
                     (https://docs.scipy.org/doc/numpy/user/basics.types.html)
-            Return: 
-                C2 tensor of dtype 
+            Return:
+                the name of a C2 blob holding a tensor of the given dtype
         """
         blob_name = 'blob_' + str(Caffe2BenchmarkBase.tensor_index)
         dev = self._device_option(device)
@@ -65,14 +65,14 @@
         """ this is used to label the operator being benchmarked
         """
         if self.user_provided_name:
-            return self.user_provided_name 
+            return self.user_provided_name
         return self.__class__.__name__
 
-    def set_module_name(self, name): 
+    def set_module_name(self, name):
         self.user_provided_name = name
 
     def _value_to_str(self, value):
-        """ if value is bool, we will convert it to 0 and 1 
+        """ if value is bool, we will convert it to 0 and 1
         """
         ret = value
         if type(value) == bool:
@@ -83,7 +83,7 @@
         """ this is a globally unique name which can be used to
             label a specific test
         """
-        if name_type == "long": 
+        if name_type == "long":
             test_name_str = []
             for key in kargs:
                 value = kargs[key]
@@ -99,19 +99,19 @@
 
 
 class Caffe2OperatorTestCase(object):
-    """ This class includes all the information needed to benchmark an operator. 
-        op_bench: it's a user-defined class (child of Caffe2BenchmarkBase) 
+    """ This class includes all the information needed to benchmark an operator.
+        op_bench: it's a user-defined class (child of Caffe2BenchmarkBase)
         which includes the inputs and the operator, etc.
         test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
         When run_backward is false, the run_forward method will be executed, otherwise
-        run_backward method will be executed. 
+        the run_backward method will be executed.
     """
     def __init__(self, op_bench, test_config):
         self.op_bench = op_bench
         self.test_config = test_config
         self.framework = "Caffe2"
 
-    def run_forward(self, num_runs):
+    def run_forward(self, num_runs, print_per_iter=False):
         """ Run the forward path of an operator in a loop
         """
         with core.DeviceScope(self.op_bench.dev):
diff --git a/benchmarks/operator_benchmark/benchmark_core.py b/benchmarks/operator_benchmark/benchmark_core.py
index aa368e2..b2eed10 100644
--- a/benchmarks/operator_benchmark/benchmark_core.py
+++ b/benchmarks/operator_benchmark/benchmark_core.py
@@ -72,6 +72,7 @@
         self.max_iters = 1e6
         self.use_jit = args.use_jit
         self.num_runs = args.num_runs
+        self.print_per_iter = False
         if self.args.iterations:
             self.has_explicit_iteration_count = True
             self.iters = self.args.iterations
@@ -80,6 +81,10 @@
         if self.args.test_name is not None:
             self.args.tag_filter = None
 
+        if self.args.ai_pep_format:
+            self.print_per_iter = True
+
+
     def _print_header(self):
         DASH_LINE = '-' * 40
         print("# {}\n"
@@ -103,6 +108,8 @@
     def _print_perf_result(self, reported_run_time_us, test_case):
         if self.args.ai_pep_format:
             # Output for AI-PEP
+            # Per-iteration times are already printed in run_forward, so skip the averaged output
+            return
             test_name = '_'.join([test_case.framework, test_case.test_config.test_name])
             for run in range(self.num_runs):
                 print("{}Observer ".format(test_case.framework) + json.dumps(
@@ -147,26 +154,28 @@
                 has_explicit_iteration_count) and
                 curr_test_total_time > self.args.min_time_per_test)
 
-    def _launch_forward(self, test_case, iters):
+    def _launch_forward(self, test_case, iters, print_per_iter):
         """ Use Python's timeit module to measure execution time (unit: second).
         """
         func = test_case.run_forward
         if self.use_jit:
             func = test_case.run_jit_forward
-        forward_time = timeit.timeit(functools.partial(func, iters), number=1)
+        forward_time = timeit.timeit(functools.partial(func, iters, print_per_iter), number=1)
         return forward_time
 
-    def _launch_backward(self, test_case, iters):
+    def _launch_backward(self, test_case, iters, print_per_iter=False):
         """ This function runs forward path of an op to get an output. Then the backward path is executed
         and the execution time is reported
         """
-        test_case.run_forward(num_runs=1)
+        test_case.run_forward(num_runs=1, print_per_iter=False)
         if test_case.framework == "PyTorch":
             test_case._output_mean()
-        backward_time = timeit.timeit(functools.partial(test_case.run_backward, iters), number=1)
+        backward_time = timeit.timeit(functools.partial(test_case.run_backward, iters,
+                                                        print_per_iter),
+                                      number=1)
         return backward_time
 
-    def _measure_time(self, launch_test, test_case, iters):
+    def _measure_time(self, launch_test, test_case, iters, print_per_iter):
         """
             This function executes the operator for <iters> iterations and then looks at the time.
             If the result is not significant, the number of iterations is increased before rerunning.
@@ -178,7 +187,7 @@
             if self.args.wipe_cache:
                 torch.ops.operator_benchmark._clear_cache()
 
-            run_time_sec = launch_test(test_case, iters)
+            run_time_sec = launch_test(test_case, iters, print_per_iter)
             curr_test_total_time += run_time_sec
             # Analyze time after each run to decide if the result is stable
             results_are_significant = self._iteration_result_is_significant(
@@ -252,9 +261,10 @@
                 launch_func = self._launch_forward
 
             # Warmup
-            launch_func(test_case, self.args.warmup_iterations)
+            launch_func(test_case, self.args.warmup_iterations, print_per_iter=False)
             # Actual Execution
-            reported_time = [self._measure_time(launch_func, test_case, self.iters)
+            reported_time = [self._measure_time(launch_func, test_case,
+                                                self.iters, self.print_per_iter)
                              for _ in range(self.num_runs)]
 
             self._print_perf_result(reported_time, test_case)
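
The rest of the benchmark_core.py changes are plumbing: the flag travels from `--ai_pep_format` through `_measure_time` and `_launch_forward`/`_launch_backward` down to the test case, and warmup always passes `print_per_iter=False`. A condensed sketch of that flow (`Runner` is a stand-in for the real harness class, not its actual definition):

```
import functools
import timeit

class Runner(object):
    def __init__(self, ai_pep_format=False):
        # --ai_pep_format switches on per-iteration reporting
        self.print_per_iter = ai_pep_format

    def _launch_forward(self, test_case, iters, print_per_iter):
        # timeit still measures the total wall time of the whole loop;
        # per-iteration printing happens inside run_forward itself
        return timeit.timeit(
            functools.partial(test_case.run_forward, iters, print_per_iter),
            number=1)

    def run(self, test_case, iters, warmup_iters):
        # Warmup never prints per-iteration numbers
        self._launch_forward(test_case, warmup_iters, print_per_iter=False)
        return self._launch_forward(test_case, iters, self.print_per_iter)
```
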
diff --git a/benchmarks/operator_benchmark/benchmark_pytorch.py b/benchmarks/operator_benchmark/benchmark_pytorch.py
index fe558dc..6d13fdb 100644
--- a/benchmarks/operator_benchmark/benchmark_pytorch.py
+++ b/benchmarks/operator_benchmark/benchmark_pytorch.py
@@ -3,6 +3,8 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
+import time
+import json
 import benchmark_core
 import torch
 import cpp_extension # noqa
@@ -16,8 +18,8 @@
 
 class TorchBenchmarkBase(object):
     """ This is a base class used to create Pytorch operator benchmark.
-        module_name is the name of the operator being benchmarked. 
-        test_name is the name (it's created by concatenating all the 
+        module_name is the name of the operator being benchmarked.
+        test_name is the name (it's created by concatenating all the
         inputs) of a specific test
     """
 
@@ -32,12 +34,12 @@
 
     def auto_set(self):
         """ This is used to automatically set the require_grad for the backward patch.
-            It is implemented based on two counters. One counter to save the number of 
-            times init has been called. The other counter to save the number of times 
-            this function itself has been called. In the very first time init is called, 
-            this function counts how many inputs require gradient. In each of the 
-            following init calls, this function will return only one true value. 
-            Here is an example: 
+            It is implemented with two counters: one tracks the number of
+            times init has been called, the other the number of times this
+            function itself has been called. The first time init is called,
+            this function counts how many inputs require a gradient. In each of the
+            following init calls, it will return True for exactly one input.
+            Here is an example:
                 ...
                 self.v1 = torch.rand(M, N, K, requires_grad=self.auto_set())
                 self.v2 = torch.rand(M, N, K, requires_grad=self.auto_set())
@@ -49,23 +51,23 @@
         if self._pass_count == 0:
             self._num_inputs_require_grads += 1
             return True
-        else: 
+        else:
             self._auto_set_counter += 1
             return (self._pass_count == self._auto_set_counter)
 
     def forward(self):
-        pass 
+        pass
 
     def _wrap_forward(self, foo):
-        """ The function passed to JIT trace must have at least one argument, 
-            this function is to wrap the forward method to meet that requirement. 
+        """ The function passed to JIT trace must have at least one argument,
+            so this function wraps the forward method to meet that requirement.
             _consume op is used to avoid the dead-code-elimination optimization
-            in JIT. 
+            in JIT.
         """
         return torch.ops.operator_benchmark._consume(self.forward())
 
     def _generate_jit_forward_graph(self):
-        """ generate a graph for the forward function via tracing 
+        """ generate a graph for the forward function via tracing
         """
 
         func = torch.jit.trace(self._wrap_forward, torch.rand(1))
@@ -84,19 +86,19 @@
         """ this is used to label the operator being benchmarked
         """
         if self.user_given_name:
-            return self.user_given_name 
+            return self.user_given_name
         return self.__class__.__name__
 
-    def set_module_name(self, name): 
+    def set_module_name(self, name):
         self.user_given_name = name
 
     def test_name(self, **kargs):
-        """ this is a globally unique name which can be used to 
-            label a specific test 
+        """ this is a globally unique name which can be used to
+            label a specific test
         """
 
         # This is a list of attributes which will not be included
-        # in the test name. 
+        # in the test name.
         skip_key_list = ['device']
 
         test_name_str = []
@@ -111,12 +113,12 @@
 
 
 class PyTorchOperatorTestCase(object):
-    """ This class includes all the information needed to benchmark an operator. 
+    """ This class includes all the information needed to benchmark an operator.
         op_bench: it's a user-defined class (child of TorchBenchmarkBase)
         which includes the inputs and the operator, etc.
         test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
-        When run_backward is false, the run_forward method will be executed, 
-        When run_backward is true, run_forward_eager and _output_mean will be 
+        When run_backward is false, the run_forward method will be executed,
+        When run_backward is true, run_forward_eager and _output_mean will be
         executed to generate output. Then, run_backward will be executed.
     """
     def __init__(self, op_bench, test_config):
@@ -125,30 +127,48 @@
         self.place_holder_tensor = torch.ones(1)
         self.framework = "PyTorch"
 
-    def run_jit_forward(self, num_runs):
+    def run_jit_forward(self, num_runs, print_per_iter=False):
         """ Run the forward path of an op with JIT mode
         """
         if self.op_bench._jit_forward is None:
             self.op_bench._jit_forward = self.op_bench._generate_jit_forward_graph()
         self.op_bench._jit_forward(num_runs, self.place_holder_tensor)
 
-    def run_forward(self, num_runs):
+    def run_forward(self, num_runs, print_per_iter=False):
         """ Run the forward path of an op with eager mode
         """
-        for _ in range(num_runs):
-            self.output = self.op_bench.forward()
+        if print_per_iter:
+            time_series = []
+            for _ in range(num_runs):
+                start_time = time.time()
+                self.output = self.op_bench.forward()
+                end_time = time.time()
+                time_series.append(end_time - start_time)  # elapsed wall-clock time, in seconds
+
+            for iter_time in time_series:
+                print("PyTorchObserver " + json.dumps(
+                    {
+                        "type": self.test_config.test_name,
+                        "metric": "latency",
+                        "unit": "us",
+                        "value": str(iter_time),
+                    }
+                ))
+        else:
+            for _ in range(num_runs):
+                self.output = self.op_bench.forward()
 
     def _output_mean(self):
-        """ TODO (mingzhe): it is not necessary to sum up everything by myself, 
-            torch.autograd.backward do take a gradient tensor. By default, it 
-            is the same shape as your output tensor, with all 1s. 
-            Mathematically, it is the same as if the output is summed together. 
-            So we should be able to get ride of this method. 
+        """ TODO (mingzhe): it is not necessary to sum up everything by myself,
+            torch.autograd.backward do take a gradient tensor. By default, it
+            is the same shape as your output tensor, with all 1s.
+            Mathematically, it is the same as if the output is summed together.
+            So we should be able to get ride of this method.
             dummy function for gradient calculation
         """
         self.mean = self.output.mean()
 
-    def run_backward(self, num_runs):
+    def run_backward(self, num_runs, print_per_iter=False):
         """ Run the backward path of an op in many iterations
         """
         # TODO: can we use JIT here to reduce python overhead?
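
On the TODO in `_output_mean` above: the explicit reduction can indeed be dropped by passing a gradient tensor to backward, though for a non-scalar output the gradient must be supplied explicitly rather than being defaulted to ones. A minimal illustration of the equivalent formulation (not part of this PR):

```
import torch

x = torch.rand(3, 4, requires_grad=True)
out = x * 2

# Equivalent to out.sum().backward(): supply an all-ones gradient explicitly.
out.backward(torch.ones_like(out))
print(x.grad)  # all 2s, the same result as differentiating out.sum()
```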