Report per-iteration execution time (#27923)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27923
Report each iteration's execution time (instead of only the average latency) when the --ai_pep_format flag is set, emitting one PyTorchObserver line per iteration.
Test Plan:
```
buck run caffe2/benchmarks/operator_benchmark/pt:add_test -- --iterations 3 --ai_pep_format true
# ----------------------------------------
# PyTorch/Caffe2 Operator Micro-benchmarks
# ----------------------------------------
# Tag : short
# Benchmarking PyTorch: add
PyTorchObserver {"type": "add_M64_N64_K64_cpu", "metric": "latency", "unit": "us", "value": "0.027768373489379883"}
PyTorchObserver {"type": "add_M64_N64_K64_cpu", "metric": "latency", "unit": "us", "value": "0.02661752700805664"}
PyTorchObserver {"type": "add_M64_N64_K64_cpu", "metric": "latency", "unit": "us", "value": "0.026746749877929688"}
...
```
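For context, here is a minimal sketch (not part of this PR) of how a consumer might pull the per-iteration latencies back out of the output above; the `parse_observer_lines` helper and the captured-stdout file name are hypothetical, and the prefix/field names follow the sample lines in the Test Plan.
```
import json

def parse_observer_lines(lines, prefix="PyTorchObserver "):
    """Collect per-iteration latency samples from benchmark stdout.

    Assumes the payload format shown above: a JSON object with "type",
    "metric", "unit", and a stringified "value" after the framework prefix.
    """
    samples = []
    for line in lines:
        line = line.strip()
        if not line.startswith(prefix):
            continue
        record = json.loads(line[len(prefix):])
        if record.get("metric") == "latency":
            samples.append(float(record["value"]))
    return samples

# Hypothetical usage: feed in the captured stdout of the add_test run above.
# with open("benchmark_stdout.txt") as f:
#     per_iter_latencies = parse_observer_lines(f)
```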
Reviewed By: hl475
Differential Revision: D17911718
fbshipit-source-id: 6fe28f2ab9ce1e0feabb5b822f04ff32dac977a9
diff --git a/benchmarks/operator_benchmark/benchmark_caffe2.py b/benchmarks/operator_benchmark/benchmark_caffe2.py
index 7d9254b..52bd3d4 100644
--- a/benchmarks/operator_benchmark/benchmark_caffe2.py
+++ b/benchmarks/operator_benchmark/benchmark_caffe2.py
@@ -4,7 +4,7 @@
from __future__ import unicode_literals
from caffe2.python import workspace
-from caffe2.python import core
+from caffe2.python import core
from caffe2.proto import caffe2_pb2
import benchmark_core
import benchmark_utils
@@ -37,22 +37,22 @@
if device not in ['cuda', 'cpu']:
raise ValueError("Missing attrs in configs")
- if 'cuda' in device:
+ if 'cuda' in device:
self.dev = core.DeviceOption(caffe2_pb2.CUDA, 0)
- else:
+ else:
self.dev = core.DeviceOption(caffe2_pb2.CPU)
return self.dev
def tensor(self, shapes, dtype='float32', device='cpu'):
""" A wapper function to create C2 tensor filled with random data.
- The name/label of the tensor is returned and it is available
- throughout the benchmark execution phase.
- Args:
+ The name/label of the tensor is returned and it is available
+ throughout the benchmark execution phase.
+ Args:
shapes: int or a sequence of ints defining the shape of the tensor
- dtype: use the dtypes from numpy
+ dtype: use the dtypes from numpy
(https://docs.scipy.org/doc/numpy/user/basics.types.html)
- Return:
- C2 tensor of dtype
+ Return:
+ C2 tensor of dtype
"""
blob_name = 'blob_' + str(Caffe2BenchmarkBase.tensor_index)
dev = self._device_option(device)
@@ -65,14 +65,14 @@
""" this is used to label the operator being benchmarked
"""
if self.user_provided_name:
- return self.user_provided_name
+ return self.user_provided_name
return self.__class__.__name__
- def set_module_name(self, name):
+ def set_module_name(self, name):
self.user_provided_name = name
def _value_to_str(self, value):
- """ if value is bool, we will convert it to 0 and 1
+ """ if value is bool, we will convert it to 0 and 1
"""
ret = value
if type(value) == bool:
@@ -83,7 +83,7 @@
""" this is a globally unique name which can be used to
label a specific test
"""
- if name_type == "long":
+ if name_type == "long":
test_name_str = []
for key in kargs:
value = kargs[key]
@@ -99,19 +99,19 @@
class Caffe2OperatorTestCase(object):
- """ This class includes all the information needed to benchmark an operator.
- op_bench: it's a user-defined class (child of Caffe2BenchmarkBase)
+ """ This class includes all the information needed to benchmark an operator.
+ op_bench: it's a user-defined class (child of Caffe2BenchmarkBase)
which includes the input and operator, etc.
test_config: a namedtuple that includes test_name, input_shape, tag, run_backward.
When run_backward is false, the run_forward method will be executed, otherwise
- run_backward method will be executed.
+ run_backward method will be executed.
"""
def __init__(self, op_bench, test_config):
self.op_bench = op_bench
self.test_config = test_config
self.framework = "Caffe2"
- def run_forward(self, num_runs):
+ def run_forward(self, num_runs, print_per_iter=False):
""" Run the forward path of an operator in a loop
"""
with core.DeviceScope(self.op_bench.dev):
diff --git a/benchmarks/operator_benchmark/benchmark_core.py b/benchmarks/operator_benchmark/benchmark_core.py
index aa368e2..b2eed10 100644
--- a/benchmarks/operator_benchmark/benchmark_core.py
+++ b/benchmarks/operator_benchmark/benchmark_core.py
@@ -72,6 +72,7 @@
self.max_iters = 1e6
self.use_jit = args.use_jit
self.num_runs = args.num_runs
+ self.print_per_iter = False
if self.args.iterations:
self.has_explicit_iteration_count = True
self.iters = self.args.iterations
@@ -80,6 +81,10 @@
if self.args.test_name is not None:
self.args.tag_filter = None
+ if self.args.ai_pep_format:
+ self.print_per_iter = True
+
+
def _print_header(self):
DASH_LINE = '-' * 40
print("# {}\n"
@@ -103,6 +108,8 @@
def _print_perf_result(self, reported_run_time_us, test_case):
if self.args.ai_pep_format:
# Output for AI-PEP
+ # Print out per iteration execution time instead of avg time
+ return
test_name = '_'.join([test_case.framework, test_case.test_config.test_name])
for run in range(self.num_runs):
print("{}Observer ".format(test_case.framework) + json.dumps(
@@ -147,26 +154,28 @@
has_explicit_iteration_count) and
curr_test_total_time > self.args.min_time_per_test)
- def _launch_forward(self, test_case, iters):
+ def _launch_forward(self, test_case, iters, print_per_iter):
""" Use Python's timeit module to measure execution time (unit: second).
"""
func = test_case.run_forward
if self.use_jit:
func = test_case.run_jit_forward
- forward_time = timeit.timeit(functools.partial(func, iters), number=1)
+ forward_time = timeit.timeit(functools.partial(func, iters, print_per_iter), number=1)
return forward_time
- def _launch_backward(self, test_case, iters):
+ def _launch_backward(self, test_case, iters, print_per_iter=False):
""" This function runs forward path of an op to get an output. Then the backward path is executed
and the execution time is reported
"""
- test_case.run_forward(num_runs=1)
+ test_case.run_forward(num_runs=1, print_per_iter=False)
if test_case.framework == "PyTorch":
test_case._output_mean()
- backward_time = timeit.timeit(functools.partial(test_case.run_backward, iters), number=1)
+ backward_time = timeit.timeit(functools.partial(test_case.run_backward, iters,
+ print_per_iter),
+ number=1)
return backward_time
- def _measure_time(self, launch_test, test_case, iters):
+ def _measure_time(self, launch_test, test_case, iters, print_per_iter):
"""
This function executes the operator for <iters> iterations and then looks at the time.
If it is not significant, the number of iterations is increased before rerunning.
@@ -178,7 +187,7 @@
if self.args.wipe_cache:
torch.ops.operator_benchmark._clear_cache()
- run_time_sec = launch_test(test_case, iters)
+ run_time_sec = launch_test(test_case, iters, print_per_iter)
curr_test_total_time += run_time_sec
# Analyze time after each run to decide if the result is stable
results_are_significant = self._iteration_result_is_significant(
@@ -252,9 +261,10 @@
launch_func = self._launch_forward
# Warmup
- launch_func(test_case, self.args.warmup_iterations)
+ launch_func(test_case, self.args.warmup_iterations, print_per_iter=False)
# Actual Execution
- reported_time = [self._measure_time(launch_func, test_case, self.iters)
+ reported_time = [self._measure_time(launch_func, test_case,
+ self.iters, self.print_per_iter)
for _ in range(self.num_runs)]
self._print_perf_result(reported_time, test_case)
diff --git a/benchmarks/operator_benchmark/benchmark_pytorch.py b/benchmarks/operator_benchmark/benchmark_pytorch.py
index fe558dc..6d13fdb 100644
--- a/benchmarks/operator_benchmark/benchmark_pytorch.py
+++ b/benchmarks/operator_benchmark/benchmark_pytorch.py
@@ -3,6 +3,8 @@
from __future__ import print_function
from __future__ import unicode_literals
+import time
+import json
import benchmark_core
import torch
import cpp_extension # noqa
@@ -16,8 +18,8 @@
class TorchBenchmarkBase(object):
""" This is a base class used to create Pytorch operator benchmark.
- module_name is the name of the operator being benchmarked.
- test_name is the name (it's created by concatenating all the
+ module_name is the name of the operator being benchmarked.
+ test_name is the name (it's created by concatenating all the
inputs) of a specific test
"""
@@ -32,12 +34,12 @@
def auto_set(self):
""" This is used to automatically set the require_grad for the backward patch.
- It is implemented based on two counters. One counter to save the number of
- times init has been called. The other counter to save the number of times
- this function itself has been called. In the very first time init is called,
- this function counts how many inputs require gradient. In each of the
- following init calls, this function will return only one true value.
- Here is an example:
+ It is implemented based on two counters. One counter to save the number of
+ times init has been called. The other counter to save the number of times
+ this function itself has been called. In the very first time init is called,
+ this function counts how many inputs require gradient. In each of the
+ following init calls, this function will return only one true value.
+ Here is an example:
...
self.v1 = torch.rand(M, N, K, requires_grad=self.auto_set())
self.v2 = torch.rand(M, N, K, requires_grad=self.auto_set())
@@ -49,23 +51,23 @@
if self._pass_count == 0:
self._num_inputs_require_grads += 1
return True
- else:
+ else:
self._auto_set_counter += 1
return (self._pass_count == self._auto_set_counter)
def forward(self):
- pass
+ pass
def _wrap_forward(self, foo):
- """ The function passed to JIT trace must have at least one argument,
- this function is to wrap the forward method to meet that requirement.
+ """ The function passed to JIT trace must have at least one argument,
+ this function is to wrap the forward method to meet that requirement.
_consume op is used to avoid the dead-code-elimination optimization
- in JIT.
+ in JIT.
"""
return torch.ops.operator_benchmark._consume(self.forward())
def _generate_jit_forward_graph(self):
- """ generate a graph for the forward function via tracing
+ """ generate a graph for the forward function via tracing
"""
func = torch.jit.trace(self._wrap_forward, torch.rand(1))
@@ -84,19 +86,19 @@
""" this is used to label the operator being benchmarked
"""
if self.user_given_name:
- return self.user_given_name
+ return self.user_given_name
return self.__class__.__name__
- def set_module_name(self, name):
+ def set_module_name(self, name):
self.user_given_name = name
def test_name(self, **kargs):
- """ this is a globally unique name which can be used to
- label a specific test
+ """ this is a globally unique name which can be used to
+ label a specific test
"""
# This is a list of attributes which will not be included
- # in the test name.
+ # in the test name.
skip_key_list = ['device']
test_name_str = []
@@ -111,12 +113,12 @@
class PyTorchOperatorTestCase(object):
- """ This class includes all the information needed to benchmark an operator.
+ """ This class includes all the information needed to benchmark an operator.
op_bench: it's a user-defined class (child of TorchBenchmarkBase)
which includes the input and operator, etc.
test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
- When run_backward is false, the run_forward method will be executed,
- When run_backward is true, run_forward_eager and _output_mean will be
+ When run_backward is false, the run_forward method will be executed,
+ When run_backward is true, run_forward_eager and _output_mean will be
executed to generate output. Then, run_backward will be executed.
"""
def __init__(self, op_bench, test_config):
@@ -125,30 +127,48 @@
self.place_holder_tensor = torch.ones(1)
self.framework = "PyTorch"
- def run_jit_forward(self, num_runs):
+ def run_jit_forward(self, num_runs, print_per_iter=False):
""" Run the forward path of an op with JIT mode
"""
if self.op_bench._jit_forward is None:
self.op_bench._jit_forward = self.op_bench._generate_jit_forward_graph()
self.op_bench._jit_forward(num_runs, self.place_holder_tensor)
- def run_forward(self, num_runs):
+ def run_forward(self, num_runs, print_per_iter):
""" Run the forward path of an op with eager mode
"""
- for _ in range(num_runs):
- self.output = self.op_bench.forward()
+ if print_per_iter:
+ time_series = []
+ for _ in range(num_runs):
+ start_time = time.time()
+ self.output = self.op_bench.forward()
+ end_time = time.time()
+ time_series.append(end_time - start_time)
+
+ for iter_time in time_series:
+ print("PyTorchObserver " + json.dumps(
+ {
+ "type": self.test_config.test_name,
+ "metric": "latency",
+ "unit": "us",
+ "value": str(iter_time),
+ }
+ ))
+ else:
+ for _ in range(num_runs):
+ self.output = self.op_bench.forward()
def _output_mean(self):
- """ TODO (mingzhe): it is not necessary to sum up everything by myself,
- torch.autograd.backward do take a gradient tensor. By default, it
- is the same shape as your output tensor, with all 1s.
- Mathematically, it is the same as if the output is summed together.
- So we should be able to get ride of this method.
+ """ TODO (mingzhe): it is not necessary to sum up everything by myself,
+ torch.autograd.backward do take a gradient tensor. By default, it
+ is the same shape as your output tensor, with all 1s.
+ Mathematically, it is the same as if the output is summed together.
+ So we should be able to get ride of this method.
dummy function for gradient calculation
"""
self.mean = self.output.mean()
- def run_backward(self, num_runs):
+ def run_backward(self, num_runs, print_per_iter=False):
""" Run the backward path of an op in many iterations
"""
# TODO: can we use JIT here to reduce python overhead?
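For reference, the per-iteration measurement pattern that this diff threads through `_measure_time` into `run_forward` boils down to roughly the standalone sketch below; `run_forward_per_iter`, the `op` callable, and the test name are illustrative stand-ins rather than code from the PR.
```
import json
import time

import torch

def run_forward_per_iter(op, num_runs, test_name):
    """Time each forward call individually and emit one observer line per run.

    Mirrors the shape of PyTorchOperatorTestCase.run_forward with
    print_per_iter=True; the printed "value" is the raw time.time() delta.
    """
    for _ in range(num_runs):
        start_time = time.time()
        op()
        end_time = time.time()
        print("PyTorchObserver " + json.dumps({
            "type": test_name,
            "metric": "latency",
            "unit": "us",
            "value": str(end_time - start_time),
        }))

if __name__ == "__main__":
    # Hypothetical stand-in workload: three timed runs of an elementwise add.
    a, b = torch.rand(64, 64), torch.rand(64, 64)
    run_forward_per_iter(lambda: a + b, num_runs=3, test_name="add_M64_N64_cpu")
```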