| """Example use of Timer and op fuzzers to measure kernel performance. |
| |
| $ python -m examples.op_benchmark |
| """ |

import numpy as np
import torch

from utils import Timer
from op_fuzzers.binary import BinaryOpFuzzer
from op_fuzzers.unary import UnaryOpFuzzer


# Minimum measurement time (seconds) that Timer.blocked_autorange collects
# for each stmt / dtype pair.
_MEASURE_TIME = 1.0


def assert_dicts_equal(dict_0, dict_1):
    """Builtin dict comparison will not compare numpy arrays.
    e.g.
        x = {"a": np.ones((2, 1))}
        y = {"a": np.ones((2, 1))}
        x == y  # Raises ValueError
    """
    assert set(dict_0.keys()) == set(dict_1.keys())
    # "dtype" is skipped: the float and int variants intentionally differ there.
    assert all(np.all(v == dict_1[k]) for k, v in dict_0.items() if k != "dtype")


def run(n, stmt, fuzzer_cls):
    float_iter = fuzzer_cls(seed=0, dtype=torch.float32).take(n)
    int_iter = fuzzer_cls(seed=0, dtype=torch.int32).take(n)
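
    # Each draw from a fuzzer is a (tensors, tensor_params, params) tuple:
    # the Tensors themselves, per-Tensor construction metadata (e.g. "order"
    # and "steps" below), and the parameters the fuzzer sampled. Both fuzzers
    # share seed=0 so their draws stay aligned.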
    results = []
    for i, (float_values, int_values) in enumerate(zip(float_iter, int_iter)):
        float_tensors, float_tensor_params, float_params = float_values
        int_tensors, int_tensor_params, int_params = int_values

        # This benchmark assumes that the two fuzzers generate identically
        # sized and strided Tensors, since the same seed is used.
        assert_dicts_equal(float_params, int_params)
        assert_dicts_equal(float_tensor_params["x"], int_tensor_params["x"])
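
        # Time the stmt for both dtypes. blocked_autorange() is assumed to
        # behave like torch.utils.benchmark.Timer.blocked_autorange: it
        # collects measurements until at least `min_run_time` seconds of
        # runtime have accumulated.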
        float_measurement, int_measurement = [
            Timer(
                stmt,
                globals=tensors,
            ).blocked_autorange(min_run_time=_MEASURE_TIME)
            for tensors in (float_tensors, int_tensors)
        ]
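
        # Summarize each Tensor for the report: shape (with powers of two
        # rendered as `2 ** k`), dimension order (blank if unpermuted), and
        # steps (blank if all ones).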
        descriptions = []
        for name in float_tensors:
            shape_str = "(" + ", ".join([
                f"2 ** {int(np.log2(i))}"
                if 2 ** int(np.log2(i)) == i and i > 1
                else str(i)
                for i in float_tensors[name].shape
            ]) + ")"
            order = float_tensor_params[name]["order"]
            order_str = "" if np.array_equal(order, np.arange(len(order))) else str(tuple(order))
            steps = float_tensor_params[name]["steps"]
            steps_str = str(steps) if sum(steps) > len(steps) else ""
            descriptions.append((name, shape_str, order_str, steps_str))
        results.append((float_measurement, int_measurement, descriptions))

        print(f"\r{i + 1} / {n}", end="")
    print()
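
    # Scale medians to microseconds and compute the symmetric relative
    # difference: |t_float - t_int| / ((t_float + t_int) / 2). Column widths
    # are tracked so the report below lines up.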
    parsed_results = []
    name_len = shape_len = order_len = steps_len = 0
    for float_measurement, int_measurement, descriptions in results:
        t_float = float_measurement.median * 1e6
        t_int = int_measurement.median * 1e6
        rel_diff = abs(t_float - t_int) / (t_float + t_int) * 2
        parsed_results.append((t_float, t_int, rel_diff, descriptions))
        for name, shape, order, steps in descriptions:
            name_len = max(name_len, len(name))
            shape_len = max(shape_len, len(shape))
            order_len = max(order_len, len(order))
            steps_len = max(steps_len, len(steps))

    parsed_results.sort(key=lambda x: x[2])
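
    # parsed_results is sorted by relative difference, so the report shows
    # the ten most dtype-agnostic cases first and the ten most divergent
    # cases last.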
    print(f"stmt: {stmt}")
    print(f" diff faster{'':>17}{' ' * name_len} ", end="")
    print(f"{'shape'.ljust(shape_len)}{'':>16}{'order'.ljust(order_len)}", end="")
    print(f" steps\n{'-' * 100}")
    for chunk, spacer in [(parsed_results[:10], "..."), (parsed_results[-10:], "")]:
        for t_float, t_int, rel_diff, descriptions in chunk:
            # Only the first row of an entry shows the timing; extra rows
            # (one per Tensor) are space-padded to keep columns aligned.
            time_str = [f"{rel_diff * 100:>4.1f}% {'int' if t_int < t_float else 'float':<20}"]
            time_str.extend(["".ljust(len(time_str[0])) for _ in descriptions[:-1]])
            for t_str, (name, shape, order, steps) in zip(time_str, descriptions):
                name = f"{name}:".ljust(name_len + 1)
                shape = shape.ljust(shape_len + 10)
                order = order.ljust(order_len)
                print(f"{t_str} {name} {shape}| {order} | {steps}")
        print(spacer)


def main():
    run(n=100, stmt="torch.median(x, dim=0)", fuzzer_cls=UnaryOpFuzzer)
    run(n=100, stmt="torch.square(x)", fuzzer_cls=UnaryOpFuzzer)
    run(n=100, stmt="x + y", fuzzer_cls=BinaryOpFuzzer)


if __name__ == "__main__":
    main()