| #!/usr/bin/env python3 |
| from __future__ import annotations |
| |
| import argparse |
| import collections |
| import contextlib |
| import copy |
| import csv |
| import functools |
| import importlib |
| import itertools |
| import logging |
| import os |
| import pathlib |
| import random |
| import shutil |
| import signal |
| import subprocess |
| import sys |
| import time |
| from contextlib import contextmanager |
| |
| from typing import Any, Callable, Mapping, NamedTuple, Optional, Tuple, Type |
| from unittest.mock import MagicMock |
| |
| import numpy as np |
| import pandas as pd |
| import psutil |
| import torch |
| |
| import torch._dynamo |
| import torch._dynamo.utils |
| import torch._export |
| import torch.distributed |
| from scipy.stats import gmean, ttest_ind |
| from torch._dynamo.profiler import fx_insert_profiling, Profiler |
| from torch._dynamo.testing import dummy_fx_compile, format_speedup, same |
| from torch._dynamo.utils import clone_inputs, graph_break_reasons |
| from torch._functorch.aot_autograd import set_model_name |
| from torch._inductor import config as inductor_config |
| from torch._inductor.utils import fresh_inductor_cache |
| from torch._subclasses.fake_tensor import FakeTensorMode |
| |
| from torch.utils import _pytree as pytree |
| from torch.utils._pytree import tree_map, tree_map_only |
| |
| from tqdm.auto import tqdm, trange |
| |
| try: |
| from .microbenchmarks.operator_inp_utils import OperatorInputsMode |
| except ImportError: |
| from microbenchmarks.operator_inp_utils import OperatorInputsMode |
| |
| try: |
| import torch_xla.core.xla_model as xm |
| except ImportError: |
| # ignore the error if torch_xla is not installed |
| pass |
| |
| log = logging.getLogger(__name__) |
| |
| # We are primarily interested in TF32 |
| torch.backends.cuda.matmul.allow_tf32 = True |
| |
| # Suppress torch.profiler spam |
| os.environ["KINETO_LOG_LEVEL"] = "5" |
| |
| current_name = "" |
| current_device = "" |
| current_onnx_compiler = "" |
| current_batch_size = None |
| output_filename = None |
| |
| MAX_DOWNLOAD_ATTEMPTS = 5 |
| |
| |
| class CI(NamedTuple): |
| backend: str # aot_eager or inductor |
| training: bool |
| dynamic: bool = False |
| device: str = "cuda" |
| |
| |
| CI_SKIP = collections.defaultdict(list) |
| |
| |
| # Skips for dynamic=False |
| |
| # Here eager really means dynamo+eager |
| CI_SKIP[CI("eager", training=False)] = [ |
| # TorchBench |
| "DALLE2_pytorch", # AttributeError: text_encodings |
| "hf_BigBird", # fail_accuracy |
| # TypeError: pad_center() takes 1 positional argument but 2 were given |
| "tacotron2", |
| # torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427 |
| "torchrec_dlrm", |
| # Huggingface |
| "DebertaV2ForQuestionAnswering", # OOM |
| ] |
| |
| CI_SKIP[CI("eager", training=True)] = [ |
| *CI_SKIP[CI("eager", training=False)], |
| # TorchBench |
| "BERT_pytorch", # accuracy |
| "Background_Matting", # fp64_OOM |
| "hf_BigBird", # fp64_OOM |
| "hf_T5_base", # fp64_OOM |
| "llama", # Accuracy failed: allclose not within tol=0.001 |
| "vision_maskrcnn", # The size of tensor a (29) must match the size of tensor b (33) (doesn't repro) |
| # Huggingface |
| "XGLMForCausalLM", # OOM |
| # TIMM |
| "cait_m36_384", # fp64_OOM |
| "convit_base", # fp64_OOM |
| "mobilenetv2_100", # accuracy |
| "xcit_large_24_p8_224", # fp64_OOM, |
| ] |
| |
| CI_SKIP[CI("aot_eager", training=False)] = [ |
| *CI_SKIP[CI("eager", training=False)], |
| # all dynamic shapes errors for detectron variants |
| "demucs", # OOM |
| "detectron2_fasterrcnn_r_101_c4", |
| "detectron2_fasterrcnn_r_101_dc5", |
| "detectron2_fasterrcnn_r_101_fpn", |
| "detectron2_fasterrcnn_r_50_c4", |
| "detectron2_fasterrcnn_r_50_dc5", |
| "detectron2_fasterrcnn_r_50_fpn", |
| "detectron2_fcos_r_50_fpn", |
| "detectron2_maskrcnn_r_101_c4", |
| "detectron2_maskrcnn_r_101_fpn", |
| "detectron2_maskrcnn_r_50_c4", |
| "detectron2_maskrcnn_r_50_fpn", |
| "hf_BigBird", # OOM |
| "tacotron2", # AssertionError: Deduped args out of bounds |
| # Huggingface |
| "BartForConditionalGeneration", # OOM |
| "DebertaV2ForQuestionAnswering", # OOM |
| # Torchbench |
| "speech_transformer", # https://github.com/pytorch/pytorch/issues/99893 |
| "pyhpc_isoneutral_mixing", # https://github.com/pytorch/pytorch/issues/99893 |
| "pyhpc_turbulent_kinetic_energy", # https://github.com/pytorch/pytorch/issues/99893 |
| ] |
| |
| CI_SKIP[CI("aot_eager", training=True)] = [ |
| *CI_SKIP[CI("aot_eager", training=False)], |
| # TorchBench |
| "Background_Matting", # fp64_OOM |
| "hf_T5_base", # fp64_OOM |
| "mobilenet_v2_quantized_qat", # fp64_OOM |
| "resnet50_quantized_qat", # fp64_OOM |
| "pytorch_struct", |
| # Huggingface |
| "MBartForConditionalGeneration", # OOM |
| "M2M100ForConditionalGeneration", # OOM |
| "XGLMForCausalLM", # OOM |
| # TIMM |
| "cait_m36_384", # fp64_OOM |
| "convit_base", # fp64_OOM |
| "fbnetv3_b", # Accuracy (blocks.2.2.bn1.weight.grad) |
| "levit_128", # Accuracy (patch_embed.0.c.weight.grad) |
| "lcnet_050", # Accuracy (blocks.1.0.bn2.weight.grad) |
| "sebotnet33ts_256", # Accuracy (stem.conv1.conv.weight.grad) |
| "xcit_large_24_p8_224", # fp64_OOM, |
| ] |
| |
| CI_SKIP[CI("inductor", training=False)] = [ |
| # TorchBench |
| "DALLE2_pytorch", # AttributeError: text_encodings |
| # torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427 |
| "torchrec_dlrm", |
| "demucs", # OOM |
| "detectron2_fasterrcnn_r_101_c4", |
| "detectron2_fasterrcnn_r_101_dc5", |
| "detectron2_fasterrcnn_r_101_fpn", |
| "detectron2_fasterrcnn_r_50_c4", |
| "detectron2_fasterrcnn_r_50_dc5", |
| "detectron2_fasterrcnn_r_50_fpn", |
| "detectron2_fcos_r_50_fpn", |
| "detectron2_maskrcnn_r_101_c4", |
| "detectron2_maskrcnn_r_101_fpn", |
| "detectron2_maskrcnn_r_50_c4", |
| "detectron2_maskrcnn_r_50_fpn", |
| # TorchBench |
| "detectron2", |
| "densenet121", # flaky accuracy |
| "hf_T5", # accuracy |
| "hf_BigBird", # accuracy |
| "hf_GPT2_large", # OOM |
| "maml", # accuracy |
| "mobilenet_v2_quantized_qat", # The eval test only supports CPU |
| "pytorch_struct", # Test eval is not implemented |
| "pyhpc_equation_of_state", # Accuracy |
| "pyhpc_turbulent_kinetic_energy", # Accuracy |
| "tacotron2", |
| ] |
| |
| CI_SKIP[CI("inductor", training=False, device="cpu")] = [ |
| # TorchBench |
| "drq", # Need to update torchbench |
| "detectron2_fasterrcnn_r_101_c4", |
| "detectron2_fasterrcnn_r_101_dc5", |
| "detectron2_fasterrcnn_r_101_fpn", |
| "detectron2_fasterrcnn_r_50_c4", |
| "detectron2_fasterrcnn_r_50_dc5", |
| "detectron2_fasterrcnn_r_50_fpn", |
| "detectron2_fcos_r_50_fpn", |
| "detectron2_maskrcnn_r_101_c4", |
| "detectron2_maskrcnn_r_101_fpn", |
| "detectron2_maskrcnn_r_50_c4", |
| "detectron2_maskrcnn_r_50_fpn", |
| "doctr_det_predictor", # requires newer gcc |
| "doctr_reco_predictor", # requires newer gcc |
| "gat", # does not work with fp32 |
| "gcn", # does not work with fp32 |
| "hf_Bert_large", # OOM |
| "hf_GPT2_large", # Intermittent failure on CI |
| "hf_T5_base", # OOM |
| "mobilenet_v2_quantized_qat", |
| "pyhpc_turbulent_kinetic_energy", |
| "resnet50_quantized_qat", # Eager model failed to run(Quantize only works on Float Tensor, got Double) |
| "sage", # does not work with fp32 |
| # torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427 |
| "torchrec_dlrm", |
| # Huggingface |
| "MBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94793 |
| "PLBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94794 |
| # TIMM |
| "cait_m36_384", # Accuracy |
| "pnasnet5large", # OOM |
| "xcit_large_24_p8_224", # OOM https://github.com/pytorch/pytorch/issues/95984 |
| "opacus_cifar10", # Fails to run https://github.com/pytorch/pytorch/issues/99201 |
| ] |
| |
| CI_SKIP[CI("inductor", training=True)] = [ |
| *CI_SKIP[CI("inductor", training=False)], |
| # TorchBench |
| "Background_Matting", # fp64_OOM |
| "dlrm", # Fails on CI - unable to repro locally |
| "hf_T5_base", # accuracy |
| "mobilenet_v3_large", # accuracy |
| "resnet50_quantized_qat", # Eager model failed to run |
| "AlbertForQuestionAnswering", # accuracy |
| "crossvit_9_240", # fails to run on timm 0.8.22 with cudagraphs, mempools |
| "deit_base_distilled_patch16_224", # fails to run in timm 0.8.22, cudagraphs |
| "mobilevit_s", |
| "pit_b_224", |
| "twins_pcpvt_base", |
| "visformer_small", |
| "vit_base_patch16_224", |
| "xcit_large_24_p8_224", |
| ] |
| |
| # Skips for dynamic=True |
| |
| CI_SKIP[CI("aot_eager", training=False, dynamic=True)] = [ |
| *CI_SKIP[CI("aot_eager", training=False)], |
| "vision_maskrcnn", # accuracy failure on boxes, after https://github.com/pytorch/pytorch/issues/101093 |
| # https://github.com/pytorch/pytorch/issues/103760 |
| "dlrm", |
| "hf_T5_generate", |
| ] |
| |
| CI_SKIP[CI("aot_eager", training=True, dynamic=True)] = [ |
| *CI_SKIP[CI("aot_eager", training=True)], |
| *CI_SKIP[CI("aot_eager", training=False, dynamic=True)], |
| "llama", # AssertionError: cannot compute free_symbols of True |
| ] |
| |
| CI_SKIP[CI("inductor", training=False, dynamic=True)] = [ |
| *CI_SKIP[CI("aot_eager", training=False, dynamic=True)], |
| *CI_SKIP[CI("inductor", training=False)], |
| "nanogpt_generate", # Assertion `index out of bounds: 0 <= tmp0 < 64` failed. |
| ] |
| |
| CI_SKIP[CI("inductor", training=True, dynamic=True)] = [ |
| # NB: Intentionally omitting for symmetry with dynamic=False |
| # *CI_SKIP[CI("aot_eager", training=True, dynamic=True)], |
| *CI_SKIP[CI("inductor", training=False, dynamic=True)], |
| *CI_SKIP[CI("inductor", training=True)], |
| "levit_128", # Accuracy fails on A10G, passes on A100 |
| "sebotnet33ts_256", # Flaky accuracy failed |
| ] |
| |
| CI_SKIP[CI("inductor", training=False, dynamic=True, device="cpu")] = [ |
| *CI_SKIP[CI("inductor", training=False, device="cpu")], |
| "pyhpc_isoneutral_mixing", |
| "dpn107", |
| ] |
| |
| CI_SKIP_OPTIMIZER = { |
| # TIMM |
| "convmixer_768_32", # accuracy |
| "hrnet_w18", # Stack issue in fx |
| # TorchBench |
| "dlrm", # symbolic shapes error |
| # HF |
| "pnasnet5large", # Stack issue in fx |
| "MobileBertForMaskedLM", # Stack issue in fx |
| "MobileBertForQuestionAnswering", # Stack issue in fx |
| "PegasusForConditionalGeneration", # OOM |
| } |
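| |
| # Illustrative lookup sketch (not part of the benchmark flow): skip lists are |
| # keyed by the CI named tuple, so a CI job resolves its skips like this. The |
| # configuration values below are examples only. |
| # |
| #   ci_key = CI("inductor", training=True, dynamic=False, device="cuda") |
| #   if current_name in CI_SKIP[ci_key]: |
| #       print(f"skipping {current_name} on this CI configuration") |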
| |
| |
| def model_specified_by_path(path_and_class_str): |
| return ":" in path_and_class_str |
| |
| |
| def load_model_from_path(path_and_class_str): |
| configs = {} |
| for kvstr in path_and_class_str.split(","): |
| k, v = kvstr.split(":") |
| configs[k] = v |
| |
| for name in ["path", "class"]: |
| if name not in configs: |
| raise RuntimeError( |
| "Invalid --only arguments. Check help message for the correct format" |
| ) |
| |
| path = configs["path"] |
| class_name = configs["class"] |
| |
| if path[:1] != "/": |
| raise RuntimeError( |
| "Use absolute path since dynamo may change the current working directory which makes using relative path tricky" |
| ) |
| |
| spec = importlib.util.spec_from_file_location("module_name", path) |
| module = importlib.util.module_from_spec(spec) |
| spec.loader.exec_module(module) |
| |
| model_class = getattr(module, class_name) |
| assert issubclass(model_class, torch.nn.Module) |
| model = model_class() |
| assert hasattr(model, "get_example_inputs") |
| inputs = model.get_example_inputs() |
| return model, inputs |
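| |
| # Illustrative --only value for the parser above (the path and class name are |
| # hypothetical): a comma-separated list of key:value pairs, with both "path" |
| # and "class" required, e.g. |
| # |
| #   --only "path:/abs/path/to/my_model.py,class:MyModel" |
| # |
| # where my_model.py defines MyModel(torch.nn.Module) exposing |
| # get_example_inputs(). |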
| |
| |
| def output_csv(filename, headers, row): |
| if os.path.exists(filename): |
| with open(filename, "r") as fd: |
| lines = list(csv.reader(fd)) or [[]] |
| if headers and len(headers) > len(lines[0]): |
| # if prior results failed the header might not be filled in yet |
| lines[0] = headers |
| else: |
| headers = lines[0] |
| else: |
| lines = [headers] |
| lines.append([(f"{x:.6f}" if isinstance(x, float) else x) for x in row]) |
| with open(filename, "w") as fd: |
| writer = csv.writer(fd, lineterminator="\n") |
| for line in lines: |
| writer.writerow(list(line) + ["0"] * (len(headers) - len(line))) |
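| |
| # Minimal usage sketch (illustrative; the filename and columns are made up). |
| # output_csv appends one row per call, formats floats to six decimals, and |
| # pads short rows with "0" so every row matches the header width: |
| # |
| #   output_csv("run.csv", ["dev", "name", "speedup"], ["cuda", "resnet50", 1.5]) |
| #   output_csv("run.csv", ["dev", "name", "speedup"], ["cuda", "bert"]) |
| #   # run.csv now holds the header plus two rows; the second row's missing |
| #   # speedup column is padded with "0". |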
| |
| |
| def nothing(f): |
| return f |
| |
| |
| @functools.lru_cache(None) |
| def patch_torch_manual_seed(): |
| """Make torch manual seed deterministic. Helps with accuracy testing.""" |
| |
| def deterministic_torch_manual_seed(*args, **kwargs): |
| from torch._C import default_generator |
| |
| seed = 1337 |
| import torch.cuda |
| |
| if not torch.cuda._is_in_bad_fork(): |
| torch.cuda.manual_seed_all(seed) |
| return default_generator.manual_seed(seed) |
| |
| torch.manual_seed = deterministic_torch_manual_seed |
| |
| |
| def synchronize(): |
| # no-op placeholder; the harness may rebind this to a device-specific |
| # synchronize (e.g. torch.cuda.synchronize) when running on an accelerator. |
| pass |
| |
| |
| def summarize_graph_break(filename): |
| """ |
| Sorts and de-dupes the graph breaks on the reason string. Note that this |
| function is just a best effort to reduce the logging information. We could |
| miss some graph breaks because of de-duping. We can further refine this |
| function as the need arises. |
| """ |
| # filename always ends in ".csv", so strip the suffix directly; rstrip |
| # removes a character set and can eat trailing "s"/"c"/"v" characters. |
| log_file = f"{filename[:-4]}_graph_breaks.csv" |
| if os.path.exists(log_file): |
| df = pd.read_csv(log_file) |
| df = df.sort_values("reason").drop_duplicates(subset="reason") |
| |
| # Specialize for multi tensor sgd as reason is not identical |
| multi_tensor_sgd_row = df.loc[df["reason"].str.contains("_multi_tensor_sgd")] |
| if len(multi_tensor_sgd_row): |
| df = df[ |
| ~df["reason"].str.contains("_multi_tensor_sgd") |
| ] # Drop all sgd rows |
| df = pd.concat( |
| [df, pd.DataFrame([multi_tensor_sgd_row.iloc[0]])], axis=0 |
| ) # Add back a single row |
| df.to_csv(f"{log_file.rstrip('.csv')}_deduped.csv", index=False) |
| |
| |
| def print_summary(filename, print_dataframe=False): |
| if not (filename and os.path.exists(filename)): |
| return |
| data = pd.read_csv(filename) |
| if "tag" in data.columns: |
| for tag in data.tag.unique(): |
| if tag == "0.0000": |
| continue # This happens for failed runs |
| print(f"\nSummary for tag={tag}:") |
| print_summary_table(data[data.tag == tag], print_dataframe=print_dataframe) |
| else: |
| print_summary_table(data, print_dataframe=print_dataframe) |
| summarize_graph_break(filename) |
| |
| |
| def print_summary_table(data, print_dataframe=False): |
| if print_dataframe: |
| pd.options.display.max_rows = 1000 |
| pd.options.display.max_columns = 1000 |
| pd.options.display.width = 2000 |
| print(data) |
| width = max(map(len, data.columns)) |
| for col in data.columns: |
| try: |
| if col in ("dev", "name", "batch_size", "tag"): |
| continue |
| elif col in ("pct_ops", "pct_time"): |
| print(col.ljust(width), f"{data[col].mean():.3%}") |
| elif col in ("graphs", "graph_calls", "captured_ops", "total_ops"): |
| print(col.ljust(width), f"{data[col].mean():.3f}") |
| elif col in ("compilation_latency"): |
| print(col.ljust(width), f"mean={data[col].mean():.3f} seconds") |
| elif col in ("compression_ratio"): |
| print(col.ljust(width), f"mean={data[col].mean():.3f}x") |
| elif col in ("accuracy"): |
| pass_rate = (data[col] == "pass").mean() |
| print(col.ljust(width), f"pass_rate={100*pass_rate:.2f}%") |
| else: |
| cdata = data[col] |
| print( |
| col.ljust(width), |
| f"gmean={gmean(cdata):.2f}x mean={cdata.mean():.3f}x", |
| ) |
| except Exception: |
| pass |
| |
| |
| def tensor_is_on_xla(tensors): |
| def visit(x: torch.Tensor): |
| nonlocal result |
| if x.device.type == "xla": |
| result = True |
| |
| result = False |
| tree_map_only(torch.Tensor, visit, tensors) |
| return result |
| |
| |
| def timed( |
| model, |
| model_iter_fn, |
| example_inputs, |
| times=1, |
| return_result=False, |
| collect_outputs=False, |
| ): |
| use_xla = tensor_is_on_xla(example_inputs) |
| synchronize() |
| |
| if use_xla: |
| xm.mark_step() |
| xm.wait_device_ops() |
| |
| time_total = 0 |
| # Don't collect outputs, to measure timing correctly |
| for _ in range(times): |
| # Put this call inside the loop to reset the seed for each iteration. |
| # reset_rng_state() is intentionally kept outside the timed region. |
| reset_rng_state(use_xla) |
| t_iter_begin = time.perf_counter() |
| result = model_iter_fn(model, example_inputs, collect_outputs=collect_outputs) |
| |
| # instead of calling sync on result_list, we should call mark_step. |
| # In training case, result_list may be empty, but we want to |
| # send all the pending graphs for compilation. |
| if use_xla: |
| # For the model running on regular torchxla (baseline), we need the |
| # mark step to send the accumulated graph for compilation. |
| # |
| # For the model running with dynamo/torchxla bridge, in training case, |
| # we need the mark step to send the optimizer graph out for |
| # compilation. |
| xm.mark_step() |
| t_iter_end = time.perf_counter() |
| time_total += t_iter_end - t_iter_begin |
| |
| t_0 = time.perf_counter() |
| if use_xla: |
| xm.wait_device_ops() |
| synchronize() |
| t_1 = time.perf_counter() |
| time_total += t_1 - t_0 |
| return (time_total, result) if return_result else time_total |
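| |
| # Usage sketch (illustrative; `my_model` and `forward_pass` are placeholders). |
| # `model_iter_fn` must accept (model, inputs, collect_outputs=...): |
| # |
| #   def forward_pass(mod, inputs, collect_outputs=True): |
| #       return mod(*inputs) |
| # |
| #   latency, out = timed( |
| #       my_model, forward_pass, example_inputs, times=5, return_result=True |
| #   ) |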
| |
| |
| class Stats: |
| totals = collections.defaultdict(collections.Counter) |
| |
| @classmethod |
| def reset_counters(cls): |
| for k, v in torch._dynamo.utils.counters.items(): |
| cls.totals[k].update(v) |
| ok = torch._dynamo.utils.counters["frames"]["ok"] |
| total = torch._dynamo.utils.counters["frames"]["total"] |
| torch._dynamo.utils.counters.clear() |
| return ok, total |
| |
| @classmethod |
| def print_summary(cls): |
| for k, v in sorted(cls.totals.items()): |
| lines = "\n ".join(map(str, v.most_common(50))) |
| print(f"STATS {k}\n {lines}") |
| |
| @classmethod |
| def aot_summary(cls): |
| return [cls.totals["aot_autograd"]["total"], cls.totals["aot_autograd"]["ok"]] |
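| |
| # Sketch of the intended pattern (illustrative): reset_counters() folds the |
| # live dynamo counters into Stats.totals and returns the ok/total frame |
| # counts accumulated since the previous reset. |
| # |
| #   ok, total = Stats.reset_counters()  # after running one model |
| #   Stats.print_summary()               # once at the end of the run |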
| |
| |
| def coverage_experiment(args, model_iter_fn, model, example_inputs): |
| """ |
| Test operator/model coverage of TorchDynamo and record statistics |
| taken from a profiler. This target is mainly intended to check |
| correctness. |
| |
| Writes to ./coverage.csv |
| """ |
| profiler = Profiler() |
| frozen_model_iter_fn = torch._dynamo.run(model_iter_fn) |
| with profiler.prof: |
| frozen_model_iter_fn(model, example_inputs) |
| coverage_result = profiler.results() |
| output_csv( |
| output_filename, |
| ( |
| "dev", |
| "name", |
| "batch_size", |
| "graphs", |
| "graph_calls", |
| "captured_ops", |
| "total_ops", |
| "pct_ops", |
| "pct_time", |
| ), |
| [ |
| current_device, |
| current_name, |
| current_batch_size, |
| ] |
| + coverage_result.tocsv(), |
| ) |
| return coverage_result |
| |
| |
| def speedup_experiment_fx2trt(args, model_iter_fn, model, example_inputs): |
| """ |
| Measure speedups over eager using the TRT inference backend. The TRT backend is based on the |
| FX graph generated by torch._dynamo. |
| Writes to ./speedups_fx2trt.csv |
| """ |
| return speedup_experiment(args, model_iter_fn, model, example_inputs) |
| |
| |
| def recompile_profiler_experiment(args, model_iter_fn, model, example_inputs): |
| with torch._dynamo.utils.CompileProfiler() as prof: |
| opt_model_iter_fn = torch._dynamo.optimize(prof, nopython=args.nopython)( |
| model_iter_fn |
| ) |
| opt_model_iter_fn(model, example_inputs) |
| output_csv( |
| output_filename, ["model", "profiler report"], [current_name, prof.report()] |
| ) |
| met = prof.get_metrics() |
| guard_failures = len(met["guard_failures"]) |
| return [guard_failures] |
| |
| |
| def randomize_input(inputs): |
| if isinstance(inputs, (list, tuple)): |
| return type(inputs)([randomize_input(x) for x in inputs]) |
| elif isinstance(inputs, torch.Tensor): |
| if inputs.dtype in (torch.float32, torch.float64): |
| torch._dynamo.utils.counters["randomize_input"]["times"] += 1 |
| return torch.randn_like(inputs) |
| elif inputs.dtype == torch.int64: |
| # Note: we cannot simply randomize integer tensors as follows: |
| # `return torch.randint_like(inputs, high=inputs.max().item())` |
| # This may break some invariants between tensors. |
| # E.g. in embedding lookup case, one tensor is the length |
| # and another is an indices tensor. |
| return inputs |
| else: |
| raise RuntimeError( |
| f"randomize_input need support tensor of type {inputs.dtype}" |
| ) |
| else: |
| raise RuntimeError( |
| f"randomize_input can not handle input of type {type(inputs)}" |
| ) |
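| |
| # Behavior sketch (illustrative): float tensors are re-drawn, while int64 |
| # tensors pass through unchanged to preserve invariants such as embedding |
| # indices. |
| # |
| #   inputs = (torch.rand(2, 3), torch.arange(4)) |
| #   new_inputs = randomize_input(inputs) |
| #   # new_inputs[0] holds fresh random data; new_inputs[1] is still arange(4). |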
| |
| |
| def maybe_mark_step(args): |
| if args.trace_on_xla: |
| xm.mark_step() |
| |
| |
| def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs): |
| """ |
| Measure speedups over eager. |
| |
| Writes to ./speedups.csv |
| """ |
| # if args.dynamic_shapes: |
| # return speedup_experiment_ds(args, model_iter_fn, model, example_inputs) |
| |
| timings = np.zeros((args.repeat, 2), np.float64) |
| # if we randomize the input, we should also check that the result is correct |
| should_check_result = should_randomize_input = args.randomize_input |
| is_correct = True |
| |
| from torch._inductor.utils import maybe_profile |
| |
| @contextlib.contextmanager |
| def maybe_mark_profile(*args, **kwargs): |
| prof: torch.profiler.profile = kwargs.pop("p", None) |
| mark = kwargs.pop("mark", None) |
| if prof: |
| with torch.profiler.record_function(mark): |
| yield |
| else: |
| yield |
| |
| times = args.iterations_per_run |
| |
| # Use a higher tolerance for XLA since XLA causes numerical instability when |
| # the graph size changes |
| tolerance = args.xla_tolerance if args.trace_on_xla else 1e-4 |
| torch._dynamo.config.repro_tolerance = tolerance |
| |
| with maybe_profile(args.export_profiler_trace) as p: |
| frozen_model_iter_fn = torch._dynamo.run(model_iter_fn) |
| for rep in trange(args.repeat, desc="running benchmark"): |
| inputs = ( |
| randomize_input(copy.deepcopy(example_inputs)) |
| if should_randomize_input |
| else example_inputs |
| ) |
| # We need to call mark_step to materialize the computation |
| # on the randomized inputs. Otherwise the first call using the |
| # inputs will incur a high penalty compared to subsequent ones. |
| maybe_mark_step(args) |
| |
| # interleave the runs to handle frequency scaling and load changes |
| with maybe_mark_profile(p=p, mark="expected"): |
| timings[rep, 0], expected_output = timed( |
| model, |
| model_iter_fn, |
| inputs, |
| return_result=True, |
| times=times, |
| collect_outputs=args.collect_outputs, |
| ) |
| |
| # call mark_step between the 2 calls to make the comparison fair. |
| maybe_mark_step(args) |
| |
| with maybe_mark_profile(p=p, mark="actual"): |
| timings[rep, 1], actual_output = timed( |
| model, |
| frozen_model_iter_fn, |
| inputs, |
| return_result=True, |
| times=times, |
| collect_outputs=args.collect_outputs, |
| ) |
| |
| if should_check_result: |
| is_correct = is_correct and same( |
| expected_output, actual_output, tol=tolerance |
| ) |
| |
| if args.export_profiler_trace: |
| name = args.profiler_trace_name + "_" + model.name + ".json" |
| name = os.path.join(torch._dynamo.config.base_dir, name) |
| p.export_chrome_trace(name) |
| median = np.median(timings, axis=0) |
| speedup = median[0] / median[1] |
| if args.dump_raw_metrics: |
| np.save( |
| f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy", |
| timings, |
| ) |
| |
| first_headers = ["dev", "name", "batch_size"] |
| first_fields = [current_device, current_name, current_batch_size] |
| if "tag" in kwargs: |
| first_headers.append("tag") |
| first_fields.append(kwargs["tag"]) |
| headers = first_headers + ["speedup", "abs_latency"] |
| row = first_fields + [float(speedup), median[1] * 1000] |
| msg = f"{speedup:.3f}x" |
| if args.baseline: |
| headers.extend( |
| [ |
| "baseline", |
| "speedup_vs_baseline", |
| ] |
| ) |
| df = pd.read_csv(args.baseline) |
| try: |
| baseline_speedup = df[df["name"] == current_name]["speedup"].item() |
| row.extend([baseline_speedup, speedup / baseline_speedup]) |
| msg = f"{baseline_speedup:.3f}x -> {speedup:.3f}x [{speedup / baseline_speedup:.3f}x]" |
| except (KeyError, ValueError, ZeroDivisionError): |
| # ValueError: no row matching current_name in the baseline csv |
| row.extend( |
| [ |
| 0.0, |
| 0.0, |
| ] |
| ) |
| if "compilation_latency" in kwargs: |
| headers += [ |
| "compilation_latency", |
| "compression_ratio", |
| "eager_peak_mem", |
| "dynamo_peak_mem", |
| ] |
| row.append(kwargs["compilation_latency"]) |
| row.append(kwargs["compression_ratio"]) |
| row.append(kwargs["eager_peak_mem"]) |
| row.append(kwargs["dynamo_peak_mem"]) |
| if "dynamo_stats" in kwargs: |
| for k, v in kwargs["dynamo_stats"].items(): |
| headers.append(k) |
| row.append(v) |
| output_csv( |
| output_filename, |
| headers, |
| row, |
| ) |
| headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True) |
| assert ( |
| output_filename.find(".csv") > 0 |
| ), f"expected output_filename to be a .csv, but got {output_filename}" |
| output_csv( |
| output_filename[:-4] + "_compilation_metrics.csv", |
| first_headers + headers, |
| first_fields + data, |
| ) |
| return msg |
| |
| |
| def speedup_experiment_ds(args, model_iter_fn, model, example_inputs): |
| """ |
| Run dynamic shapes benchmarks. |
| |
| Requires dynamic shape compatible models, which provide a list of example inputs. |
| |
| Warms up using the first input example and then iterates the inputs, |
| measuring (and expecting minimal) variance between the runtime for different examples. |
| |
| """ |
| timings = np.zeros((args.repeat, len(example_inputs), 2), np.float64) |
| |
| if args.repeat > 5: |
| print( |
| f"\ndynamic shapes experiments are slow, consider setting --repeat less than {args.repeat}\n" |
| ) |
| |
| nwarmup = 4 |
| for rep in range(args.repeat): |
| # Start each rep fresh, e.g. only warmup on example 0 |
| torch._dynamo.reset() |
| # NOTE: relies on `optimize_ctx` from the enclosing module scope; this |
| # experiment is only reachable through the commented-out call in |
| # speedup_experiment above. |
| optimized_model_iter_fn = optimize_ctx(model_iter_fn) |
| for _ in range(nwarmup): |
| optimized_model_iter_fn(model, example_inputs[0]) |
| |
| for input_idx, inputs in enumerate(example_inputs): |
| # interleave the runs to handle frequency scaling and load changes |
| timings[rep, input_idx, 0] = timed( |
| model, model_iter_fn, inputs, return_result=False |
| ) |
| # different from regular speedup_experiment, we _DO_ want to allow recompilation |
| timings[rep, input_idx, 1] = timed( |
| model, optimized_model_iter_fn, inputs, return_result=False |
| ) |
| medians = np.median(timings, axis=0) |
| speedups = list(medians[:, 0] / medians[:, 1]) |
| speedups_mean = np.mean(speedups) |
| speedups_median = np.median(speedups) |
| speedups_var = np.var(speedups) |
| |
| # TODO this x[0] is not going to work in general but bert only has 1 input |
| shapes = [x[0].shape for x in example_inputs] |
| shape_keys = sorted(set(shapes)) |
| shape_speedups = { |
| shape: [ |
| it[1] for it in filter(lambda it: it[0] == shape, zip(shapes, speedups)) |
| ] |
| for shape in shape_keys |
| } |
| output_str = ( |
| f"mean: {speedups_mean:.3f}, median: {speedups_median:.3f}, var: {speedups_var:.3f}" |
| + "\nSpeedups by shape: " |
| + "\n".join( |
| [ |
| f"{shape}: " |
| + ", ".join([f"{speedup: .3g}" for speedup in shape_speedups[shape]]) |
| for shape in shape_keys |
| ] |
| ) |
| ) |
| output_csv( |
| output_filename, |
| ("dev", "name", "batch_size", "speedup mean", "speedup median", "speedup var"), |
| [ |
| current_device, |
| current_name, |
| current_batch_size, |
| speedups_mean, |
| speedups_median, |
| speedups_var, |
| ], |
| ) |
| return output_str |
| |
| |
| def speedup_experiment_onnx( |
| onnx_model_cls: Type[OnnxModelFromTorchScript], |
| args, |
| model_iter_fn, |
| model, |
| example_inputs, |
| **kwargs, |
| ): |
| """ |
| Measure speedups over eager. |
| |
| This function is responsible for the following: |
| 1. Creating the OnnxModel, which handles export and ORT initialization. |
| 2. Creating iobinding with OnnxModel if the device is CUDA, which is essential for perf measurement. |
| 3. Running ORT with OnnxModel. |
| |
| Writes to ./{output_filename}, which should be |
| `pathlib.Path(self.output_dir) / f"{self.compiler}_{suite}_{self.dtype}_{self.mode}_{self.device}_{self.testing}.csv"`. |
| |
| TODO(bowbao): Record export time and export peak memory usage. |
| """ |
| timings = np.zeros((args.repeat, 2), np.float64) |
| is_correct = True |
| should_randomize_input = args.randomize_input |
| times = args.iterations_per_run |
| |
| onnx_model = onnx_model_cls( |
| args.output_directory or ".", model, copy.deepcopy(example_inputs) |
| ) |
| |
| def create_onnx_input_binded_fn( |
| onnx_model: OnnxModelFromTorchScript, pt_inputs, example_outputs |
| ): |
| # Goal is to move the iobinding creation outside of the timer function. |
| iobinding, outputs = onnx_model.create_iobinding(pt_inputs, example_outputs) |
| |
| def onnxrt_model_iter_fn(model, inputs, collect_outputs=True): |
| onnx_model.run_with_iobinding(iobinding, outputs) |
| if collect_outputs: |
| return outputs |
| |
| return onnxrt_model_iter_fn |
| |
| def create_onnx_fn(onnx_model: OnnxModelFromTorchScript, pt_inputs): |
| def onnxrt_model_iter_fn(model, inputs, collect_outputs=True): |
| return onnx_model.run(pt_inputs) |
| |
| return onnxrt_model_iter_fn |
| |
| for rep in range(args.repeat): |
| inputs = ( |
| randomize_input(copy.deepcopy(example_inputs)) |
| if should_randomize_input |
| else example_inputs |
| ) |
| timings[rep, 0], expected_output = timed( |
| model, |
| model_iter_fn, |
| inputs, |
| return_result=True, |
| times=times, |
| collect_outputs=args.collect_outputs, |
| ) |
| |
| if current_device == "cpu": |
| onnxrt_model_iter_fn = create_onnx_fn(onnx_model, inputs) |
| else: |
| onnxrt_model_iter_fn = create_onnx_input_binded_fn( |
| onnx_model, inputs, expected_output |
| ) |
| |
| timings[rep, 1], actual_output = timed( |
| model, |
| onnxrt_model_iter_fn, |
| inputs, |
| return_result=True, |
| times=times, |
| collect_outputs=args.collect_outputs, |
| ) |
| |
| pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue |
| median = np.median(timings, axis=0) |
| speedup = median[0] / median[1] |
| if args.dump_raw_metrics: |
| np.save( |
| f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy", |
| timings, |
| ) |
| |
| headers = ["dev", "name", "batch_size", "speedup", "abs_latency"] |
| row = [ |
| current_device, |
| current_name, |
| current_batch_size, |
| float(speedup), |
| median[1] * 1000, |
| ] |
| if "compilation_latency" in kwargs: |
| headers = headers + ["compilation_latency", "compression_ratio"] |
| row.append(kwargs["compilation_latency"]) |
| row.append(kwargs["compression_ratio"]) |
| |
| output_csv( |
| output_filename, |
| headers, |
| row, |
| ) |
| headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True) |
| assert ( |
| output_filename.find(".csv") > 0 |
| ), f"expected output_filename to be a .csv, but got {output_filename}" |
| output_csv( |
| output_filename[:-4] + "_compilation_metrics.csv", |
| ["dev", "name", "batch_size"] + headers, |
| [current_device, current_name, current_batch_size] + data, |
| ) |
| return format_speedup(speedup, pvalue, is_correct=is_correct) |
| |
| |
| def overhead_experiment(*args, model_iter_fn): |
| """ |
| Measure overheads of TorchDynamo by running with no backend (only |
| eager+FX), and reporting speedup/slowdown over eager. |
| |
| Writes to ./overheads.csv |
| """ |
| return speedup_experiment(*args, model_iter_fn) |
| |
| |
| def print_fx(gm, example_inputs): |
| print(gm.graph) |
| return gm |
| |
| |
| def print_aten_ops(gm, example_inputs): |
| from functorch.compile import aot_module |
| |
| def trace_printer(gm, _): |
| print(gm.graph) |
| return gm |
| |
| return aot_module(gm, fw_compiler=trace_printer, bw_compiler=trace_printer) |
| |
| |
| def baselines(models, model_iter_fn, example_inputs, args): |
| """ |
| Common measurement code across all baseline experiments. |
| """ |
| models = list(models) |
| for idx, (name, model) in enumerate(models): |
| if idx == 0: |
| result0 = model_iter_fn(model, example_inputs) |
| elif model is not None: |
| try: |
| result = model_iter_fn(model, example_inputs) |
| if same(result0, result): |
| continue |
| print(name, "is INCORRECT") |
| except Exception: |
| log.exception("error checking %s", name) |
| models[idx] = (name, None) |
| timings = np.zeros((args.repeat, len(models)), np.float64) |
| timings.fill(1.0e10) |
| for rep in range(args.repeat): |
| for idx, (name, model) in enumerate(models): |
| if model is not None: |
| try: |
| timings[rep, idx] = timed(model, model_iter_fn, example_inputs) |
| except Exception: |
| pass |
| pvalue = [ |
| ttest_ind(timings[:, 0], timings[:, i]).pvalue |
| for i in range(1, timings.shape[1]) |
| ] |
| median = np.median(timings, axis=0) |
| speedup = median[0] / median[1:] |
| for idx, (name, model) in enumerate(models[1:]): |
| if model is None: |
| speedup[idx] = 0.0 |
| result = " ".join( |
| [ |
| format_speedup(s, p, m is not None) |
| for s, p, m in zip(speedup, pvalue, [m for n, m in models[1:]]) |
| ] |
| ) |
| output_csv( |
| output_filename, |
| ("dev", "name", "batch_size") + tuple(n for n, m in models[1:]), |
| [current_device, current_name, current_batch_size] |
| + [f"{x:.4f}" for x in speedup], |
| ) |
| return result |
| |
| |
| def xla(args, model_iter_fn, model, example_inputs): |
| xla_dev = xm.xla_device(devkind=current_device) |
| model_xla = copy.deepcopy(model).to("cpu").to(device=xla_dev) |
| example_inputs_xla = tree_map_only( |
| torch.Tensor, lambda x: x.to("cpu").to(device=xla_dev), example_inputs |
| ) |
| for _ in range(3): # warmup |
| timed(model, model_iter_fn, example_inputs) |
| timed(model_xla, model_iter_fn, example_inputs_xla) |
| timings = np.zeros((args.repeat, 2), np.float64) |
| timings.fill(1.0e10) |
| for rep in range(args.repeat): |
| timings[rep, 0] = timed(model, model_iter_fn, example_inputs) |
| timings[rep, 1] = timed(model_xla, model_iter_fn, example_inputs_xla) |
| |
| pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue |
| time_baseline, time_xla = np.median(timings, axis=0) |
| speedup = time_baseline / time_xla |
| output_csv( |
| output_filename, |
| ("dev", "name", "batch_size", "speedup", "time_baseline", "time_xla"), |
| [ |
| current_device, |
| current_name, |
| current_batch_size, |
| speedup, |
| time_baseline, |
| time_xla, |
| ], |
| ) |
| return format_speedup(speedup, pvalue) |
| |
| |
| def try_script(model, example_inputs): |
| try: |
| return torch.jit.script(model) |
| except Exception: |
| return None |
| |
| |
| def download_retry_decorator(download_fn): |
| """ |
| Decorator function for applying retry logic to a download function. |
| |
| The wrapped function is called up to MAX_DOWNLOAD_ATTEMPTS times, raising an exception if every attempt fails. |
| After each unsuccessful attempt, there is a delay before the next attempt, which increases linearly with the number of tries. |
| |
| Usage: |
| @download_retry_decorator |
| def download_function(model_name: str): |
| # download logic goes here |
| """ |
| |
| @functools.wraps(download_fn) |
| def wrapper(self, *args, **kwargs) -> Any: |
| tries = 0 |
| total_allowed_tries = MAX_DOWNLOAD_ATTEMPTS |
| while tries < total_allowed_tries: |
| try: |
| model = download_fn(self, *args, **kwargs) |
| return model |
| except Exception as e: |
| tries += 1 |
| if tries < total_allowed_tries: |
| wait = tries * 30 |
| print( |
| f"Failed to load model: {e}. Trying again ({tries}/{total_allowed_tries}) after {wait}s" |
| ) |
| time.sleep(wait) |
| else: |
| raise RuntimeError( |
| f"Failed to load model '{args}' with following error(s): {str(e)}." |
| ) |
| |
| return wrapper |
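| |
| # With MAX_DOWNLOAD_ATTEMPTS = 5 the delays between the five attempts are |
| # 30s, 60s, 90s, and 120s (linear backoff); a RuntimeError is raised after |
| # the final failure. |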
| |
| |
| class OnnxModelFromTorchScript: |
| """TorchScript based onnx export. `torch.onnx.export` |
| |
| TODO(bowbao): |
| * large model export failed. |
| Onnx model is larger than 2GB, but the exporter makes its decision based on the pt model |
| size, which is smaller than 2GB. |
| * OOM on slightly larger model. |
| Both the pt model and the ORT inference session are on gpu. An attempt has been made to move |
| ORT to cuda:1, however ORT perf dropped significantly. |
| For now everything runs with batch_size 1, as set in the launch script. |
| """ |
| |
| TORCH_TO_NUMPY_DTYPE = { |
| torch.float16: np.float16, |
| torch.float32: np.float32, |
| torch.float64: np.float64, |
| torch.uint8: np.uint8, |
| torch.int8: np.int8, |
| torch.int16: np.int16, |
| torch.int32: np.int32, |
| torch.int64: np.longlong, |
| torch.bool: np.bool_, |
| } |
| |
| def __init__(self, output_directory, model, example_inputs): |
| self.model_path = self._generate_onnx_model_path(output_directory) |
| self._export( |
| model, |
| example_inputs, |
| self.model_path, |
| opset_version=17, |
| do_constant_folding=False, |
| verbose=False, |
| ) |
| self.onnx_session = self._init_ort_session(self.model_path) |
| |
| def _generate_onnx_model_path( |
| self, output_directory: str, onnx_model_folder_name: str = "bench_onnx_models" |
| ) -> str: |
| # Hack to get model name. |
| from torch._functorch import aot_autograd |
| |
| model_name = aot_autograd.model_name |
| model_path = pathlib.Path(output_directory, onnx_model_folder_name, model_name) |
| if model_path.exists() and model_path.is_dir(): |
| shutil.rmtree(model_path) |
| model_path.mkdir(parents=True, exist_ok=True) |
| return str(model_path / "model.onnx") |
| |
| def _export(self, model, example_inputs, output_path: str, /, **kwargs) -> None: |
| # Hack for huggingface models (kwargs only). |
| if isinstance(example_inputs, dict): |
| |
| class WrapperModel(torch.nn.Module): |
| def __init__(self, model, keys): |
| super().__init__() |
| self.model = model |
| self.keys = keys |
| |
| def forward(self, *args): |
| return self.model(**dict(zip(self.keys, args))) |
| |
| model = WrapperModel(model, list(example_inputs.keys())) |
| |
| torch.onnx.export( |
| model, |
| self.format_pt_inputs(example_inputs), |
| output_path, |
| **kwargs, |
| ) |
| |
| def _init_ort_session(self, model_path: str): |
| import onnxruntime |
| |
| if current_device == "cpu": |
| ort_providers = ["CPUExecutionProvider"] |
| else: |
| # NOTE(bowbao): Reduce OOM by running ORT on another gpu. |
| # TODO(bowbao): This works to avoid OOM, but performance is surprisingly very bad. |
| # cuda_provider_options = { |
| # "device_id": 1 if torch.cuda.device_count() > 1 else 0, |
| # } |
| # ort_providers = [("CUDAExecutionProvider", cuda_provider_options)] |
| ort_providers = ["CUDAExecutionProvider"] |
| |
| ort_session = onnxruntime.InferenceSession( |
| model_path, |
| providers=ort_providers, |
| ) |
| return ort_session |
| |
| def format_pt_inputs(self, pt_inputs): |
| # NOTE(bowbao): For huggingface benchmark, pt_inputs are formatted as dictionary, |
| # and consumed like `model(**pt_inputs)`. |
| # For other benchmarks, pt_inputs are formatted as tuple and consumed |
| # like `model(*pt_inputs)`. |
| if isinstance(pt_inputs, dict): |
| pt_inputs = list(pt_inputs.values()) |
| if isinstance(pt_inputs, torch.Tensor): |
| pt_inputs = (pt_inputs,) |
| return tuple(arg.contiguous() for arg in pt_inputs) |
| |
| def format_pt_outputs(self, pt_outputs): |
| if isinstance(pt_outputs, torch.Tensor): |
| pt_outputs = (pt_outputs,) |
| |
| pt_outputs, _ = pytree.tree_flatten(pt_outputs) |
| |
| # Hack for huggingface model outputs |
| try: |
| from transformers import modeling_outputs |
| except ImportError: |
| pass |
| else: |
| |
| def _to_tuple(x): |
| if isinstance(x, modeling_outputs.ModelOutput): |
| return x.to_tuple() |
| return x |
| |
| pt_outputs = pytree.tree_map(_to_tuple, pt_outputs) |
| pt_outputs, _ = pytree.tree_flatten(pt_outputs) |
| |
| return pt_outputs |
| |
| def create_outputs(self, *example_outputs): |
| return tuple(torch.empty_like(x) for x in example_outputs) |
| |
| def create_iobinding(self, pt_inputs, example_outputs): |
| pt_inputs = self.format_pt_inputs(pt_inputs) |
| example_outputs = self.format_pt_outputs(example_outputs) |
| |
| iobinding = self.onnx_session.io_binding() |
| args = [arg.contiguous() for arg in pt_inputs] |
| for ort_input, arg in zip(self.onnx_session.get_inputs(), args): |
| # NOTE: Small hack to reduce OOM issue by running ORT on another device. |
| # Disabled due to ORT perf regression. |
| # if torch.cuda.device_count() > 1: |
| # arg = arg.detach().to("cuda:1") |
| device = arg.device |
| iobinding.bind_input( |
| ort_input.name, |
| device.type, |
| device.index or 0, |
| self.TORCH_TO_NUMPY_DTYPE[arg.dtype], |
| arg.size(), |
| arg.data_ptr(), |
| ) |
| |
| outputs = self.create_outputs(*example_outputs) |
| for ort_output, output in zip(self.onnx_session.get_outputs(), outputs): |
| # if torch.cuda.device_count() > 1: |
| # output = output.detach().to("cuda:1") |
| device = output.device |
| iobinding.bind_output( |
| ort_output.name, |
| device.type, |
| device.index or 0, |
| self.TORCH_TO_NUMPY_DTYPE[output.dtype], |
| output.size(), |
| output.data_ptr(), |
| ) |
| return iobinding, outputs |
| |
| def run_with_iobinding(self, iobinding, outputs): |
| # 'outputs' are empty torch tensors bound to 'iobinding'. |
| self.onnx_session.run_with_iobinding(iobinding) |
| return outputs |
| |
| def run(self, pt_inputs): |
| # NOTE: For CUDA performance testing, use `run_with_iobinding` to exclude memory |
| # copying overhead for inputs/outputs between cpu and gpu. |
| # Otherwise perf number is inaccurate. |
| pt_inputs = self.format_pt_inputs(pt_inputs) |
| onnx_inputs = { |
| ort_input.name: pt_input.cpu().numpy() |
| for ort_input, pt_input in zip(self.onnx_session.get_inputs(), pt_inputs) |
| } |
| ort_outputs = self.onnx_session.run(None, onnx_inputs) |
| pt_outputs = [ |
| torch.from_numpy(ort_output).to(current_device) |
| for ort_output in ort_outputs |
| ] |
| if len(pt_outputs) == 1: |
| return pt_outputs[0] |
| return pt_outputs |
| |
| |
| class OnnxModelFromDynamo(OnnxModelFromTorchScript): |
| """Dynamo and Fx based export. `torch.onnx.dynamo_export`.""" |
| |
| def __init__(self, output_directory, model, example_inputs): |
| self.model_path = self._generate_onnx_model_path( |
| output_directory, "bench_dynamo_onnx_model" |
| ) |
| self._export_output = self._export(model, example_inputs, self.model_path) |
| self.onnx_session = self._init_ort_session(self.model_path) |
| |
| def _normalize_bench_inputs( |
| self, example_inputs |
| ) -> Tuple[Tuple[Any], Mapping[str, Any]]: |
| # NOTE(bowbao): For huggingface benchmark, example_inputs are formatted as dictionary, |
| # and consumed like `model(**example_inputs)`. |
| # For other benchmarks, example_inputs are formatted as tuple and consumed |
| # like `model(*example_inputs)`. |
| if isinstance(example_inputs, dict): |
| return (), example_inputs |
| else: |
| return example_inputs, {} |
| |
| def _export( |
| self, model, example_inputs, output_path: str |
| ) -> torch.onnx.ExportOutput: |
| example_args, example_kwargs = self._normalize_bench_inputs(example_inputs) |
| options = torch.onnx.ExportOptions() |
| export_output = torch.onnx.dynamo_export( |
| model, *example_args, **example_kwargs, export_options=options |
| ) |
| |
| export_output.save(output_path) |
| return export_output |
| |
| def format_pt_inputs(self, pt_inputs): |
| pt_args, pt_kwargs = self._normalize_bench_inputs(pt_inputs) |
| return self._export_output.adapt_torch_inputs_to_onnx(*pt_args, **pt_kwargs) |
| |
| def format_pt_outputs(self, pt_outputs): |
| return self._export_output.adapt_torch_outputs_to_onnx(pt_outputs) |
| |
| |
| def optimize_onnx_ctx( |
| output_directory: str, |
| onnx_model_cls: Type[OnnxModelFromTorchScript], |
| run_n_iterations: Callable, |
| ) -> Callable: |
| # NOTE(bowbao): This function creates and returns the onnx version of 'run_n_iterations', |
| # which does the following: |
| # 1. Export and cache model. |
| # 2. Create iobinding for ORT. |
| # 3. Run ORT for n iterations. |
| onnx_model: Optional[OnnxModelFromTorchScript] = None |
| |
| def run_n_iterations_onnx(model, inputs, n=2): |
| from _onnx import reporter |
| from torch.onnx._internal import exporter |
| from torch.onnx._internal.fx import diagnostics |
| |
| # NOTE(bowbao): Capture all export & ort errors and diagnostics. |
| # Serialize to csv, to be parsed and summarized later by '._onnx/reporter.py'. |
| # TODO: Accuracy mismatch is not reported here in csv. |
| assert ( |
| output_filename.find(".csv") > 0 |
| ), f"expected output_filename to be a .csv, but got {output_filename}" |
| output_error_filename = output_filename[:-4] + "_export_error.csv" |
| parser = reporter.ExportErrorParser( |
| current_device, current_name, current_batch_size |
| ) |
| try: |
| nonlocal onnx_model |
| if onnx_model is None: |
| onnx_model = onnx_model_cls( |
| output_directory, model, copy.deepcopy(inputs) |
| ) |
| |
| for _ in range(n - 1): |
| onnx_model.run(inputs) |
| return onnx_model.run(inputs) |
| except exporter.OnnxExporterError as e: |
| # `torch.onnx.dynamo_export` raises error that encloses diagnostics. |
| diagnostic_context = e.diagnostic_context |
| for parsed_error in parser.parse_diagnostic_context(diagnostic_context): |
| output_csv( |
| output_error_filename, parsed_error.headers, parsed_error.row |
| ) |
| |
| # Check also the raw exception that caused export failure. |
| # Skip if it is already analyzed by diagnostics. |
| cause_of_exception = e.__cause__ |
| if not isinstance( |
| cause_of_exception, diagnostics.RuntimeErrorWithDiagnostic |
| ): |
| parsed_error = parser.parse_exception(cause_of_exception) |
| output_csv( |
| output_error_filename, parsed_error.headers, parsed_error.row |
| ) |
| raise |
| except Exception as e: |
| # `torch.onnx.export` errors. |
| # ORT errors. |
| parsed_error = parser.parse_exception(e) |
| output_csv(output_error_filename, parsed_error.headers, parsed_error.row) |
| raise |
| |
| return run_n_iterations_onnx |
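| |
| # Usage sketch (illustrative; `runner` is a placeholder BenchmarkRunner). The |
| # wrapped callable's iteration count is fixed at n=2 inside |
| # run_n_iterations_onnx, so the third argument is currently unused: |
| # |
| #   onnx_fn = optimize_onnx_ctx(".", OnnxModelFromDynamo, runner.run_n_iterations) |
| #   outputs = onnx_fn(model, example_inputs) |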
| |
| |
| def read_batch_size_from_file(args, filename, model_name): |
| batch_size = None |
| if os.path.exists("benchmarks"): |
| filename = os.path.join("benchmarks", filename) |
| assert os.path.exists(filename), filename |
| with open(filename, "r") as f: |
| lines = f.readlines() |
| lines = [i.split(",") for i in lines if len(i.strip()) > 0] |
| for val in lines: |
| cur_name, b = val |
| if model_name == cur_name: |
| batch_size = int(b) |
| if batch_size is None: |
| log.warning("Could not find batch size for %s", model_name) |
| elif batch_size == -1: |
| raise RuntimeError( |
| f"Batch size is unset for {model_name} in {args.batch_size_file}" |
| ) |
| print(f"batch size: {batch_size}") |
| return batch_size |
| |
| |
| class TimeOutException(Exception): |
| pass |
| |
| |
| def alarm_handler(signum, frame): |
| raise TimeOutException() |
| |
| |
| def exit_after(s): |
| """ |
| Decorator to raise TimeOutException if fn takes more than s seconds |
| to run. |
| """ |
| |
| def outer(fn): |
| def inner(*args, **kwargs): |
| signal.signal(signal.SIGALRM, alarm_handler) |
| signal.alarm(s) |
| try: |
| result = fn(*args, **kwargs) |
| finally: |
| signal.alarm(0) |
| return result |
| |
| return inner |
| |
| return outer |
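| |
| # Usage sketch (illustrative; `slow_fn` is a placeholder): |
| # |
| #   @exit_after(60) |
| #   def slow_fn(): |
| #       ...  # raises TimeOutException if this takes longer than 60 seconds |
| # |
| # Note: this relies on SIGALRM, so it only works on the main thread of a |
| # POSIX system. |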
| |
| |
| def get_peak_memory(): |
| return torch.cuda.max_memory_allocated() / 10**9 |
| |
| |
| def null_experiment(args, model_iter_fn, model, example_inputs): |
| """ |
| A no-op experiment useful for making sure TorchBench alone works properly. |
| """ |
| |
| return [] |
| |
| |
| def cast_to(dtype, model, inputs): |
| # cast model and inputs to fp16 |
| if dtype == torch.float16: |
| model = model.half() |
| else: |
| model = model.to(dtype) |
| |
| inputs = tree_map( |
| lambda x: x.to(dtype) |
| if isinstance(x, torch.Tensor) and x.is_floating_point() |
| else x, |
| inputs, |
| ) |
| return model, inputs |
| |
| |
| def cast_to_bf16(model, inputs): |
| return cast_to(torch.bfloat16, model, inputs) |
| |
| |
| def cast_to_fp16(model, inputs): |
| return cast_to(torch.float16, model, inputs) |
| |
| |
| def cast_to_fp64(model, inputs): |
| return cast_to(torch.float64, model, inputs) |
| |
| |
| def cast_to_fp32(model, inputs): |
| return cast_to(torch.float32, model, inputs) |
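| |
| # Usage sketch (illustrative): only floating-point tensors in the inputs are |
| # cast; integer tensors (e.g. token ids) keep their dtype. |
| # |
| #   model, inputs = cast_to_bf16(model, (torch.rand(8, 3), torch.arange(8))) |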
| |
| |
| def reset_rng_state(use_xla=False): |
| torch.manual_seed(1337) |
| random.seed(1337) |
| np.random.seed(1337) |
| if use_xla: |
| xm.set_rng_state(1337, str(xm.xla_device())) |
| |
| |
| class DummyGradScaler: |
| def scale(self, loss): |
| return loss |
| |
| |
| def get_dynamo_stats(): |
| # TODO: consider deepcopy'ing the entire counters struct and |
| # adding a helper to do subtraction on it |
| return collections.Counter( |
| { |
| "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"], |
| "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"], |
| "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()), |
| # NB: The plus removes zero counts |
| "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]), |
| } |
| ) |
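| |
| # Sketch of the delta pattern used by check_accuracy below (illustrative): |
| # |
| #   start = get_dynamo_stats() |
| #   ...  # run the compiled model |
| #   end = get_dynamo_stats() |
| #   end.subtract(start)  # `end` now holds the per-run deltas |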
| |
| |
| def maybe_fresh_cache(fn, is_cold_start): |
| def inner(*args, **kwargs): |
| cache_minder = contextlib.nullcontext() |
| if is_cold_start: |
| cache_entries = {} |
| cache_minder = fresh_inductor_cache(cache_entries) |
| |
| try: |
| with cache_minder: |
| return fn(*args, **kwargs) |
| finally: |
| dump_cache = False |
| if dump_cache and is_cold_start: |
| output_csv( |
| output_filename[:-4] + "_triton_cache.csv", |
| ["dev", "name", "batch_size", "triton_cache"], |
| [ |
| current_device, |
| current_name, |
| current_batch_size, |
| cache_entries, |
| ], |
| ) |
| |
| return inner |
| |
| |
| @contextmanager |
| def maybe_init_distributed(should_init_distributed, port="6789", rank=0, world_size=1): |
| # To avoid multiple inheritance from _dynamo.test_case.TestCase and MultiProcessTestCase, |
| # just manually implement the most important part of the dynamo behavior to reset/clear. |
| try: |
| if should_init_distributed: |
| torch.cuda.set_device(rank) |
| os.environ["MASTER_ADDR"] = "localhost" |
| os.environ["MASTER_PORT"] = port |
| torch.distributed.init_process_group( |
| "nccl", rank=rank, world_size=world_size |
| ) |
| yield |
| finally: |
| if should_init_distributed: |
| torch.distributed.destroy_process_group() |
| |
| |
| class BenchmarkRunner: |
| def __init__(self): |
| self.model_iter_fn = None |
| self.grad_scaler = DummyGradScaler() |
| self.autocast = contextlib.nullcontext |
| self.optimizer = None |
| self._args = None |
| |
| def setup_amp(self): |
| if self.args.only in self.fp32_only_models: |
| return |
| |
| if self.args.amp and self.args.devices == ["cuda"]: |
| # AMP training can lead to small loss values which can underflow |
| # gradient values, resulting in zero gradients. To solve this |
| # problem, PyTorch introduces GradScaler. GradScaler is a stateful |
| # structure that scales the loss values to prevent underflow. Loss |
| # values are big at the beginning of training (therefore not |
| # requiring scaling), while loss values tend to be small as the |
| # network starts getting better (requiring scaling). GradScaler |
| # manages all of this fine-tuning, checking whether gradients are |
| # turning to inf and discarding such batches. |
| |
| # Since we are not running long iterations, the default init_scale of |
| # 65536 is going to turn all gradients to inf. Therefore, we just use |
| # an init_scale of 2.0 for benchmarking purposes. |
| |
| # Disabling Gradscaler because |
| # 1) Benchmark setup runs 2 iterations of fwd-bwd. So, not useful. |
| # 2) Current setup shares grad_scaler for eager and dynamo model, |
| # which is bad as Gradscaler has state and can adjust the scaling |
| # factor between eager and dynamo run, making accuracy check |
| # harder. |
| # self.grad_scaler = torch.cuda.amp.GradScaler(init_scale=2.0) |
| self.autocast = torch.cuda.amp.autocast |
| elif (self.args.bfloat16 or self.args.amp) and self.args.devices == ["cpu"]: |
| self.autocast = torch.cpu.amp.autocast |
| |
| def init_optimizer(self, name, device, params): |
| if device == "cuda" and self.args.training and name not in CI_SKIP_OPTIMIZER: |
| self.optimizer = torch.optim.SGD(params, lr=0.01) |
| else: |
| self.optimizer = None |
| |
| @property |
| def args(self): |
| return self._args |
| |
| @args.setter |
| def args(self, args): |
| self._args = args |
| |
| @property |
| def skip_models(self): |
| return set() |
| |
| @property |
| def skip_models_for_cuda(self): |
| return set() |
| |
| @property |
| def skip_models_for_cpu(self): |
| return set() |
| |
| @property |
| def slow_models(self): |
| return set() |
| |
| @property |
| def very_slow_models(self): |
| return set() |
| |
| @property |
| def non_deterministic_models(self): |
| return set() |
| |
| @property |
| def fp32_only_models(self): |
| return set() |
| |
| @property |
| def force_amp_for_fp16_bf16_models(self): |
| return set() |
| |
| @property |
| def skip_not_suitable_for_training_models(self): |
| return set() |
| |
| @property |
| def failing_torchinductor_models(self): |
| return set() |
| |
| @property |
| def failing_fx2trt_models(self): |
| return set() |
| |
| @property |
| def skip_accuracy_checks_large_models_dashboard(self): |
| return set() |
| |
| @property |
| def skip_accuracy_check_as_eager_non_deterministic(self): |
| return set() |
| |
| def get_tolerance_and_cosine_flag(self, is_training, current_device, name): |
| raise NotImplementedError() |
| |
| @property |
| def equal_nan(self): |
| equal_nan = True |
| if self.args.float32: |
| equal_nan = False |
| return equal_nan |
| |
| def iter_models(self, args): |
| for model_name in self.iter_model_names(args): |
| for device in args.devices: |
| try: |
| yield self.load_model( |
| device, |
| model_name, |
| batch_size=args.batch_size, |
| ) |
| except NotImplementedError: |
| continue # bad benchmark implementation |
| |
| def deepcopy_model(self, model): |
| return copy.deepcopy(model) |
| |
| def cast_based_on_args(self, model, example_inputs): |
| if self.args.float32 or self.args.only in self.fp32_only_models: |
| if not self.args.float32: |
| log.warning("Model %s supports float32 only", self.args.only) |
| model, example_inputs = cast_to_fp32(model, example_inputs) |
| elif self.args.float16: |
| if self.args.only in self.force_amp_for_fp16_bf16_models: |
| log.warning( |
| "Model %s does not support float16, running with amp instead", |
| self.args.only, |
| ) |
| self.args.amp = True |
| self.setup_amp() |
| else: |
| model, example_inputs = cast_to_fp16(model, example_inputs) |
| elif self.args.bfloat16: |
| if self.args.only in self.force_amp_for_fp16_bf16_models: |
| log.warning( |
| "Model %s does not support bfloat16, running with amp instead", |
| self.args.only, |
| ) |
| self.args.amp = True |
| self.setup_amp() |
| else: |
| model, example_inputs = cast_to_bf16(model, example_inputs) |
| |
| return model, example_inputs |
| |
| def validate_model(self, model, example_inputs): |
| """ |
| Runs the eager model with example inputs to ensure that eager passes. |
| """ |
| model = self.deepcopy_model(model) |
| example_inputs = clone_inputs(example_inputs) |
| model, example_inputs = self.cast_based_on_args(model, example_inputs) |
| try: |
| self.model_iter_fn(model, example_inputs) |
| except Exception as e: |
| raise NotImplementedError("Eager model failed to run") from e |
| |
| def maybe_cast(self, model, example_inputs): |
| model = self.deepcopy_model(model) |
| example_inputs = clone_inputs(example_inputs) |
| model, example_inputs = self.cast_based_on_args(model, example_inputs) |
| return model, example_inputs |
| |
| def decay_batch_exp(self, batch_size, factor=0.5, divisor=2): |
| out_batch_size = batch_size * factor |
| if out_batch_size > divisor: |
| out_batch_size = (out_batch_size + 1) // divisor * divisor |
| else: |
| out_batch_size = batch_size - 1 |
| return max(0, int(out_batch_size)) |
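| |
| # Worked example (illustrative): with the defaults factor=0.5 and divisor=2, |
| # repeated calls decay 1024 -> 512 -> 256 -> ...; once the batch size is at |
| # or below the divisor it steps down by one (2 -> 1 -> 0). |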
| |
| def batch_size_finder(self, device, model_name, initial_batch_size=1024): |
| batch_size = initial_batch_size |
| while batch_size >= 1: |
| torch.cuda.empty_cache() |
| try: |
| device, name, model, example_inputs, _ = self.load_model( |
| device, |
| model_name, |
| batch_size, |
| ) |
| self.model_iter_fn(model, example_inputs) |
| return batch_size |
| except RuntimeError as e: |
| error_str = str(e) |
| if "channels_last" in error_str: |
| break |
| batch_size = self.decay_batch_exp(batch_size) |
| return 1 |
| |
| def run_n_iterations(self, mod, inputs): |
| n = self.args.iterations |
| for _ in range(n - 1): |
| self.model_iter_fn(mod, inputs, collect_outputs=False) |
| return self.model_iter_fn(mod, inputs, collect_outputs=True) |
| |
| def optimizer_zero_grad(self, mod): |
| if self.optimizer is not None: |
| self.optimizer.zero_grad(True) |
| else: |
| mod.zero_grad(True) |
| |
| def optimizer_step(self): |
| if self.optimizer is not None: |
| self.optimizer.step() |
| |
| def get_benchmark_indices(self, length): |
| start = self._args.partition_id * (length // self._args.total_partitions) |
| end = ( |
| (self._args.partition_id + 1) * (length // self._args.total_partitions) |
| if self._args.partition_id < self._args.total_partitions - 1 |
| else length |
| ) |
| return start, end |
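
    # Comment-only example with hypothetical numbers: with length=10 and
    # total_partitions=3, the partitions cover [0, 3), [3, 6), and [6, 10);
    # the last partition absorbs the remainder.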
| |
| def check_accuracy( |
| self, name, model, example_inputs, optimize_ctx, experiment, tag |
| ): |
| """ |
| Checks accuracy. |
| 1) Collect the outputs with fp64 datatype. This is useful for error checking. |
| 2) Checks if eager itself has variations. |
| """ |
| start_stats = get_dynamo_stats() |
| |
| def record_status(accuracy_status, dynamo_start_stats): |
| """ |
| Records the status in the csv file |
| """ |
| if current_name in self.non_deterministic_models: |
| if accuracy_status in ( |
| "pass", |
| "eager_two_runs_differ", |
| "fail_accuracy", |
| ): |
| accuracy_status = "pass" |
| |
| headers = ["dev", "name", "batch_size", "accuracy"] |
| fields = [current_device, current_name, current_batch_size, accuracy_status] |
| |
| if tag is not None: |
| headers.insert(3, "tag") |
| fields.insert(3, tag) |
| |
| dynamo_stats = get_dynamo_stats() |
| dynamo_stats.subtract(dynamo_start_stats) |
| for k, v in dynamo_stats.items(): |
| headers.append(k) |
| fields.append(v) |
| |
| output_csv(output_filename, headers, fields) |
| return accuracy_status |
| |
| if name in self.skip_accuracy_checks_large_models_dashboard: |
| return record_status("pass_due_to_skip", dynamo_start_stats=start_stats) |
| |
| def deepcopy_and_maybe_ddp(model): |
| model = self.deepcopy_model(model) |
| if self.args.ddp: |
| assert ( |
| torch.distributed.is_available() |
| ), "Can't use DDP without a distributed enabled build" |
| from torch.nn.parallel import DistributedDataParallel as DDP |
| |
| model = DDP(model, find_unused_parameters=True) |
| elif self.args.fsdp: |
| assert ( |
| torch.distributed.is_available() |
| ), "Can't use FSDP without a distributed enabled build" |
| from torch.distributed.fsdp import FullyShardedDataParallel as FSDP |
| |
| model = FSDP(model, use_orig_params=True) |
| if torch._inductor.config.triton.cudagraphs: |
| log.warning("Disabling cudagraphs for FSDP compatibility") |
| torch._inductor.config.triton.cudagraphs = False |
| return model |
| |
| # Collect the fp64 reference outputs to be used later for accuracy checking. |
| fp64_outputs = None |
| try: |
| model_fp64, inputs_fp64 = cast_to_fp64( |
| deepcopy_and_maybe_ddp(model), |
| clone_inputs(example_inputs), |
| ) |
| self.init_optimizer(name, current_device, model_fp64.parameters()) |
| fp64_outputs = self.run_n_iterations(model_fp64, inputs_fp64) |
| except Exception: |
| log.warning( |
| "fp64 golden ref were not generated for %s. Setting accuracy check to cosine", |
| name, |
| ) |
| self.args.cosine = True |
| fp64_outputs = None |
| |
| tolerance, cos_similarity = self.get_tolerance_and_cosine_flag( |
| self.args.training, current_device, name |
| ) |
| |
        # Cast the model to float16/bfloat16/float32 as necessary
| model, example_inputs = self.maybe_cast(model, example_inputs) |
| accuracy_status = "pass" |
| |
| with self.pick_grad(name, self.args.training): |
| # Get results of native pytorch |
| reset_rng_state() |
| try: |
| model_copy = deepcopy_and_maybe_ddp(model) |
| self.init_optimizer(name, current_device, model_copy.parameters()) |
| correct_result = self.run_n_iterations( |
| model_copy, clone_inputs(example_inputs) |
| ) |
| except Exception as e: |
| accuracy_status = ( |
| "eager_1st_run_OOM" |
| if isinstance(e, torch.cuda.OutOfMemoryError) |
| else "eager_1st_run_fail" |
| ) |
| log.exception(e) |
| return record_status(accuracy_status, dynamo_start_stats=start_stats) |
| |
| # Rerun native pytorch |
| reset_rng_state() |
| try: |
| model_copy = deepcopy_and_maybe_ddp(model) |
| self.init_optimizer(name, current_device, model_copy.parameters()) |
| correct_rerun_result = self.run_n_iterations( |
| model_copy, clone_inputs(example_inputs) |
| ) |
| except Exception as e: |
| accuracy_status = ( |
| "eager_2nd_run_OOM" |
| if isinstance(e, torch.cuda.OutOfMemoryError) |
| else "eager_2nd_run_fail" |
| ) |
| return record_status(accuracy_status, dynamo_start_stats=start_stats) |
| |
            # Two eager runs should produce exactly the same result
| is_same = True |
| try: |
| if ( |
| name not in self.skip_accuracy_check_as_eager_non_deterministic |
| and not same( |
| correct_result, |
| correct_rerun_result, |
| fp64_ref=None, |
| cos_similarity=False, |
| tol=0, |
| equal_nan=self.equal_nan, |
| ) |
| ): |
| is_same = False |
            except Exception:
                # Sometimes torch.allclose may throw RuntimeError
| is_same = False |
| |
| if not is_same: |
| accuracy_status = "eager_two_runs_differ" |
| return record_status(accuracy_status, dynamo_start_stats=start_stats) |
| |
| correct_rerun_result = None |
| |
| # Run with Dynamo |
| reset_rng_state() |
| torch._dynamo.reset() |
| try: |
| model_copy = deepcopy_and_maybe_ddp(model) |
| self.init_optimizer(name, current_device, model_copy.parameters()) |
| if self.args.export: |
                    # TorchBench and TIMM use list example_inputs;
                    # HF uses dict example_inputs
| if isinstance(example_inputs, dict): |
| raise RuntimeError( |
| "expect example_inputs as list/tuple, but got dict. need to support kwargs in torch._export.export" |
| ) |
                    # Apply export on the module directly; no need for n iterations.
                    # The logic should be the same as self.model_iter_fn (forward_pass).
| with self.autocast(): |
| optimized_model_iter_fn = optimize_ctx( |
| model_copy, example_inputs |
| ) |
| new_result = optimized_model_iter_fn(*example_inputs) |
| else: |
| optimized_model_iter_fn = optimize_ctx(self.run_n_iterations) |
| new_result = optimized_model_iter_fn(model_copy, example_inputs) |
| except Exception as e: |
| log.exception(e) |
| print( |
| "TorchDynamo optimized model failed to run because of following error" |
| ) |
| accuracy_status = ( |
| "OOM" |
| if isinstance(e, torch.cuda.OutOfMemoryError) |
| else "fail_to_run" |
| ) |
| return record_status(accuracy_status, dynamo_start_stats=start_stats) |
| |
| if name in self.skip_accuracy_check_as_eager_non_deterministic: |
| return record_status("pass_due_to_skip", dynamo_start_stats=start_stats) |
| |
| # Workaround for ONNX for non-tensor outputs |
            if current_onnx_compiler in ("torchscript", "dynamo"):
| from _onnx import patch |
| |
| ( |
| correct_result, |
| new_result, |
| fp64_outputs, |
| ) = patch.patch_non_tensor_outputs( |
| correct_result, new_result, fp64_outputs |
| ) |
| |
| try: |
| if not same( |
| correct_result, |
| new_result, |
| fp64_outputs, |
| equal_nan=self.equal_nan, |
| cos_similarity=cos_similarity, |
| tol=tolerance, |
| ): |
| is_same = False |
            except Exception:
                # Sometimes torch.allclose may throw RuntimeError
| is_same = False |
| |
| if not is_same: |
| if self.args.skip_accuracy_check: |
| accuracy_status = "pass_due_to_skip" |
| else: |
| accuracy_status = "fail_accuracy" |
| return record_status(accuracy_status, dynamo_start_stats=start_stats) |
| |
| return record_status(accuracy_status, dynamo_start_stats=start_stats) |
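
    # Statuses that check_accuracy() can record (collected from the code above):
    # "pass", "pass_due_to_skip", "eager_1st_run_OOM"/"eager_1st_run_fail",
    # "eager_2nd_run_OOM"/"eager_2nd_run_fail", "eager_two_runs_differ",
    # "OOM", "fail_to_run", and "fail_accuracy".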
| |
| def check_tolerance( |
| self, name, model, example_inputs, optimize_ctx, base_device="cpu" |
| ): |
| """ |
| Checks tolerance based on https://pytorch.org/docs/stable/generated/torch.allclose.html. |
| """ |
| tolerance_status = "pass" |
| if name in self.skip_accuracy_checks_large_models_dashboard: |
| tolerance_status = "pass_due_to_skip" |
| return tolerance_status |
        # Cast the model to float16/bfloat16/float32 as necessary
| model, example_inputs = self.maybe_cast(model, example_inputs) |
| |
| with self.pick_grad(name, self.args.training): |
| # Get results of native pytorch |
| reset_rng_state() |
| model_copy = copy.deepcopy(model) |
| model_copy = model_copy.to(base_device) |
| example_inputs_copy = copy.deepcopy(example_inputs) |
| example_inputs_copy = tree_map( |
| lambda x: x.to(base_device), example_inputs_copy |
| ) |
| self.init_optimizer(name, base_device, model_copy.parameters()) |
| correct_result = self.run_n_iterations(model_copy, example_inputs_copy) |
| |
| # Run with Dynamo |
            # Sometimes CI fails with a random Triton compilation failure, which we skip for now
| # TODO: revisit this after switching to new Triton runtime |
| reset_rng_state() |
| torch._dynamo.reset() |
| try: |
| self.init_optimizer(name, current_device, model.parameters()) |
| optimized_model_iter_fn = optimize_ctx(self.run_n_iterations) |
| new_result = optimized_model_iter_fn(model, example_inputs) |
| except Exception as e: |
| log.exception(e) |
| if ( |
| self.args.ci |
| and isinstance(e, BackendCompilerFailed) |
| and ( |
| "Internal Triton PTX codegen error" in str(e) |
| or "cubin" in str(e) |
| ) |
| ): |
| return "pass_due_to_skip" |
| else: |
| print( |
| "TorchDynamo optimized model failed to run because of following error" |
| ) |
| return "fail_to_run" |
| |
| def dump_max_mean_values(tol, ref, res): |
| if isinstance(ref, (list, tuple, torch.nn.ParameterList, torch.Size)): |
| for refi, resi in zip(ref, res): |
| dump_max_mean_values(tol, refi, resi) |
| elif isinstance(ref, dict): |
| for k in ref.keys(): |
| dump_max_mean_values(tol, ref[k], res[k]) |
| elif isinstance(ref, torch.Tensor): |
| res = res.to(base_device) |
| t = torch.abs(ref - res) / (1 + torch.abs(ref)) |
| tol.append(t.flatten().to(torch.float32)) |
| return tol |
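
            # dump_max_mean_values() accumulates the elementwise relative error
            # |ref - res| / (1 + |ref|); the +1 in the denominator keeps the
            # ratio well-behaved when ref is near zero.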
| |
| tol = [] |
| dump_max_mean_values(tol, correct_result, new_result) |
            tol = torch.cat(tol)
            max_value = torch.max(tol)
            mean_value = torch.mean(tol)
            std_value = torch.std(tol)
            headers = ["dev", "name", "batch_size", "max", "mean", "std"]
            fields = [
                current_device,
                current_name,
                current_batch_size,
                max_value.item(),
                mean_value.item(),
                std_value.item(),
            ]
| output_csv(output_filename, headers, fields) |
| return tolerance_status |
| |
| def run_performance_test( |
| self, name, model, example_inputs, optimize_ctx, experiment, tag=None |
| ): |
| if self.args.xla: |
| with self.pick_grad(name, self.args.training): |
| return experiment(*self.maybe_cast(model, example_inputs)) |
| |
| def warmup(fn, model, example_inputs, mode, niters=5): |
| peak_mem = 0 |
| start_stats = get_dynamo_stats() |
| try: |
| if current_device == "cuda": |
| torch.cuda.reset_peak_memory_stats() |
| torch.cuda.empty_cache() |
| t0 = time.perf_counter() |
| for _ in range(niters): |
| fn(model, example_inputs) |
| t1 = time.perf_counter() |
| latency = t1 - t0 |
| if current_device == "cuda": |
| peak_mem = get_peak_memory() |
| elif current_device == "cpu": |
| total = psutil.virtual_memory().total |
                    # memory_percent() returns a percentage, so scale by 1/100
                    # when converting bytes to GB
                    percentage = psutil.Process(os.getpid()).memory_percent()
                    peak_mem = percentage / 100.0 * total / 10**9
| except Exception: |
| log.exception("Backend %s failed in warmup()", mode) |
| return sys.exit(-1) |
| dynamo_stats = get_dynamo_stats() |
| dynamo_stats.subtract(start_stats) |
| return latency, peak_mem, dynamo_stats |
| |
        # Cast the model to float16/bfloat16/float32 as necessary
| model, example_inputs = self.maybe_cast(model, example_inputs) |
| self.init_optimizer(name, current_device, model.parameters()) |
| with self.pick_grad(name, self.args.training): |
| ok, total = Stats.reset_counters() |
| experiment_kwargs = {} |
| if tag is not None: |
| experiment_kwargs["tag"] = tag |
| results = [] |
| |
| eager_latency, eager_peak_mem, _ = warmup( |
| self.model_iter_fn, model, example_inputs, "eager" |
| ) |
| optimized_model_iter_fn = optimize_ctx(self.model_iter_fn) |
| dynamo_latency, dynamo_peak_mem, dynamo_stats = warmup( |
| optimized_model_iter_fn, model, example_inputs, "dynamo" |
| ) |
| |
| compilation_time = dynamo_latency - eager_latency |
| compression_ratio = ( |
| eager_peak_mem / dynamo_peak_mem if dynamo_peak_mem else 0.0 |
| ) |
| if self.args.print_memory: |
| print( |
| f"memory: eager: {eager_peak_mem:.2f} GB, " |
| f"dynamo: {dynamo_peak_mem:.2f} GB, " |
| f"ratio: {compression_ratio:.2f}" |
| ) |
| |
| if experiment.func is speedup_experiment: |
| experiment_kwargs["compilation_latency"] = compilation_time |
| experiment_kwargs["compression_ratio"] = compression_ratio |
| experiment_kwargs["eager_peak_mem"] = eager_peak_mem |
| experiment_kwargs["dynamo_peak_mem"] = dynamo_peak_mem |
| experiment_kwargs["dynamo_stats"] = dynamo_stats |
| |
| if experiment.func is coverage_experiment: |
| ok, total = Stats.reset_counters() |
| results = [] |
                # run with torch._dynamo a few times to populate the cache
| for _ in range(3): |
| optimized_model_iter_fn(model, example_inputs) |
| _, frames_second_pass = Stats.reset_counters() # should be 0 |
| if frames_second_pass > 0: |
| optimized_model_iter_fn(model, example_inputs) |
| _, frames_third_pass = Stats.reset_counters() # should be 0 |
| else: |
| frames_third_pass = 0 |
| |
| results.append( |
| f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s" |
| ) |
| |
            if not hasattr(model, "name"):
| model.name = name |
| results.append(experiment(model, example_inputs, **experiment_kwargs)) |
| return " ".join(map(str, results)) |
| |
| def run_one_model( |
| self, |
| name, |
| model, |
| example_inputs, |
| optimize_ctx, |
| experiment, |
| explain=False, |
| tag=None, |
| ): |
| mode = "train" if self.args.training else "eval" |
| msg = f"{current_device:4} {mode:5} {current_name:34} " |
| if tag: |
| msg += f" {tag:26}" |
| print(msg, flush=True) |
| |
| start_stats = get_dynamo_stats() |
| |
| if self.args.accuracy: |
| status = self.check_accuracy( |
| name, model, example_inputs, optimize_ctx, experiment, tag |
| ) |
| print(status) |
| elif self.args.tolerance: |
| status = self.check_tolerance(name, model, example_inputs, optimize_ctx) |
| print(status) |
| elif self.args.performance: |
| status = self.run_performance_test( |
| name, model, example_inputs, optimize_ctx, experiment, tag |
| ) |
| print(status) |
| if self.args.timing: |
| from torch._dynamo.utils import op_count, print_time_report |
| from torch.utils._stats import simple_call_counter |
| |
| print_time_report() |
| stats = "STATS: " |
| stats = stats + " | ".join( |
| itertools.chain( |
| [f"call_* op count: {op_count}"], |
| (f"{key}:{value}" for key, value in simple_call_counter.items()), |
| ) |
| ) |
| print(stats) |
| stats = get_dynamo_stats() |
| stats.subtract(start_stats) |
| |
| if explain: |
| print( |
| f"Dynamo produced {stats['unique_graphs']} graphs " |
| f"covering {stats['calls_captured']} ops with " |
| f"{stats['graph_breaks']} graph breaks ({stats['unique_graph_breaks']} unique)" |
| ) |
| |
| if explain or self.args.log_graph_breaks or self.args.print_graph_breaks: |
| filename = f"{output_filename.rstrip('.csv')}_graph_breaks.csv" |
| |
| def add_double_quotes(x): |
                # Quote the field because the reason could contain a comma (the CSV delimiter)
| return f'"{x}"' |
| |
| for graph_break in graph_break_reasons: |
| reason = add_double_quotes(graph_break.reason) |
| user_stack = add_double_quotes( |
| ", ".join([str(x) for x in graph_break.user_stack]) |
| ) |
| output_csv( |
| filename, |
| ["model", "reason", "user_stack"], |
| [current_name, reason, user_stack], |
| ) |
| |
| if self.args.stats: |
| Stats.print_summary() |
| |
| |
| def help(fn): |
| return fn.__doc__ |
| |
| |
| diff_branch_default = "DIFF-BRANCH-DEFAULT" |
| |
| |
| def should_diff_branch(args): |
| return args.diff_branch != diff_branch_default |
| |
| |
| def parse_args(args=None): |
| parser = argparse.ArgumentParser() |
| parser.add_argument( |
| "--filter", "-k", action="append", help="filter benchmarks with regexp" |
| ) |
| parser.add_argument( |
| "--exclude", "-x", action="append", help="filter benchmarks with regexp" |
| ) |
| parser.add_argument( |
| "--exclude-exact", action="append", help="filter benchmarks with exact match" |
| ) |
| parser.add_argument( |
| "--total-partitions", |
| type=int, |
| default=1, |
| choices=range(1, 10), |
| help="Total number of partitions we want to divide the benchmark suite into", |
| ) |
| parser.add_argument( |
| "--partition-id", |
| type=int, |
| default=0, |
| help="ID of the benchmark suite partition to be run. Used to divide CI tasks", |
| ) |
| parser.add_argument( |
| "--devices", "--device", "-d", action="append", help="cpu or cuda" |
| ) |
| parser.add_argument("--device-index", help="CUDA device index") |
| parser.add_argument( |
| "--repeat", "-n", type=int, default=30, help="number of timing runs" |
| ) |
| iterations_per_run_help = """ |
    Run this many iterations for each time measurement. This is mainly used for
    XLA training. We want to run multiple iterations per measurement so the
    tracing and computation for different iterations can overlap with each
    other. This makes sure we have an accurate XLA baseline.
| """ |
| parser.add_argument( |
| "--iterations-per-run", type=int, default=1, help=iterations_per_run_help |
| ) |
| parser.add_argument( |
| "--randomize-input", |
| action="store_true", |
| help="Whether to randomize the input values. Dimensions will be kept the same.", |
| ) |
| parser.add_argument( |
| "--threads", |
| "-t", |
| type=int, |
| help="number of threads to use for eager and inductor", |
| ) |
| parser.add_argument( |
| "--nopython", action="store_true", help="Turn graph breaks into errors" |
| ) |
| parser.add_argument( |
| "--no-skip", |
| action="store_true", |
| help="run models that are in the global SKIP list", |
| ) |
| parser.add_argument( |
| "--prims-nvfuser", action="store_true", help="user prims + nvfuser backend" |
| ) |
| parser.add_argument( |
| "--dump-raw-metrics", |
| action="store_true", |
| help="dump raw timing metrics from speedup experiment", |
| ) |
| parser.add_argument( |
| "--log-operator-inputs", |
| action="store_true", |
| default=False, |
| ) |
| parser.add_argument( |
| "--channels-last", |
| action="store_true", |
| default=False, |
| help="use channels last format", |
| ) |
| parser.add_argument( |
| "--batch-size", "--batch_size", type=int, help="batch size for benchmarking" |
| ) |
| parser.add_argument( |
| "--iterations", type=int, default=2, help="how many iterations to run" |
| ) |
| parser.add_argument( |
| "--batch-size-file", type=str, help="String to load batch size from" |
| ) |
| parser.add_argument("--cosine", action="store_true", help="use cosine similarity") |
| parser.add_argument( |
| "--cpp-wrapper", action="store_true", help="turn on cpp/cuda wrapper codegen" |
| ) |
| parser.add_argument( |
| "--ci", action="store_true", help="Flag to tell that its a CI run" |
| ) |
| parser.add_argument( |
| "--dynamic-ci-skips-only", |
| action="store_true", |
| help=( |
| "Run only the models that would have been skipped in CI " |
| "if dynamic-shapes, compared to running without dynamic-shapes. " |
| "This is useful for checking if more models are now " |
| "successfully passing with dynamic shapes. " |
| "Implies --dynamic-shapes and --ci" |
| ), |
| ) |
| parser.add_argument( |
| "--dashboard", action="store_true", help="Flag to tell that its a Dashboard run" |
| ) |
| parser.add_argument( |
| "--skip-fp64-check", action="store_true", help="skip accuracy check using fp64" |
| ) |
| parser.add_argument( |
| "--fast", "-f", action="store_true", help="skip slow benchmarks" |
| ) |
| parser.add_argument( |
| "--only", |
| help="""Run just one model from torchbench. Or |
| specify the path and class name of the model in format like: |
| --only=path:<MODEL_FILE_PATH>,class:<CLASS_NAME> |
| |
        Because dynamo changes the current working directory,
| the path should be an absolute path. |
| |
| The class should have a method get_example_inputs to return the inputs |
| for the model. An example looks like |
| ``` |
| class LinearModel(nn.Module): |
| def __init__(self): |
| super().__init__() |
| self.linear = nn.Linear(10, 10) |
| |
| def forward(self, x): |
| return self.linear(x) |
| |
| def get_example_inputs(self): |
| return (torch.randn(2, 10),) |
| ``` |
| """, |
| ) |
| parser.add_argument( |
| "--ddp", |
| action="store_true", |
| help="Wraps model in DDP before running it, and uses dynamo DDPOptmizer (graph breaks) by default.", |
| ) |
| parser.add_argument( |
| "--fsdp", |
| action="store_true", |
| help="""Wraps model in FSDP before running it. Disables cudagraphs by default. |
| Doesn't recursively wrap, mainly useful for checking dynamo UnspecNNModule compatibility |
| """, |
| ) |
| parser.add_argument( |
| "--no-optimize-ddp", |
| action="store_true", |
| help="Disables dynamo DDPOptimizer (graph breaks). (Applies only when using --ddp benchmark mode).", |
| ) |
| parser.add_argument( |
| "--distributed-master-port", |
| default="6789", |
| help="Port to bind for for torch.distributed. Use the default unless it's conflicting with another user", |
| ) |
| parser.add_argument( |
| "--dynamic-shapes", |
| action="store_true", |
| help="Runs a dynamic shapes version of the benchmark, if available.", |
| ) |
| parser.add_argument( |
| "--dynamic-batch-only", |
| action="store_true", |
| help="Only assume batch dimension is dynamic. Implies --dynamic-shapes", |
| ) |
| parser.add_argument( |
| "--specialize-int", action="store_true", help="Run with specialize_int=True." |
| ) |
| parser.add_argument( |
| "--use-eval-mode", |
| action="store_true", |
| help="sets model.eval() to reduce randomness", |
| ) |
| parser.add_argument( |
| "--skip-accuracy-check", |
| action="store_true", |
| help="keeps running even when accuracy fails", |
| ) |
| parser.add_argument( |
| "--generate-aot-autograd-stats", |
| action="store_true", |
| help="Generates AOT Autograd stats like how mnay graphs are sent to AOT", |
| ) |
| parser.add_argument( |
| "--inductor-settings", |
| action="store_true", |
| help="Use same settings as --inductor for baseline comparisons", |
| ) |
| parser.add_argument( |
| "--suppress-errors", |
| action="store_true", |
| help="Suppress errors instead of raising them", |
| ) |
| parser.add_argument( |
| "--output", |
| help="Overrides the output filename", |
| ) |
| parser.add_argument( |
| "--output-directory", |
| help="Overrides the directory to place output files.", |
| ) |
| parser.add_argument( |
| "--baseline", |
| help="Compare with a prior --output", |
| ) |
| parser.add_argument( |
| "--part", |
| default=None, |
| help="Specify the part of the model to run.", |
| ) |
| parser.add_argument( |
| "--export-profiler-trace", |
| action="store_true", |
| help="exports trace of kineto profiler", |
| ) |
| parser.add_argument( |
| "--profiler-trace-name", |
| "--profiler_trace_name", |
| help="Overwrites exported trace name", |
| ) |
| parser.add_argument( |
| "--diff-branch", |
| default=diff_branch_default, |
| help="delta current branch against given branch.", |
| ) |
| parser.add_argument( |
| "--tag", default=None, help="Specify a tag to be included in csv files." |
| ) |
| parser.add_argument( |
| "--explain", |
| action="store_true", |
| help="print some graph/op statistics during the run, similar to .explain()", |
| ) |
| parser.add_argument( |
| "--stats", |
| action="store_true", |
| help="print graph counter stats", |
| ) |
| parser.add_argument( |
| "--print-memory", |
| action="store_true", |
| help="print extra memory statistics", |
| ) |
| parser.add_argument( |
| "--print-dataframe-summary", |
| action="store_true", |
| help="print dataframe result used for calculating accuracy", |
| ) |
| parser.add_argument( |
| "--cold-start-latency", |
| "--cold_start_latency", |
| action="store_true", |
| help="Use a fresh triton cachedir when running each model, to force cold-start compile.", |
| ) |
| parser.add_argument( |
| "--disable-cudagraphs", |
| action="store_true", |
| help="Disables cudagraphs for Inductor", |
| ) |
| parser.add_argument( |
| "--disable-split-reductions", |
| action="store_true", |
| help="Disables split reductions for Inductor", |
| ) |
| parser.add_argument( |
| "--disable-persistent-reductions", |
| action="store_true", |
| help="Disables split reductions for Inductor", |
| ) |
| parser.add_argument( |
| "--disable-divisible-by-16", |
| action="store_true", |
| help="Disables divisible by 16 hint to Triton for Inductor", |
| ) |
| parser.add_argument( |
| "--inductor-compile-mode", |
| default=None, |
| help="torch.compile mode argument for inductor runs.", |
| ) |
| parser.add_argument( |
| "--print-graph-breaks", |
| action="store_true", |
| help="Show a warning whenever graph break", |
| ) |
| parser.add_argument( |
| "--log-graph-breaks", |
| action="store_true", |
| help="log graph breaks in a file", |
| ) |
| parser.add_argument( |
| "--trace-on-xla", |
| action="store_true", |
| help="Whether to trace the model on XLA or on eager device", |
| ) |
| parser.add_argument( |
| "--xla-tolerance", |
| type=float, |
| default=1e-2, |
| help="XLA needs a loose tolerance to pass the correctness check", |
| ) |
| parser.add_argument( |
| "--collect-outputs", |
| action="store_true", |
| help="""Whether to collect outputs for training. Set this to true if we |
        want to verify the numerical correctness of gradients. But that may
        make the time measurement inaccurate""",
| ) |
| parser.add_argument( |
| "--enable-activation-checkpointing", |
| action="store_true", |
| help="Enables activation checkpointing for HF models", |
| ) |
| parser.add_argument("--timing", action="store_true", help="Emits phase timing") |
| |
| parser.add_argument( |
| "--progress", |
| action="store_true", |
| help="Print n/k models message between each model run.", |
| ) |
| |
| parser.add_argument( |
| "--timeout", |
| type=int, |
| default=2000, |
| help="timeout (second) for benchmarking.", |
| ) |
| |
| parser.add_argument( |
| "--per_process_memory_fraction", |
| type=float, |
| default=1, |
| help="Set per-process GPU memory fraction (limit) for reducing usable size and reproducing OOMs", |
| ) |
| |
| parser.add_argument( |
| "--no-translation-validation", |
| action="store_true", |
| help="Disable translation validation for accuracy builds.", |
| ) |
| |
| group_fuser = parser.add_mutually_exclusive_group() |
| # --nvfuser is now the default, keep the option to not break scripts |
| group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS) |
| group_fuser.add_argument("--nnc", action="store_true", help="enable NNC for GPUs") |
| |
| group_prec = parser.add_mutually_exclusive_group() |
| group_prec.add_argument("--float16", action="store_true", help="cast model to fp16") |
| group_prec.add_argument( |
| "--bfloat16", action="store_true", help="cast model to bf16" |
| ) |
| group_prec.add_argument("--float32", action="store_true", help="cast model to fp32") |
| group_prec.add_argument( |
| "--amp", action="store_true", help="use automatic mixed precision" |
| ) |
| |
| group_printout = parser.add_mutually_exclusive_group() |
| group_printout.add_argument( |
| "--verbose", "-v", action="store_true", help="enable verbose debug printouts" |
| ) |
| group_printout.add_argument( |
| "--quiet", "-q", action="store_true", help="suppress debug printouts" |
| ) |
| |
| group = parser.add_mutually_exclusive_group() |
| group.add_argument( |
| "--coverage", action="store_true", help="(default) " + help(coverage_experiment) |
| ) |
| group.add_argument( |
| "--overhead", action="store_true", help=help(overhead_experiment) |
| ) |
| group.add_argument( |
| "--speedup-dynamo-ts", |
| action="store_true", |
| help="TorchDynamo frontend with torchscript backend", |
| ) |
| group.add_argument( |
| "--speedup-fx2trt", action="store_true", help=help(speedup_experiment_fx2trt) |
| ) |
| group.add_argument( |
| "--speedup-fx2trt-fp16", |
| action="store_true", |
| help=help(speedup_experiment_fx2trt), |
| ) |
| group.add_argument( |
| "--print-fx", |
| action="store_true", |
| help="Print fx traces captured from model", |
| ) |
| group.add_argument( |
| "--print-aten-ops", |
| action="store_true", |
| help="Print traces of aten ops captured by AOT autograd", |
| ) |
| group.add_argument( |
| "--inductor", |
| action="store_true", |
| help="Measure speedup with TorchInductor", |
| ) |
| group.add_argument( |
| "--export", |
| action="store_true", |
| help="Measure pass rate with export", |
| ) |
| group.add_argument( |
| "--xla", action="store_true", help="Compare TorchXLA to eager PyTorch" |
| ) |
| group.add_argument( |
| "--torchscript-onnx", |
| "--torchscript_onnx", |
| action="store_true", |
| help="Measure speedup with TorchScript ONNX, i.e. `torch.onnx.export`", |
| ) |
| group.add_argument( |
| "--dynamo-onnx", |
| "--dynamo_onnx", |
| action="store_true", |
| help="Measure speedup with Dynamo ONNX, i.e. `torch.onnx.dynamo_export`", |
| ) |
| group.add_argument( |
| "--backend", |
| choices=torch._dynamo.list_backends(exclude_tags=None), |
| help="measure speedup with a given backend", |
| ) |
| group.add_argument("--nothing", action="store_true", help=help(null_experiment)) |
| group.add_argument( |
| "--log-conv-args", |
| action="store_true", |
| help="Dump convolution input/weight/bias's shape/stride/dtype and other options to json", |
| ) |
| group.add_argument( |
| "--recompile-profiler", |
| "--recompile_profiler", |
| action="store_true", |
| help="Run the dynamo recompilation profiler on each model.", |
| ) |
| group.add_argument( |
| "--find-batch-sizes", |
| action="store_true", |
| help="finds the largest batch size that could fit on GPUs", |
| ) |
| |
| mode_group = parser.add_mutually_exclusive_group(required=True) |
| mode_group.add_argument( |
| "--accuracy", |
| action="store_true", |
| help="Checks accuracy with small batch size and eval mode", |
| ) |
| mode_group.add_argument( |
| "--performance", action="store_true", help="Measures performance speedup" |
| ) |
| mode_group.add_argument( |
| "--tolerance", |
| action="store_true", |
| help="extracts the tolerance for each model with small batch size and eval mode", |
| ) |
| run_mode_group = parser.add_mutually_exclusive_group(required=True) |
| run_mode_group.add_argument( |
| "--training", |
| action="store_true", |
| help="Performs training", |
| ) |
| run_mode_group.add_argument( |
| "--inference", action="store_true", help="Performs inference" |
| ) |
| return parser.parse_args(args) |
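
# Example invocation (illustrative; this module is meant to be driven by a
# suite entry point such as torchbench.py rather than run directly):
#   python torchbench.py --performance --inference --inductor --devices=cuda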
| |
| |
| def main(runner, original_dir=None): |
| if original_dir: |
| os.chdir(original_dir) |
| args = parse_args() |
| if args.baseline: |
| args.baseline = os.path.abspath(args.baseline) |
| |
| if should_diff_branch(args): |
| import git |
| |
| # We do this here so we error out earlier if there's an issue |
| repo = git.Repo() |
| if repo.is_dirty(): |
| raise RuntimeError( |
| "--diff-branch called on dirty branch. Commit, stash, or reset." |
| ) |
| main_branch = repo.active_branch.name |
| if main_branch == args.diff_branch: |
| raise RuntimeError( |
| f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?" |
| ) |
| |
| with maybe_init_distributed( |
| (args.ddp or args.fsdp) and args.only, port=args.distributed_master_port |
| ): |
| return maybe_fresh_cache( |
| run, (args.cold_start_latency and args.only) or args.ci |
| )(runner, args, original_dir) |
| |
| |
| def run(runner, args, original_dir=None): |
| # Pass the parsed args object to benchmark runner object |
| runner.args = args |
| |
| args.filter = args.filter or [r"."] |
| args.exclude = args.exclude or [r"^$"] |
| args.exclude_exact = args.exclude_exact or [] |
| |
| if args.inductor: |
| assert args.backend is None |
| args.backend = "inductor" |
| if args.dynamic_ci_skips_only: |
| args.dynamic_shapes = True |
| args.ci = True |
| if args.dynamic_batch_only: |
| args.dynamic_shapes = True |
| torch._dynamo.config.assume_static_by_default = True |
| torch._dynamo.config.automatic_dynamic_shapes = True |
| if args.dynamic_shapes: |
| torch._dynamo.config.automatic_dynamic_shapes = True |
| if not args.dynamic_batch_only: |
| torch._dynamo.config.assume_static_by_default = False |
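    # Net effect of the two branches above (a comment-only summary):
    #   --dynamic-batch-only: start static and let automatic_dynamic_shapes
    #     promote dims that actually vary (e.g. batch) on recompile.
    #   --dynamic-shapes alone: assume_static_by_default=False, so input dims
    #     are treated as dynamic from the start.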
| if args.specialize_int: |
| torch._dynamo.config.specialize_int = True |
| if args.ci: |
| if args.accuracy: |
| # Run fewer iterations when checking accuracy |
| args.repeat = 2 |
| |
| # Set translation validation on by default on CI accuracy runs. |
| torch._dynamo.config.translation_validation = True |
| |
| if args.dynamic_ci_skips_only: |
            # Test only the incremental set of jobs that are skipped solely
            # because dynamic shapes are turned on
| assert args.dynamic_shapes |
| ci = functools.partial(CI, args.backend, training=args.training) |
| args.filter = list( |
| set(CI_SKIP[ci(dynamic=True)]) - set(CI_SKIP[ci(dynamic=False)]) |
| ) |
| else: |
| ci = functools.partial( |
| CI, args.backend, training=args.training, dynamic=args.dynamic_shapes |
| ) |
| for device in args.devices: |
| args.exclude_exact.extend(CI_SKIP[ci(device=device)]) |
| if args.ddp: |
| # TODO: we could also hook DDP bench up to --speedup bench, _not_ for mgpu e2e perf, |
| # but just to measure impact on singlenode of performing graph-breaks. |
| # Left it as a follow up to keep this PR isolated. |
| assert ( |
| args.accuracy |
| ), "DDP benchmark is currently only hooked up to --accuracy bench" |
| assert args.training, "DDP benchmark requires --training mode" |
| if args.no_optimize_ddp: |
| torch._dynamo.config.optimize_ddp = False |
| else: |
| # TODO(whc) after enabling DDPOptimizer by default this could be removed or assert |
| torch._dynamo.config.optimize_ddp = True |
| if args.only == "dlrm": |
| log.error( |
| "DLRM+DDP is unsupported as it requires sharding the embedding layer separately from DDP" |
| ) |
| return sys.exit(-1) |
| if args.accuracy: |
| # Use small batch size. We use >1 batch size to ensure we test |
| # batch_norm type of operators that work on batch dims. |
| # TODO - Go through the failures for batch size = 2 |
| if args.batch_size is None: |
| if runner.suite_name == "huggingface": |
| args.batch_size = 1 |
| elif runner.suite_name == "torchbench": |
| args.batch_size = 4 |
| else: |
                # Larger batch size for TIMM models to have stable batch_norm
| assert runner.suite_name == "timm_models" |
| args.batch_size = 8 |
| |
| # Remove sources of randomness |
| if runner.suite_name not in ("timm_models", "huggingface"): |
| # TODO - Using train mode for timm_models and HF models. Move to train mode for Torchbench as well. |
| args.use_eval_mode = True |
| inductor_config.fallback_random = True |
| if args.only is not None and args.only not in { |
| "alexnet", |
| "Background_Matting", |
| "pytorch_CycleGAN_and_pix2pix", |
| "pytorch_unet", |
| "Super_SloMo", |
| "vgg16", |
| # https://github.com/pytorch/pytorch/issues/96724 |
| "Wav2Vec2ForCTC", |
| "Wav2Vec2ForPreTraining", |
| }: |
| # some of the models do not support use_deterministic_algorithms |
| torch.use_deterministic_algorithms(True) |
| if args.only in {"hf_T5_generate"}: |
| torch._dynamo.config.automatic_dynamic_shapes = True |
| os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" |
| torch.backends.cudnn.deterministic = True |
| torch.backends.cudnn.allow_tf32 = False |
| torch.backends.cudnn.benchmark = False |
| torch.backends.cuda.matmul.allow_tf32 = False |
| |
        # Remove randomness when torch.manual_seed is called
| patch_torch_manual_seed() |
| |
        # Some models, e.g. yolov3, assert the batch size based on n_gpus
| if "CUDA_VISIBLE_DEVICES" not in os.environ: |
| args.device_index = "0" |
| |
| # Stricter check to disable fallbacks |
| args.suppress_errors = False |
| |
| if args.device_index is not None: |
| os.environ["CUDA_VISIBLE_DEVICES"] = args.device_index |
| |
| elif args.performance: |
| # Ensure that we test on real scenarios |
| args.use_eval_mode = False |
| |
    if args.partition_id >= args.total_partitions or args.partition_id < 0:
| print("Invalid partition id") |
| return sys.exit(-1) |
| |
| if not args.devices: |
| if torch.cuda.is_available(): |
| args.devices = ["cuda"] |
| else: |
| log.warning("torch.cuda.is_available() == False, using CPU") |
| args.devices = ["cpu"] |
| |
| if args.devices != ["cpu"] and torch.cuda.is_available(): |
| global synchronize |
| synchronize = torch.cuda.synchronize |
| |
| if ( |
| args.devices == ["cuda"] |
| and torch.cuda.get_device_properties(0).total_memory < 25 * 2**30 |
| ): |
        # OOM errors on an RTX 3090 with 24GB of device memory
| runner.skip_models.update( |
| { |
| # torchbench |
| "hf_Longformer", |
| "timm_nfnet", |
| "timm_efficientdet", |
| } |
| ) |
| if args.training: |
| runner.skip_models.add("hf_T5") |
| |
| if args.nnc: |
| torch._C._jit_override_can_fuse_on_cpu(True) |
| torch._C._jit_override_can_fuse_on_gpu(True) |
| torch._C._jit_set_texpr_fuser_enabled(True) |
| torch._C._jit_set_nvfuser_enabled(False) |
| |
| if args.threads: |
| torch.set_num_threads(args.threads) |
| |
| if args.verbose: |
| torch._logging.set_logs(dynamo=logging.DEBUG) |
| |
| if args.print_graph_breaks: |
| torch._dynamo.config.print_graph_breaks = True |
| |
| if args.quiet: |
| torch._logging.set_logs(dynamo=logging.ERROR) |
| |
| torch._dynamo.config.suppress_errors = args.suppress_errors |
| |
| if args.training: |
| runner.model_iter_fn = runner.forward_and_backward_pass |
| runner.skip_models.update(runner.skip_not_suitable_for_training_models) |
| else: |
| runner.model_iter_fn = runner.forward_pass |
| |
| if args.fast: |
| runner.skip_models.update(runner.slow_models) |
| |
| if args.devices == ["cpu"]: |
| runner.skip_models.update(runner.very_slow_models) |
| runner.skip_models.update(runner.skip_models_for_cpu) |
| elif args.devices == ["cuda"]: |
| runner.skip_models.update(runner.skip_models_for_cuda) |
| |
| if args.no_skip: |
| runner.skip_models.clear() |
| |
| experiment = null_experiment |
| global current_name, current_device, current_batch_size, output_filename, optimize_ctx, current_onnx_compiler |
| optimize_ctx = contextlib.nullcontext() |
| |
| if args.overhead: |
| optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython) |
| experiment = speedup_experiment |
| output_filename = "overheads.csv" |
| elif args.inductor: |
| inductor_config.debug = args.verbose |
| if ( |
| args.ci |
| and args.accuracy |
| and args.training |
| and args.only in {"dla102", "gernet_l"} |
| ): |
| # Log generated code for flaky tests, to check if there is any codegen difference |
| inductor_config.debug = True |
| |
| if args.threads: |
| inductor_config.cpp.threads = args.threads |
| |
| optimize_ctx = functools.partial( |
| torch.compile, |
| backend="inductor", |
| fullgraph=args.nopython, |
| mode=args.inductor_compile_mode, |
| ) |
| experiment = speedup_experiment |
| output_filename = "inductor.csv" |
| elif args.export: |
| optimize_ctx = torch._export.export |
| experiment = speedup_experiment |
| output_filename = "export.csv" |
| elif args.xla: |
| (dev,) = args.devices |
| os.environ["PJRT_DEVICE"] = {"cuda": "GPU", "cpu": "CPU"}[dev] |
| torch._dynamo.mark_dynamic = MagicMock() |
| experiment = xla |
| output_filename = "xla.csv" |
| elif args.torchscript_onnx: |
| optimize_ctx = functools.partial( |
| optimize_onnx_ctx, args.output_directory or ".", OnnxModelFromTorchScript |
| ) |
| experiment = functools.partial( |
| speedup_experiment_onnx, OnnxModelFromTorchScript |
| ) |
| output_filename = "torchscript_onnx.csv" |
| current_onnx_compiler = "torchscript" |
| elif args.dynamo_onnx: |
| optimize_ctx = functools.partial( |
| optimize_onnx_ctx, args.output_directory or ".", OnnxModelFromDynamo |
| ) |
| experiment = functools.partial(speedup_experiment_onnx, OnnxModelFromDynamo) |
| output_filename = "dynamo_onnx.csv" |
| current_onnx_compiler = "dynamo" |
| elif args.speedup_dynamo_ts: |
| optimize_ctx = torch._dynamo.optimize("ts", nopython=args.nopython) |
| experiment = speedup_experiment |
| output_filename = "speedup_dynamo_ts.csv" |
| elif args.prims_nvfuser: |
| optimize_ctx = torch._dynamo.optimize("prims_nvfuser", nopython=args.nopython) |
| experiment = speedup_experiment |
| backend_str = "prims_nvfuser" |
| output_filename = f"accuracy_aot_{backend_str}.csv" |
| elif args.print_fx: |
| optimize_ctx = torch._dynamo.optimize( |
| print_fx, |
| nopython=args.nopython, |
| ) |
| elif args.print_aten_ops: |
| optimize_ctx = torch._dynamo.optimize( |
| print_aten_ops, |
| nopython=args.nopython, |
| ) |
| elif args.nothing: |
| optimize_ctx = nothing |
| experiment = speedup_experiment |
| output_filename = "nothing.csv" |
| elif args.backend: |
| optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython) |
| experiment = speedup_experiment |
| if args.accuracy: |
| output_filename = f"accuracy_{args.backend}.csv" |
| elif args.tolerance: |
| output_filename = f"tolerance_{args.backend}.csv" |
| else: |
| output_filename = f"speedup_{args.backend}.csv" |
| elif args.recompile_profiler: |
| output_filename = "recompile_profiler_log.csv" |
| experiment = recompile_profiler_experiment |
| else: |
| optimize_ctx = torch._dynamo.optimize( |
| fx_insert_profiling, nopython=args.nopython |
| ) |
| experiment = coverage_experiment |
| output_filename = "coverage.csv" |
| |
| if args.inductor or args.backend == "inductor": |
| inductor_config.triton.cudagraphs = not args.disable_cudagraphs |
| inductor_config.triton.persistent_reductions = ( |
| not args.disable_persistent_reductions |
| ) |
| inductor_config.split_reductions = not args.disable_split_reductions |
| inductor_config.triton.divisible_by_16 = not args.disable_divisible_by_16 |
| inductor_config.cpp_wrapper = args.cpp_wrapper |
| |
| runner.setup_amp() |
| |
| if args.output: |
| output_filename = args.output |
| |
| if output_filename: |
| if args.output_directory: |
| output_filename = os.path.join(args.output_directory, output_filename) |
| else: |
| output_filename = os.path.join( |
| torch._dynamo.config.base_dir, output_filename |
| ) |
| |
| if args.find_batch_sizes and args.only: |
| for device in args.devices: |
| batch_size = runner.batch_size_finder(device, args.only) |
| print(args.only, batch_size) |
| output_csv(output_filename, [], [args.only, batch_size]) |
| return |
| |
| if args.export_profiler_trace: |
| if args.profiler_trace_name is None: |
| if args.backend: |
| args.profiler_trace_name = args.backend |
| elif args.inductor: |
| args.profiler_trace_name = "inductor" |
| else: |
| args.profiler_trace_name = "profile" |
| |
| if args.no_translation_validation: |
| # Overwrite 'translation_validation' config, if specified. |
| torch._dynamo.config.translation_validation = False |
| |
| experiment = functools.partial(experiment, args, runner.model_iter_fn) |
| |
| if args.only and should_diff_branch(args): |
| import git |
| |
| repo = git.Repo() |
| main_branch = repo.active_branch.name |
| try: |
| # Adding diff-branch again to the args will override previous value |
| call_args = ( |
| [sys.executable] + sys.argv + [f"--diff-branch={diff_branch_default}"] |
| ) |
| # Run for main branch |
| subprocess.check_call(call_args + [f"--tag={main_branch}"]) |
| # Run for comparison branch |
| repo.git.checkout(args.diff_branch) |
| subprocess.check_call(call_args + [f"--tag={args.diff_branch}"]) |
| finally: |
| # Go back to main branch |
| repo.git.checkout(main_branch) |
| elif args.only: |
| model_name = args.only |
| for device in args.devices: |
| batch_size = args.batch_size |
| if args.batch_size_file: |
| batch_size = read_batch_size_from_file( |
| args, args.batch_size_file, model_name |
| ) |
| if model_specified_by_path(args.only): |
| model, example_inputs = load_model_from_path(args.only) |
| name = model.__class__.__name__ |
| model = model.to(device=device) |
| example_inputs = tree_map_only( |
| torch.Tensor, lambda x: x.to(device=device), example_inputs |
| ) |
| else: |
| try: |
| with tqdm(desc="loading model"): |
| if args.part: |
| ( |
| device, |
| name, |
| model, |
| example_inputs, |
| batch_size, |
| ) = runner.load_model( |
| device, |
| model_name, |
| batch_size=batch_size, |
| part=args.part, |
| ) |
| else: |
| ( |
| device, |
| name, |
| model, |
| example_inputs, |
| batch_size, |
| ) = runner.load_model( |
| device, model_name, batch_size=batch_size |
| ) |
| except NotImplementedError as e: |
| print(e) |
| import traceback |
| |
| print(traceback.format_exc()) |
                log.warning("%s failed to load", args.only)
| continue # bad benchmark implementation |
| |
| if args.trace_on_xla: |
| xla_dev = xm.xla_device() |
| model = model.to(device=xla_dev) |
| example_inputs = tree_map_only( |
| torch.Tensor, lambda x: x.to(device=xla_dev), example_inputs |
| ) |
| |
| current_name = name |
| current_device = device |
| current_batch_size = batch_size |
| set_model_name(name) |
| |
            # Look for anything that looks like a batch size, and mark it dynamic.
            # A better integration would hook directly into the benchmark suite,
            # but that is not convenient to do here.
            # NB: This must be done late enough that we don't do any more
            # conversions on the inputs.
            # NB: Assumes only the first batch-like dimension is the batch.
| marked = False |
| |
| def detect_and_mark_batch(t): |
| nonlocal marked |
| for i, s in enumerate(t.size()): |
| if s == batch_size: |
| torch._dynamo.mark_dynamic(t, i) |
| marked = True |
| break |
| |
| if args.dynamic_batch_only and batch_size > 1: |
| tree_map_only(torch.Tensor, detect_and_mark_batch, example_inputs) |
| assert marked, f"nothing in example_inputs had a dim with {batch_size}" |
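            # Comment-only sketch: for an input of shape (batch_size, C, H, W),
            # torch._dynamo.mark_dynamic(t, 0) treats dim 0 as dynamic so that
            # changing only the batch size does not force a recompile.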
| |
| if args.log_operator_inputs: |
| log_operator_inputs( |
| model, example_inputs, runner.model_iter_fn, name, args |
| ) |
| continue |
| |
| if args.per_process_memory_fraction != 1: |
| torch.cuda.set_per_process_memory_fraction( |
| args.per_process_memory_fraction |
| ) |
| |
| model, example_inputs = runner.cast_based_on_args(model, example_inputs) |
| runner.run_one_model( |
| name, |
| model, |
| example_inputs, |
| optimize_ctx, |
| experiment, |
| explain=args.explain, |
| tag=args.tag, |
| ) |
| if args.generate_aot_autograd_stats: |
| stats_file = output_filename.split(".csv")[0] + "_stats.csv" |
| output_csv( |
| stats_file, |
| ("dev", "name", "batch_size", "total_aot_graphs", "ok_aot_graphs"), |
| [ |
| current_device, |
| current_name, |
| current_batch_size, |
| *Stats.aot_summary(), |
| ], |
| ) |
| else: |
| if output_filename and os.path.exists(output_filename): |
| os.unlink(output_filename) |
| if original_dir: |
| os.chdir(original_dir) |
| model_names = list(runner.iter_model_names(args)) |
| nmodels = len(model_names) |
| for i, name in enumerate(model_names): |
| current_name = name |
| placeholder_batch_size = 0 |
| if args.progress: |
| print(f"Running model {i+1}/{nmodels}", flush=True) |
| |
| def write_csv(status): |
| if args.accuracy: |
| headers = ["dev", "name", "batch_size", "accuracy"] |
| rows = [ |
| [device, name, placeholder_batch_size, status] |
| for device in args.devices |
| ] |
| elif args.performance: |
| headers = ["dev", "name", "batch_size", "speedup", "abs_latency"] |
| rows = [ |
| [device, name, placeholder_batch_size, 0.0, 0.0] |
| for device in args.devices |
| ] |
| else: |
| headers = [] |
| rows = [ |
| [device, name, placeholder_batch_size, 0.0] |
| for device in args.devices |
| ] |
| |
| for row in rows: |
| output_csv(output_filename, headers, row) |
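
            # Illustrative CSV row for an accuracy run that timed out
            # (hypothetical model name): cuda,resnet50,0,timeout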
| |
| try: |
| timeout = args.timeout |
| if should_diff_branch(args): |
| timeout *= 2 |
| subprocess.check_call( |
| [sys.executable] + sys.argv + [f"--only={name}"], timeout=timeout |
| ) |
| except subprocess.TimeoutExpired: |
| print("TIMEOUT", file=sys.stderr) |
| write_csv("timeout") |
| except subprocess.SubprocessError: |
| print("ERROR", file=sys.stderr) |
| write_csv("infra_error") |
| print_summary(output_filename, print_dataframe=args.print_dataframe_summary) |
| |
| |
| def log_operator_inputs(model, example_inputs, model_iter_fn, name, args): |
| mode = "training" if args.training else "eval" |
| output = os.path.join(os.path.dirname(args.output), f"{name}_{mode}.txt") |
| |
| # TODO - add option for coalescing inputs over multiple runs |
| if os.path.exists(output): |
| print(f"Skipping {name}, {output} already exists") |
| return |
| |
| print(f"Running {name}") |
| |
| operator_mode = OperatorInputsMode() |
| fake_tensor_mode = FakeTensorMode() |
| |
| with torch._subclasses.fake_tensor.FakeCopyMode(fake_tensor_mode): |
| model_fake = copy.deepcopy(model) |
| example_inputs_fake = copy.deepcopy(example_inputs) |
| try: |
| with fake_tensor_mode, operator_mode: |
| model_iter_fn(model_fake, example_inputs_fake, collect_outputs=False) |
| except Exception as e: |
| print(f"{name} failed to run with fake tensors, trying real. Exception: {e}") |
| operator_mode = OperatorInputsMode() |
| try: |
| with operator_mode: |
| model_iter_fn(model, example_inputs, collect_outputs=False) |
| except Exception as e2: |
| print(f"{name} failed to run with real. Exception: {e2}") |
| raise |
| |
| print(f"Writing output to {output}") |
| operator_mode.log_to_file(output) |
| |
| |
| if __name__ == "__main__": |
| raise RuntimeError( |
| f"You shouldn't run {sys.argv[0]} directly, instead try timm_model.py, torchbench.py or hugginface.py" |
| ) |