| #!/usr/bin/env python3 | 
 |  | 
 | """ | 
A wrapper over the benchmark infrastructure to generate commonly used commands,
parse results, and generate CSVs/graphs.

The script works on the manually written TABLE (see below). We can add more commands
in the future.
 |  | 
 | One example usage is | 
 | -> python benchmarks/runner.py --suites=torchbench --inference | 
 | This command will generate the commands for the default compilers (see DEFAULTS | 
 | below) for inference, run them and visualize the logs. | 
 |  | 
 | If you want to just print the commands, you could use the following command | 
 | -> python benchmarks/runner.py --print-run-commands --suites=torchbench --inference | 
 |  | 
 | Similarly, if you want to just visualize the already finished logs | 
 | -> python benchmarks/runner.py --visualize-logs --suites=torchbench --inference | 
 |  | 
 | If you want to test float16 | 
 | -> python benchmarks/runner.py --suites=torchbench --inference --dtypes=float16 | 
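
If you want to run training instead, optionally restricting to specific compilers
-> python benchmarks/runner.py --suites=torchbench --training --compilers=inductor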
 |  | 
 | """ | 
 |  | 
 |  | 
 | import argparse | 
 | import dataclasses | 
 | import functools | 
 | import glob | 
 | import importlib | 
 | import io | 
 | import itertools | 
 | import logging | 
 | import os | 
 | import re | 
 | import shutil | 
 | import subprocess | 
 | import sys | 
 | import tempfile | 
 | from collections import defaultdict | 
 | from datetime import datetime, timedelta, timezone | 
 | from os.path import abspath, exists | 
 | from random import randint | 
 |  | 
 | import matplotlib.pyplot as plt | 
 |  | 
 | import numpy as np | 
 | import pandas as pd | 
 | import torch | 
 |  | 
 | import torch._dynamo | 
 | from matplotlib import rcParams | 
 | from scipy.stats import gmean | 
 | from tabulate import tabulate | 
 |  | 
 | rcParams.update({"figure.autolayout": True}) | 
 | plt.rc("axes", axisbelow=True) | 
 |  | 
 | DEFAULT_OUTPUT_DIR = "benchmark_logs" | 
 |  | 
 |  | 
 | log = logging.getLogger(__name__) | 
 |  | 
 | TABLE = { | 
 |     "training": { | 
 |         "ts_nnc": "--training --speedup-ts ", | 
 |         "ts_nvfuser": "--training --nvfuser --speedup-dynamo-ts ", | 
 |         "eager": "--training --backend=eager ", | 
 |         "aot_eager": "--training --backend=aot_eager ", | 
 |         "cudagraphs": "--training --backend=cudagraphs ", | 
 |         "aot_nvfuser": "--training --nvfuser --backend=aot_ts_nvfuser ", | 
 |         "nvprims_nvfuser": "--training --backend=nvprims_nvfuser ", | 
 |         "inductor": "--training --inductor ", | 
 |         "inductor_no_cudagraphs": "--training --inductor --disable-cudagraphs ", | 
 |         "inductor_max_autotune": "--training --inductor --inductor-compile-mode max-autotune ", | 
 |         "inductor_max_autotune_no_cudagraphs": ( | 
 |             "--training --inductor --inductor-compile-mode max-autotune-no-cudagraphs --disable-cudagraphs " | 
 |         ), | 
 |     }, | 
 |     "inference": { | 
 |         "aot_eager": "--inference --backend=aot_eager ", | 
 |         "eager": "--inference --backend=eager ", | 
 |         "ts_nnc": "--inference --speedup-ts ", | 
 |         "ts_nvfuser": "--inference -n100 --speedup-ts --nvfuser ", | 
 |         "trt": "--inference -n100 --speedup-trt ", | 
 |         "ts_nvfuser_cudagraphs": "--inference --backend=cudagraphs_ts ", | 
 |         "inductor": "--inference -n50 --inductor ", | 
 |         "inductor_no_cudagraphs": "--inference -n50 --inductor --disable-cudagraphs ", | 
 |         "inductor_max_autotune": "--inference -n50 --inductor --inductor-compile-mode max-autotune ", | 
 |         "inductor_max_autotune_no_cudagraphs": ( | 
 |             "--inference -n50 --inductor --inductor-compile-mode max-autotune-no-cudagraphs --disable-cudagraphs " | 
 |         ), | 
 |         "torchscript-onnx": "--inference -n5 --torchscript-onnx", | 
 |         "dynamo-onnx": "--inference -n5 --dynamo-onnx", | 
 |     }, | 
 | } | 
 |  | 
 | INFERENCE_COMPILERS = tuple(TABLE["inference"].keys()) | 
 | TRAINING_COMPILERS = tuple(TABLE["training"].keys()) | 
 |  | 
 | DEFAULTS = { | 
 |     "training": [ | 
 |         "eager", | 
 |         "aot_eager", | 
 |         "inductor", | 
 |         "inductor_no_cudagraphs", | 
 |     ], | 
 |     "inference": [ | 
 |         "eager", | 
 |         "aot_eager", | 
 |         "inductor", | 
 |         "inductor_no_cudagraphs", | 
 |     ], | 
 |     "flag_compilers": { | 
 |         "training": ["inductor", "inductor_no_cudagraphs"], | 
 |         "inference": ["inductor", "inductor_no_cudagraphs"], | 
 |     }, | 
 |     "dtypes": [ | 
 |         "float32", | 
 |     ], | 
 |     "suites": ["torchbench", "huggingface", "timm_models"], | 
 |     "devices": [ | 
 |         "cuda", | 
 |     ], | 
 |     "quick": { | 
 |         "torchbench": '-k "resnet..$"', | 
 |         "huggingface": "-k Albert", | 
 |         "timm_models": ' -k "^resnet" -k "^inception"', | 
 |     }, | 
 | } | 
 |  | 
 |  | 
 | DASHBOARD_DEFAULTS = { | 
 |     "dashboard_image_uploader": "/fsx/users/anijain/bin/imgur.sh", | 
 |     "dashboard_archive_path": "/data/home/anijain/cluster/cron_logs", | 
 |     "dashboard_gh_cli_path": "/data/home/anijain/miniconda/bin/gh", | 
 | } | 
 |  | 
 |  | 
 | def flag_speedup(x): | 
 |     return x < 0.95 | 
 |  | 
 |  | 
 | def flag_compilation_latency(x): | 
 |     return x > 120 | 
 |  | 
 |  | 
 | def flag_compression_ratio(x): | 
 |     return x < 0.9 | 
 |  | 
 |  | 
 | def flag_accuracy(x): | 
 |     return "pass" not in x | 
 |  | 
 |  | 
 | FLAG_FNS = { | 
 |     "speedup": flag_speedup, | 
 |     "compilation_latency": flag_compilation_latency, | 
 |     "compression_ratio": flag_compression_ratio, | 
 |     "accuracy": flag_accuracy, | 
 | } | 
 |  | 
 |  | 
 | def percentage(part, whole, decimals=2): | 
 |     if whole == 0: | 
 |         return 0 | 
 |     return round(100 * float(part) / float(whole), decimals) | 
 |  | 
 |  | 
 | def parse_args(): | 
 |     parser = argparse.ArgumentParser() | 
 |     parser.add_argument("--devices", action="append", help="cpu or cuda") | 
 |     parser.add_argument("--dtypes", action="append", help="float16/float32/amp") | 
 |     parser.add_argument("--suites", action="append", help="huggingface/torchbench/timm") | 
 |     parser.add_argument( | 
 |         "--compilers", | 
 |         action="append", | 
 |         help=f"For --inference, options are {INFERENCE_COMPILERS}. For --training, options are {TRAINING_COMPILERS}", | 
 |     ) | 
 |  | 
 |     parser.add_argument( | 
 |         "--flag-compilers", | 
 |         action="append", | 
 |         help="List of compilers to flag issues. Same format as --compilers.", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--quick", action="store_true", help="Just runs one model. Helps in debugging" | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--output-dir", | 
 |         help="Choose the output directory to save the logs", | 
 |         default=DEFAULT_OUTPUT_DIR, | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--keep-output-dir", | 
 |         action="store_true", | 
 |         help="Do not cleanup the output directory before running", | 
 |     ) | 
 |  | 
 |     # Choose either generation of commands, pretty parsing or e2e runs | 
 |     group = parser.add_mutually_exclusive_group(required=False) | 
 |     group.add_argument( | 
 |         "--print-run-commands", | 
 |         "--print_run_commands", | 
 |         action="store_true", | 
 |         help="Generate commands and saves them to run.sh", | 
 |     ) | 
 |     group.add_argument( | 
 |         "--visualize-logs", | 
 |         "--visualize_logs", | 
 |         action="store_true", | 
 |         help="Pretty print the log files and draw graphs", | 
 |     ) | 
 |     group.add_argument( | 
 |         "--run", | 
 |         action="store_true", | 
 |         default=True, | 
 |         help="Generate commands, run and parses the files", | 
 |     ) | 
 |  | 
 |     parser.add_argument( | 
 |         "--log-operator-inputs", | 
 |         action="store_true", | 
 |         default=False, | 
 |         help="Log operator inputs", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--include-slowdowns", | 
 |         "--include_slowdowns", | 
 |         action="store_true", | 
 |         default=False, | 
 |         help="Include slowdowns in geomean performance speedup report. By default, slowdowns are ignored. " | 
 |         "This is because one can always use eager if compile is not speeding things up", | 
 |     ) | 
 |  | 
 |     parser.add_argument( | 
 |         "--extra-args", default="", help="Append commandline with these args" | 
 |     ) | 
 |  | 
 |     # Choose either inference or training | 
 |     group_mode = parser.add_mutually_exclusive_group(required=True) | 
 |     group_mode.add_argument( | 
 |         "--inference", action="store_true", help="Only run inference related tasks" | 
 |     ) | 
 |     group_mode.add_argument( | 
 |         "--training", action="store_true", help="Only run training related tasks" | 
 |     ) | 
 |  | 
 |     parser.add_argument( | 
 |         "--base-sha", | 
 |         help="commit id for the tested pytorch", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--total-partitions", | 
 |         type=int, | 
 |         help="Total number of partitions, to be passed to the actual benchmark script", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--partition-id", | 
 |         type=int, | 
 |         help="ID of partition, to be passed to the actual benchmark script", | 
 |     ) | 
 |  | 
 |     parser.add_argument( | 
 |         "--update-dashboard", | 
 |         action="store_true", | 
 |         default=False, | 
 |         help="Updates to dashboard", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--no-graphs", | 
 |         action="store_true", | 
 |         default=False, | 
 |         help="Do not genenerate and upload metric graphs", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--no-update-archive", | 
 |         action="store_true", | 
 |         default=False, | 
 |         help="Do not update lookup.csv or the log archive", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--no-gh-comment", | 
 |         action="store_true", | 
 |         default=False, | 
 |         help="Do not write a comment to github", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--no-detect-regressions", | 
 |         action="store_true", | 
 |         default=False, | 
 |         help="Do not compare to previous runs for regressions or metric graphs.", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--update-dashboard-test", | 
 |         action="store_true", | 
 |         default=False, | 
 |         help="does all of --no-graphs, --no-update-archive, and --no-gh-comment", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--dashboard-image-uploader", | 
 |         default=DASHBOARD_DEFAULTS["dashboard_image_uploader"], | 
 |         help="Image uploader command", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--dashboard-archive-path", | 
 |         default=DASHBOARD_DEFAULTS["dashboard_archive_path"], | 
 |         help="Archived directory path", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--archive-name", | 
 |         help="Directory name under dashboard-archive-path to copy output-dir to. " | 
 |         "If not provided, a generated name is used.", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--dashboard-gh-cli-path", | 
 |         default=DASHBOARD_DEFAULTS["dashboard_gh_cli_path"], | 
 |         help="Github CLI path", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--batch-size", | 
 |         "--batch_size", | 
 |         type=int, | 
 |         default=None, | 
 |         help="batch size for benchmarking", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--threads", | 
 |         "-t", | 
 |         type=int, | 
 |         default=None, | 
 |         help="number of threads to use for eager and inductor.", | 
 |     ) | 
 |     launcher_group = parser.add_argument_group("CPU Launcher Parameters") | 
 |     launcher_group.add_argument( | 
 |         "--enable-cpu-launcher", | 
 |         "--enable_cpu_launcher", | 
 |         action="store_true", | 
 |         default=False, | 
 |         help="Use torch.backends.xeon.run_cpu to get the peak performance on Intel(R) Xeon(R) Scalable Processors.", | 
 |     ) | 
 |     launcher_group.add_argument( | 
 |         "--cpu-launcher-args", | 
 |         "--cpu_launcher_args", | 
 |         type=str, | 
 |         default="", | 
 |         help="Provide the args of torch.backends.xeon.run_cpu. " | 
 |         "To look up what optional arguments this launcher offers: python -m torch.backends.xeon.run_cpu --help", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--no-cold-start-latency", | 
 |         action="store_true", | 
 |         default=False, | 
 |         help="Do not include --cold-start-latency on inductor benchmarks", | 
 |     ) | 
 |     parser.add_argument( | 
 |         "--inductor-compile-mode", | 
 |         default=None, | 
 |         help="torch.compile mode argument for inductor runs.", | 
 |     ) | 
 |     args = parser.parse_args() | 
 |     return args | 
 |  | 
 |  | 
 | def get_mode(args): | 
 |     if args.inference: | 
 |         return "inference" | 
 |     return "training" | 
 |  | 
 |  | 
 | def get_skip_tests(suite, device, is_training: bool): | 
 |     """ | 
 |     Generate -x seperated string to skip the unusual setup training tests | 
 |     """ | 
 |     skip_tests = set() | 
 |     original_dir = abspath(os.getcwd()) | 
 |     module = importlib.import_module(suite) | 
 |     os.chdir(original_dir) | 
 |  | 
 |     if suite == "torchbench": | 
 |         skip_tests.update(module.TorchBenchmarkRunner().skip_models) | 
 |         if is_training: | 
 |             skip_tests.update( | 
 |                 module.TorchBenchmarkRunner().skip_not_suitable_for_training_models | 
 |             ) | 
 |         if device == "cpu": | 
 |             skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cpu) | 
 |         elif device == "cuda": | 
 |             skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cuda) | 
 |     else: | 
 |         if hasattr(module, "SKIP"): | 
 |             skip_tests.update(module.SKIP) | 
 |         if is_training and hasattr(module, "SKIP_TRAIN"): | 
 |             skip_tests.update(module.SKIP_TRAIN) | 
 |  | 
 |     skip_tests = (f"-x {name}" for name in skip_tests) | 
 |     skip_str = " ".join(skip_tests) | 
 |     return skip_str | 
 |  | 
 |  | 
 | def generate_csv_name(args, dtype, suite, device, compiler, testing): | 
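    """Example result: inductor_torchbench_float32_inference_cuda_performance.csv"""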
 |     mode = get_mode(args) | 
 |     return f"{compiler}_{suite}_{dtype}_{mode}_{device}_{testing}.csv" | 
 |  | 
 |  | 
 | def generate_commands(args, dtypes, suites, devices, compilers, output_dir): | 
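    """
    Write a bash script that runs every (suite, device, dtype, compiler) combination
    for both performance and accuracy testing, and return the script's filename.
    """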
 |     mode = get_mode(args) | 
 |     suites_str = "_".join(suites) | 
 |     devices_str = "_".join(devices) | 
 |     dtypes_str = "_".join(dtypes) | 
 |     compilers_str = "_".join(compilers) | 
 |     generated_file = ( | 
 |         f"run_{mode}_{devices_str}_{dtypes_str}_{suites_str}_{compilers_str}.sh" | 
 |     ) | 
 |     with open(generated_file, "w") as runfile: | 
 |         lines = [] | 
 |  | 
 |         lines.append("#!/bin/bash") | 
 |         lines.append("set -x") | 
 |         lines.append("# Setup the output directory") | 
 |         if not args.keep_output_dir: | 
 |             lines.append(f"rm -rf {output_dir}") | 
 |         # It's ok if the output directory already exists | 
 |         lines.append(f"mkdir -p {output_dir}") | 
 |         lines.append("") | 
 |  | 
 |         for testing in ["performance", "accuracy"]: | 
            for suite, device, dtype in itertools.product(suites, devices, dtypes):
 |                 lines.append( | 
 |                     f"# Commands for {suite} for device={device}, dtype={dtype} for {mode} and for {testing} testing" | 
 |                 ) | 
 |                 info = TABLE[mode] | 
 |                 for compiler in compilers: | 
 |                     base_cmd = info[compiler] | 
 |                     output_filename = f"{output_dir}/{generate_csv_name(args, dtype, suite, device, compiler, testing)}" | 
 |                     launcher_cmd = "python" | 
 |                     if args.enable_cpu_launcher: | 
 |                         launcher_cmd = f"python -m torch.backends.xeon.run_cpu {args.cpu_launcher_args}" | 
 |                     cmd = f"{launcher_cmd} benchmarks/dynamo/{suite}.py --{testing} --{dtype} -d{device} --output={output_filename}" | 
 |                     cmd = f"{cmd} {base_cmd} {args.extra_args} --no-skip --dashboard" | 
 |                     skip_tests_str = get_skip_tests(suite, device, args.training) | 
 |                     cmd = f"{cmd} {skip_tests_str}" | 
 |  | 
 |                     if args.log_operator_inputs: | 
 |                         cmd = f"{cmd} --log-operator-inputs" | 
 |  | 
 |                     if args.quick: | 
 |                         filters = DEFAULTS["quick"][suite] | 
 |                         cmd = f"{cmd} {filters}" | 
 |  | 
 |                     if ( | 
 |                         compiler | 
 |                         in ( | 
 |                             "inductor", | 
 |                             "inductor_no_cudagraphs", | 
 |                         ) | 
 |                         and not args.no_cold_start_latency | 
 |                     ): | 
 |                         cmd = f"{cmd} --cold-start-latency" | 
 |  | 
 |                     if args.batch_size is not None: | 
 |                         cmd = f"{cmd} --batch-size {args.batch_size}" | 
 |  | 
 |                     if args.threads is not None: | 
 |                         cmd = f"{cmd} --threads {args.threads}" | 
 |  | 
 |                     if args.total_partitions is not None: | 
 |                         cmd = f"{cmd} --total-partitions {args.total_partitions}" | 
 |  | 
 |                     if args.partition_id is not None: | 
 |                         cmd = f"{cmd} --partition-id {args.partition_id}" | 
 |  | 
 |                     if args.inductor_compile_mode is not None: | 
 |                         cmd = f"{cmd} --inductor-compile-mode {args.inductor_compile_mode}" | 
 |                     lines.append(cmd) | 
 |                 lines.append("") | 
 |         runfile.writelines([line + "\n" for line in lines]) | 
 |     return generated_file | 
 |  | 
 |  | 
 | def generate_dropdown_comment(title, body): | 
 |     str_io = io.StringIO() | 
 |     str_io.write(f"{title}\n") | 
 |     str_io.write("<details>\n") | 
 |     str_io.write("<summary>see more</summary>\n") | 
 |     str_io.write(f"{body}") | 
 |     str_io.write("\n") | 
 |     str_io.write("</details>\n\n") | 
 |     return str_io.getvalue() | 
 |  | 
 |  | 
 | def build_summary(args): | 
 |     out_io = io.StringIO() | 
 |  | 
 |     def print_commit_hash(path, name): | 
 |         if args.base_sha is not None: | 
 |             if name == "pytorch": | 
 |                 out_io.write(f"{name} commit: {args.base_sha}\n") | 
 |         elif exists(path): | 
 |             import git | 
 |  | 
 |             repo = git.Repo(path, search_parent_directories=True) | 
 |             sha = repo.head.object.hexsha | 
 |             date = repo.head.object.committed_datetime | 
 |             out_io.write(f"{name} commit: {sha}\n") | 
 |             out_io.write(f"{name} commit date: {date}\n") | 
 |         else: | 
 |             out_io.write(f"{name} Absent\n") | 
 |  | 
 |     def env_var(name): | 
 |         if name in os.environ: | 
 |             out_io.write(f"{name} = {os.environ[name]}\n") | 
 |         else: | 
 |             out_io.write(f"{name} = {None}\n") | 
 |  | 
 |     out_io.write("\n") | 
 |     out_io.write("### Run name ###\n") | 
 |     out_io.write(get_archive_name(args, args.dtypes[0])) | 
 |     out_io.write("\n") | 
 |  | 
 |     out_io.write("\n") | 
 |     out_io.write("### Commit hashes ###\n") | 
 |     print_commit_hash("../pytorch", "pytorch") | 
 |     print_commit_hash("../torchbenchmark", "torchbench") | 
 |  | 
 |     out_io.write("\n") | 
 |     out_io.write("### TorchDynamo config flags ###\n") | 
 |     for key in dir(torch._dynamo.config): | 
 |         val = getattr(torch._dynamo.config, key) | 
 |         if not key.startswith("__") and isinstance(val, bool): | 
 |             out_io.write(f"torch._dynamo.config.{key} = {val}\n") | 
 |  | 
 |     out_io.write("\n") | 
 |     out_io.write("### Torch version ###\n") | 
 |     out_io.write(f"torch: {torch.__version__}\n") | 
 |  | 
 |     out_io.write("\n") | 
 |     out_io.write("### Environment variables ###\n") | 
 |     env_var("TORCH_CUDA_ARCH_LIST") | 
 |     env_var("CUDA_HOME") | 
 |     env_var("USE_LLVM") | 
 |  | 
 |     if "cuda" in args.devices: | 
 |         out_io.write("\n") | 
 |         out_io.write("### GPU details ###\n") | 
 |         out_io.write(f"CUDNN VERSION: {torch.backends.cudnn.version()}\n") | 
 |         out_io.write(f"Number CUDA Devices: {torch.cuda.device_count()}\n") | 
 |         out_io.write(f"Device Name: {torch.cuda.get_device_name(0)}\n") | 
 |         out_io.write( | 
 |             f"Device Memory [GB]: {torch.cuda.get_device_properties(0).total_memory/1e9}\n" | 
 |         ) | 
 |  | 
 |     title = "## Build Summary" | 
 |     comment = generate_dropdown_comment(title, out_io.getvalue()) | 
 |     with open(f"{output_dir}/gh_build_summary.txt", "w") as gh_fh: | 
 |         gh_fh.write(comment) | 
 |  | 
 |  | 
 | @functools.lru_cache(None) | 
 | def archive_data(archive_name): | 
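    """
    Return a (day, prefix) pair identifying an archived run. If archive_name is
    given, both are parsed out of it; otherwise they are derived from the current
    date (UTC-8), with `day` being the day of the year.
    """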
 |     if archive_name is not None: | 
 |         prefix_match = re.search(r"\w+(?=_performance)", archive_name) | 
 |         if prefix_match is not None: | 
 |             prefix = prefix_match.group(0) | 
 |         else: | 
 |             prefix = "" | 
 |         day_match = re.search(r"day_(\d+)_", archive_name) | 
 |         if day_match is not None: | 
 |             day = day_match.group(1) | 
 |         else: | 
 |             day = "000" | 
 |     else: | 
 |         now = datetime.now(tz=timezone(timedelta(hours=-8))) | 
 |         day = now.strftime("%j") | 
 |         prefix = now.strftime(f"day_{day}_%d_%m_%y") | 
 |     return day, prefix | 
 |  | 
 |  | 
 | @functools.lru_cache(None) | 
 | def default_archive_name(dtype): | 
 |     _, prefix = archive_data(None) | 
 |     return f"{prefix}_performance_{dtype}_{randint(100, 999)}" | 
 |  | 
 |  | 
 | def get_archive_name(args, dtype): | 
 |     return ( | 
 |         default_archive_name(dtype) if args.archive_name is None else args.archive_name | 
 |     ) | 
 |  | 
 |  | 
 | def archive(src_dir, dest_dir_prefix, archive_name, dtype): | 
 |     if archive_name is None: | 
 |         archive_name = default_archive_name(dtype) | 
 |     # Copy the folder to archived location | 
 |     dest = os.path.join(dest_dir_prefix, archive_name) | 
 |     shutil.copytree(src_dir, dest, dirs_exist_ok=True) | 
 |     print(f"copied contents of {src_dir} to {dest}") | 
 |  | 
 |  | 
 | def get_metric_title(metric): | 
 |     if metric == "speedup": | 
 |         return "Performance speedup" | 
 |     elif metric == "accuracy": | 
 |         return "Accuracy" | 
 |     elif metric == "compilation_latency": | 
 |         return "Compilation latency (sec)" | 
 |     elif metric == "compression_ratio": | 
 |         return "Peak Memory Compression Ratio" | 
 |     elif metric == "abs_latency": | 
 |         return "Absolute latency (ms)" | 
 |     raise RuntimeError("unknown metric") | 
 |  | 
 |  | 
 | class Parser: | 
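    """
    Base class holding the benchmark configuration (suites, devices, dtypes,
    compilers) shared by the log parsers.
    """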
 |     def __init__( | 
 |         self, suites, devices, dtypes, compilers, flag_compilers, mode, output_dir | 
 |     ): | 
 |         self.suites = suites | 
 |         self.devices = devices | 
 |         self.dtypes = dtypes | 
 |         self.compilers = compilers | 
 |         self.flag_compilers = flag_compilers | 
 |         self.output_dir = output_dir | 
 |         self.mode = mode | 
 |  | 
 |     def has_header(self, output_filename): | 
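        # The benchmark CSVs may or may not include a header row; sniff the first
        # line for the "dev" column name to decide how to parse the file.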
 |         header_present = False | 
 |         with open(output_filename) as f: | 
 |             line = f.readline() | 
 |             if "dev" in line: | 
 |                 header_present = True | 
 |         return header_present | 
 |  | 
 |  | 
 | class ParsePerformanceLogs(Parser): | 
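    """
    Parses the accuracy and performance CSVs produced by the benchmark runs and
    generates the executive summary, warnings and per-suite tables used in the
    dashboard comment.
    """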
 |     def __init__( | 
 |         self, | 
 |         suites, | 
 |         devices, | 
 |         dtypes, | 
 |         compilers, | 
 |         flag_compilers, | 
 |         mode, | 
 |         output_dir, | 
 |         include_slowdowns=False, | 
 |     ): | 
 |         super().__init__( | 
 |             suites, | 
 |             devices, | 
 |             dtypes, | 
 |             compilers, | 
 |             flag_compilers, | 
 |             mode, | 
 |             output_dir, | 
 |         ) | 
 |         self.parsed_frames = defaultdict(lambda: defaultdict(None)) | 
 |         self.untouched_parsed_frames = defaultdict(lambda: defaultdict(None)) | 
 |         self.metrics = [ | 
 |             "speedup", | 
 |             "abs_latency", | 
 |             "compilation_latency", | 
 |             "compression_ratio", | 
 |         ] | 
        self.bottom_k = 50
        self.include_slowdowns = include_slowdowns
        self.parse()
 |  | 
 |     def plot_graph(self, df, title): | 
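        # The first three columns are dev/name/batch_size; the remaining columns
        # (one per compiler) are plotted as grouped bars.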
 |         labels = df.columns.values.tolist() | 
 |         labels = labels[3:] | 
 |         df.plot( | 
 |             x="name", | 
 |             y=labels, | 
 |             kind="bar", | 
 |             width=0.65, | 
 |             title=title, | 
 |             ylabel="Speedup over eager", | 
 |             xlabel="", | 
 |             grid=True, | 
 |             figsize=(max(len(df.index) / 4, 5), 10), | 
 |             edgecolor="black", | 
 |         ) | 
 |         plt.tight_layout() | 
 |         plt.savefig(f"{self.output_dir}/{title}.png") | 
 |  | 
 |     def read_csv(self, output_filename): | 
 |         if self.has_header(output_filename): | 
 |             return pd.read_csv(output_filename) | 
 |         else: | 
 |             return pd.read_csv( | 
 |                 output_filename, | 
 |                 names=[ | 
 |                     "dev", | 
 |                     "name", | 
 |                     "batch_size", | 
 |                     "speedup", | 
 |                     "abs_latency", | 
 |                     "compilation_latency", | 
 |                     "compression_ratio", | 
 |                 ], | 
 |                 header=None, | 
 |                 engine="python", | 
 |             ) | 
 |  | 
 |     def parse(self): | 
 |         self.extract_df("accuracy", "accuracy") | 
 |         for metric in self.metrics: | 
 |             self.extract_df(metric, "performance") | 
 |  | 
 |     def clean_batch_sizes(self, frames): | 
        # Reconcile batch sizes across frames: a 0 entry is replaced with the
        # maximum batch size reported for the same model by the other frames.
 |         if len(frames) == 1: | 
 |             return frames | 
 |         batch_sizes = frames[0]["batch_size"].to_list() | 
 |         for frame in frames[1:]: | 
 |             frame_batch_sizes = frame["batch_size"].to_list() | 
 |             for idx, (batch_a, batch_b) in enumerate( | 
 |                 zip(batch_sizes, frame_batch_sizes) | 
 |             ): | 
                assert (
                    batch_a == batch_b or batch_a == 0 or batch_b == 0
                ), f"a={batch_a}, b={batch_b}"
 |                 batch_sizes[idx] = max(batch_a, batch_b) | 
 |         for frame in frames: | 
 |             frame["batch_size"] = batch_sizes | 
 |         return frames | 
 |  | 
 |     def extract_df(self, metric, testing): | 
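        """
        For each (suite, device, dtype) combination, build a dataframe with one
        column per compiler holding `metric`. For performance metrics, models that
        fail the accuracy check are zeroed out.
        """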
        for suite, device, dtype in itertools.product(
            self.suites, self.devices, self.dtypes
        ):
 |             frames = [] | 
 |             for compiler in self.compilers: | 
 |                 output_filename = f"{self.output_dir}/{compiler}_{suite}_{dtype}_{self.mode}_{device}_{testing}.csv" | 
 |                 df = self.read_csv(output_filename) | 
 |                 if metric not in df: | 
 |                     df.insert(len(df.columns), metric, np.nan) | 
 |                 df = df[["dev", "name", "batch_size", metric]] | 
 |                 df.rename(columns={metric: compiler}, inplace=True) | 
 |                 df["batch_size"] = df["batch_size"].astype(int) | 
 |                 frames.append(df) | 
 |  | 
 |             # Merge the results | 
 |             frames = self.clean_batch_sizes(frames) | 
 |             if len(self.compilers) == 1: | 
 |                 df = frames[0] | 
 |             else: | 
 |                 # Merge data frames | 
 |                 df = pd.merge(frames[0], frames[1], on=["dev", "name", "batch_size"]) | 
 |                 for idx in range(2, len(frames)): | 
 |                     df = pd.merge(df, frames[idx], on=["dev", "name", "batch_size"]) | 
 |  | 
 |             if testing == "performance": | 
 |                 for compiler in self.compilers: | 
 |                     df[compiler] = pd.to_numeric(df[compiler], errors="coerce").fillna( | 
 |                         0 | 
 |                     ) | 
 |  | 
 |             df_copy = df.copy() | 
 |             df_copy = df_copy.sort_values( | 
 |                 by=list(reversed(self.compilers)), ascending=False | 
 |             ) | 
 |             if "inductor" in self.compilers: | 
 |                 df_copy = df_copy.sort_values(by="inductor", ascending=False) | 
 |             self.untouched_parsed_frames[suite][metric] = df_copy | 
 |  | 
 |             if testing == "performance": | 
 |                 df_accuracy = self.parsed_frames[suite]["accuracy"] | 
 |                 perf_rows = [] | 
 |                 for model_name in df["name"]: | 
 |                     perf_row = df[df["name"] == model_name].copy() | 
 |                     acc_row = df_accuracy[df_accuracy["name"] == model_name] | 
 |                     for compiler in self.compilers: | 
 |                         if not perf_row.empty: | 
 |                             if acc_row.empty: | 
 |                                 perf_row[compiler] = 0.0 | 
 |                             elif acc_row[compiler].iloc[0] not in ( | 
 |                                 "pass", | 
 |                                 "pass_due_to_skip", | 
 |                             ): | 
 |                                 perf_row[compiler] = 0.0 | 
 |                     perf_rows.append(perf_row) | 
 |                 df = pd.concat(perf_rows) | 
 |             df = df.sort_values(by=list(reversed(self.compilers)), ascending=False) | 
 |  | 
 |             if "inductor" in self.compilers: | 
 |                 df = df.sort_values(by="inductor", ascending=False) | 
 |             self.parsed_frames[suite][metric] = df | 
 |  | 
 |     def get_passing_entries(self, compiler, df): | 
 |         return df[compiler][df[compiler] > 0] | 
 |  | 
 |     def comp_time(self, compiler, df): | 
 |         df = self.get_passing_entries(compiler, df) | 
 |         # df = df.sort_values(by=compiler, ascending=False)[compiler][: self.bottom_k] | 
 |         if df.empty: | 
 |             return "0.0" | 
 |  | 
 |         return f"{df.mean():.2f}" | 
 |  | 
 |     def geomean(self, compiler, df): | 
 |         cleaned_df = self.get_passing_entries(compiler, df) | 
 |         if not self.include_slowdowns: | 
 |             cleaned_df = cleaned_df.clip(1) | 
 |         if cleaned_df.empty: | 
 |             return "0.0x" | 
 |         return f"{gmean(cleaned_df):.2f}x" | 
 |  | 
 |     def passrate(self, compiler, df): | 
 |         total = len(df.index) | 
 |         passing = df[df[compiler] > 0.0][compiler].count() | 
 |         perc = int(percentage(passing, total, decimals=0)) | 
 |         return f"{perc}%, {passing}/{total}" | 
 |  | 
 |     def memory(self, compiler, df): | 
 |         df = self.get_passing_entries(compiler, df) | 
 |         df = df.fillna(0) | 
 |         df = df[df > 0] | 
 |         if df.empty: | 
 |             return "0.0x" | 
 |         return f"{df.mean():.2f}x" | 
 |  | 
 |     def exec_summary_df(self, fn, metric): | 
 |         """ | 
 |         Generate a table with passrate and geomean perf | 
 |         """ | 
 |         cols = {} | 
 |         cols["Compiler"] = self.compilers | 
 |         for suite in self.suites: | 
 |             df = self.parsed_frames[suite][metric] | 
 |             # speedups = [self.geomean(compiler, df) for compiler in self.compilers] | 
 |             speedups = [fn(compiler, df) for compiler in self.compilers] | 
 |             col = pd.Series(data=speedups, index=self.compilers) | 
 |             cols[suite] = col | 
 |         df = pd.DataFrame(cols) | 
 |         df = df.fillna(0) | 
 |         df.to_csv(os.path.join(self.output_dir, f"{fn.__name__}.csv")) | 
 |         return df | 
 |  | 
 |     def exec_summary_text(self, caption, fn, metric): | 
 |         df = self.exec_summary_df(fn, metric) | 
 |         tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never") | 
 |  | 
 |         str_io = io.StringIO() | 
 |         str_io.write(f"{caption}") | 
 |         str_io.write("~~~\n") | 
 |         str_io.write(f"{tabform}\n") | 
 |         str_io.write("~~~\n") | 
 |         return str_io.getvalue() | 
 |  | 
 |     def generate_executive_summary(self): | 
 |         machine = "A100 GPUs" | 
 |         if "cpu" in self.devices: | 
 |             get_machine_cmd = "lscpu| grep 'Model name' | awk -F':' '{print $2}'" | 
 |             machine = subprocess.getstatusoutput(get_machine_cmd)[1].strip() | 
 |         description = ( | 
 |             "We evaluate different backends " | 
 |             "across three benchmark suites - torchbench, huggingface and timm. We run " | 
 |             "these experiments on " | 
 |             + machine | 
 |             + ". Each experiment runs one iteration of forward pass " | 
 |             "and backward pass for training and forward pass only for inference. " | 
 |             "For accuracy, we check the numerical correctness of forward pass outputs and gradients " | 
 |             "by comparing with native pytorch. We measure speedup " | 
 |             "by normalizing against the performance of native pytorch. We report mean " | 
 |             "compilation latency numbers and peak memory footprint reduction ratio. \n\n" | 
 |             "Caveats\n" | 
 |             "1) Batch size has been reduced to workaround OOM errors. Work is in progress to " | 
 |             "reduce peak memory footprint.\n" | 
 |             "2) Experiments do not cover dynamic shapes.\n" | 
 |             "3) Experimental setup does not have optimizer.\n\n" | 
 |         ) | 
 |         comment = generate_dropdown_comment("", description) | 
 |         str_io = io.StringIO() | 
 |         str_io.write("\n") | 
 |         str_io.write("## Executive Summary ##\n") | 
 |         str_io.write(comment) | 
 |  | 
 |         speedup_caption = "Geometric mean speedup \n" | 
 |         speedup_summary = self.exec_summary_text( | 
 |             speedup_caption, self.geomean, "speedup" | 
 |         ) | 
 |  | 
 |         passrate_caption = "Passrate\n" | 
 |         passrate_summary = self.exec_summary_text( | 
 |             passrate_caption, self.passrate, "speedup" | 
 |         ) | 
 |  | 
 |         comp_time_caption = "Mean compilation time (seconds)\n" | 
 |         comp_time_summary = self.exec_summary_text( | 
 |             comp_time_caption, self.comp_time, "compilation_latency" | 
 |         ) | 
 |  | 
 |         peak_memory_caption = ( | 
 |             "Peak memory footprint compression ratio (higher is better)\n" | 
 |         ) | 
 |         peak_memory_summary = self.exec_summary_text( | 
 |             peak_memory_caption, self.memory, "compression_ratio" | 
 |         ) | 
 |  | 
 |         str_io.write( | 
 |             "To measure performance, compilation latency and memory footprint reduction, " | 
 |             "we remove the models that fail accuracy checks.\n\n" | 
 |         ) | 
 |         str_io.write(passrate_summary) | 
 |         str_io.write(speedup_summary) | 
 |         str_io.write(comp_time_summary) | 
 |         str_io.write(peak_memory_summary) | 
 |         self.executive_summary = str_io.getvalue() | 
 |  | 
 |     def flag_bad_entries(self, suite, metric, flag_fn): | 
 |         df = self.untouched_parsed_frames[suite][metric] | 
 |         df = df.drop("dev", axis=1) | 
 |         df = df.rename(columns={"batch_size": "bs"}) | 
        # Apply flag_fn elementwise to the flag_compilers columns;
        # if any element is flagged, the entire row is flagged.
 |         flag = np.logical_or.reduce( | 
 |             df[self.flag_compilers].applymap(flag_fn), | 
 |             axis=1, | 
 |         ) | 
 |         df = df[flag] | 
 |         df = df.assign(suite=suite) | 
 |         return df.reindex(columns=["suite", "name"] + self.flag_compilers) | 
 |  | 
 |     def generate_warnings(self): | 
 |         title = "## Warnings ##" | 
 |         body = ( | 
 |             "We flag models where:\n\n" | 
 |             " - accuracy fails\n" | 
 |             " - speedup < 0.95x (NOTE: 0.0 speedup typically signifies a failure in the performance test)\n" | 
 |             " - compilation latency > 120 sec.\n" | 
 |             " - compression ratio < 0.9\n" | 
 |             "\n" | 
 |         ) | 
 |         for metric in [ | 
 |             "accuracy", | 
 |             "speedup", | 
 |             "compilation_latency", | 
 |             "compression_ratio", | 
 |         ]: | 
 |             dfs = [] | 
 |             for suite in self.suites: | 
 |                 dfs.append(self.flag_bad_entries(suite, metric, FLAG_FNS[metric])) | 
 |             df = pd.concat(dfs, axis=0) | 
 |             if df.empty: | 
 |                 continue | 
 |             tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never") | 
 |             str_io = io.StringIO() | 
 |             str_io.write("\n") | 
 |             str_io.write(get_metric_title(metric) + " warnings\n") | 
 |             str_io.write("~~~\n") | 
 |             str_io.write(f"{tabform}\n") | 
 |             str_io.write("~~~\n") | 
 |             body += str_io.getvalue() | 
 |  | 
 |         comment = generate_dropdown_comment(title, body) | 
 |         return comment | 
 |  | 
 |     def prepare_message(self, suite): | 
 |         title = f"## {suite} suite with {self.dtypes[0]} precision ##" | 
 |         body = "" | 
 |         for metric in [ | 
 |             "speedup", | 
 |             "accuracy", | 
 |             "compilation_latency", | 
 |             "compression_ratio", | 
 |             "abs_latency", | 
 |         ]: | 
 |             df = self.untouched_parsed_frames[suite][metric] | 
 |             df = df.drop("dev", axis=1) | 
 |             df = df.rename(columns={"batch_size": "bs"}) | 
 |             tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never") | 
 |             str_io = io.StringIO() | 
 |             str_io.write("\n") | 
 |             str_io.write(get_metric_title(metric) + "\n") | 
 |             str_io.write("~~~\n") | 
 |             str_io.write(f"{tabform}\n") | 
 |             str_io.write("~~~\n") | 
 |             body += str_io.getvalue() | 
 |  | 
 |         comment = generate_dropdown_comment(title, body) | 
 |         return comment | 
 |  | 
 |     def gen_summary_files(self): | 
 |         self.generate_executive_summary() | 
 |         for suite in self.suites: | 
 |             self.plot_graph( | 
 |                 self.untouched_parsed_frames[suite]["speedup"], | 
 |                 f"{suite}_{self.dtypes[0]}", | 
 |             ) | 
 |  | 
 |         with open(f"{self.output_dir}/gh_title.txt", "w") as gh_fh: | 
 |             str_io = io.StringIO() | 
 |             str_io.write("\n") | 
 |             str_io.write(f"# Performance Dashboard for {self.dtypes[0]} precision ##\n") | 
 |             str_io.write("\n") | 
 |             gh_fh.write(str_io.getvalue()) | 
 |  | 
 |         with open(f"{self.output_dir}/gh_executive_summary.txt", "w") as gh_fh: | 
 |             gh_fh.write(self.executive_summary) | 
 |  | 
 |         with open(f"{self.output_dir}/gh_warnings.txt", "w") as gh_fh: | 
 |             warnings_body = self.generate_warnings() | 
 |             gh_fh.write(warnings_body) | 
 |  | 
 |         str_io = io.StringIO() | 
 |         for suite in self.suites: | 
 |             str_io.write(self.prepare_message(suite)) | 
 |         str_io.write("\n") | 
 |         with open(f"{self.output_dir}/gh_{self.mode}.txt", "w") as gh_fh: | 
 |             gh_fh.write(str_io.getvalue()) | 
 |  | 
 |  | 
 | def parse_logs(args, dtypes, suites, devices, compilers, flag_compilers, output_dir): | 
 |     mode = get_mode(args) | 
 |     build_summary(args) | 
 |     include_slowdowns = args.include_slowdowns | 
 |  | 
 |     parser_class = ParsePerformanceLogs | 
 |     parser = parser_class( | 
 |         suites, | 
 |         devices, | 
 |         dtypes, | 
 |         compilers, | 
 |         flag_compilers, | 
 |         mode, | 
 |         output_dir, | 
 |         include_slowdowns, | 
 |     ) | 
 |     parser.gen_summary_files() | 
 |     return | 
 |  | 
 |  | 
 | @dataclasses.dataclass | 
 | class LogInfo: | 
 |     # Day of the year this log was generated | 
 |     day: str | 
 |  | 
 |     # Directory path where all logs are present | 
 |     dir_path: str | 
 |  | 
 |  | 
 | def get_date(log_info): | 
 |     return datetime.strptime(f"{log_info.day}", "%j").strftime("%m-%d") | 
 |  | 
 |  | 
 | def find_last_2_with_filenames(lookup_file, dashboard_archive_path, dtype, filenames): | 
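    """
    Walk lookup.csv from the most recent entry backwards and return the two most
    recent archived performance report directories (for `dtype`) that contain all
    of `filenames`, or None if fewer than two exist.
    """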
 |     df = pd.read_csv(lookup_file, names=("day", "mode", "prec", "path")) | 
 |     df = df[df["mode"] == "performance"] | 
 |     df = df[df["prec"] == dtype] | 
 |     df = df[::-1] | 
 |     last2 = [] | 
 |     for path in df["path"]: | 
 |         output_dir = os.path.join(dashboard_archive_path, path) | 
 |         fullpaths = [ | 
 |             os.path.join(dashboard_archive_path, path, name) for name in filenames | 
 |         ] | 
 |         if all(os.path.exists(fullpath) for fullpath in fullpaths): | 
 |             last2.append(output_dir) | 
 |         if len(last2) >= 2: | 
 |             return last2 | 
 |     return None | 
 |  | 
 |  | 
 | class SummaryStatDiffer: | 
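    """
    Compares the passrate and geomean speedup summary statistics of the current
    report against the previous archived report.
    """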
 |     def __init__(self, args): | 
 |         self.args = args | 
 |         self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv") | 
 |         assert os.path.exists(self.lookup_file) | 
 |  | 
 |     def generate_diff(self, last2, filename, caption): | 
 |         df_cur, df_prev = (pd.read_csv(os.path.join(path, filename)) for path in last2) | 
 |         df_merge = df_cur.merge(df_prev, on="Compiler", suffixes=("_cur", "_prev")) | 
 |         data = {col: [] for col in ("compiler", "suite", "prev_value", "cur_value")} | 
 |         for _, row in df_merge.iterrows(): | 
 |             if row["Compiler"] in self.args.flag_compilers: | 
 |                 for suite in self.args.suites: | 
 |                     if suite + "_prev" not in row or suite + "_cur" not in row: | 
 |                         continue | 
 |                     data["compiler"].append(row["Compiler"]) | 
 |                     data["suite"].append(suite) | 
 |                     data["prev_value"].append(row[suite + "_prev"]) | 
 |                     data["cur_value"].append(row[suite + "_cur"]) | 
 |  | 
 |         df = pd.DataFrame(data) | 
 |         tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never") | 
 |         str_io = io.StringIO() | 
 |         str_io.write("\n") | 
 |         str_io.write(f"{caption}\n") | 
 |         str_io.write("~~~\n") | 
 |         str_io.write(f"{tabform}\n") | 
 |         str_io.write("~~~\n") | 
 |         return str_io.getvalue() | 
 |  | 
 |     def generate_comment(self): | 
 |         title = "## Summary Statistics Diff ##\n" | 
 |         body = ( | 
 |             "For each relevant compiler, we compare the summary statistics " | 
 |             "for the most 2 recent reports that actually run the compiler.\n\n" | 
 |         ) | 
 |         dtype = self.args.dtypes[0] | 
 |         last2 = find_last_2_with_filenames( | 
 |             self.lookup_file, | 
 |             self.args.dashboard_archive_path, | 
 |             dtype, | 
 |             ["geomean.csv", "passrate.csv"], | 
 |         ) | 
 |  | 
 |         if last2 is None: | 
 |             body += "Could not find most 2 recent reports.\n\n" | 
 |         else: | 
 |             for state, path in zip(("Current", "Previous"), last2): | 
 |                 body += f"{state} report name: {path}\n\n" | 
 |             body += self.generate_diff(last2, "passrate.csv", "Passrate diff") | 
 |             body += self.generate_diff( | 
 |                 last2, "geomean.csv", "Geometric mean speedup diff" | 
 |             ) | 
 |  | 
 |         comment = generate_dropdown_comment(title, body) | 
 |  | 
 |         with open(f"{self.args.output_dir}/gh_summary_diff.txt", "w") as gh_fh: | 
 |             gh_fh.write(comment) | 
 |  | 
 |  | 
 | class RegressionDetector: | 
 |     """ | 
 |     Compares the most recent 2 benchmarks to find previously unflagged models | 
 |     that are now flagged. | 
 |     """ | 
 |  | 
 |     def __init__(self, args): | 
 |         self.args = args | 
 |         self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv") | 
 |         assert os.path.exists(self.lookup_file) | 
 |  | 
 |     def generate_comment(self): | 
 |         title = "## Recent Regressions ##\n" | 
 |         body = ( | 
 |             "For each relevant compiler, we compare the most recent 2 reports " | 
 |             "(that actually run the compiler) to find previously unflagged " | 
 |             "models that are now flagged as problematic (according to the " | 
 |             "'Warnings' section).\n\n" | 
 |         ) | 
 |         dtype = self.args.dtypes[0] | 
 |         device = self.args.devices[0] | 
 |         for suite in self.args.suites: | 
 |             body += f"### Regressions for {suite} ###\n" | 
 |             last2 = {} | 
 |  | 
 |             for compiler in self.args.flag_compilers: | 
 |                 filenames = [ | 
 |                     generate_csv_name( | 
 |                         self.args, dtype, suite, device, compiler, testing | 
 |                     ) | 
 |                     for testing in ["performance", "accuracy"] | 
 |                 ] | 
 |                 compiler_last2 = find_last_2_with_filenames( | 
 |                     self.lookup_file, self.args.dashboard_archive_path, dtype, filenames | 
 |                 ) | 
 |                 if compiler_last2 is not None: | 
 |                     last2[compiler] = [ | 
 |                         ParsePerformanceLogs( | 
 |                             [suite], | 
 |                             [device], | 
 |                             [dtype], | 
 |                             [compiler], | 
 |                             [compiler], | 
 |                             get_mode(self.args), | 
 |                             output_dir, | 
 |                         ) | 
 |                         for output_dir in compiler_last2 | 
 |                     ] | 
 |                     for state, path in zip(("Current", "Previous"), compiler_last2): | 
 |                         body += ( | 
 |                             f"{state} report name (compiler: {compiler}, " | 
 |                             f"suite: {suite}): {path}\n\n" | 
 |                         ) | 
 |  | 
 |             regressions_present = False | 
 |             for metric in [ | 
 |                 "accuracy", | 
 |                 "speedup", | 
 |                 "compilation_latency", | 
 |                 "compression_ratio", | 
 |             ]: | 
 |                 dfs = [] | 
 |                 for compiler in self.args.flag_compilers: | 
                    if last2.get(compiler) is None:
 |                         continue | 
 |  | 
 |                     df_cur, df_prev = ( | 
 |                         last2[compiler][i].untouched_parsed_frames[suite][metric] | 
 |                         for i in (0, 1) | 
 |                     ) | 
 |                     df_merge = df_cur.merge( | 
 |                         df_prev, on="name", suffixes=("_cur", "_prev") | 
 |                     ) | 
 |                     flag_fn = FLAG_FNS[metric] | 
 |                     flag = np.logical_and( | 
 |                         df_merge[compiler + "_prev"].apply( | 
 |                             lambda x: not pd.isna(x) and not flag_fn(x) | 
 |                         ), | 
 |                         df_merge[compiler + "_cur"].apply( | 
 |                             lambda x: not pd.isna(x) and flag_fn(x) | 
 |                         ), | 
 |                     ) | 
 |                     df_bad = df_merge[flag] | 
 |                     dfs.append( | 
 |                         pd.DataFrame( | 
 |                             data={ | 
 |                                 "compiler": compiler, | 
 |                                 "name": df_bad["name"], | 
 |                                 "prev_status": df_bad[compiler + "_prev"], | 
 |                                 "cur_status": df_bad[compiler + "_cur"], | 
 |                             } | 
 |                         ) | 
 |                     ) | 
 |  | 
 |                 if not dfs: | 
 |                     continue | 
 |                 df = pd.concat(dfs, axis=0) | 
 |                 if df.empty: | 
 |                     continue | 
 |                 regressions_present = True | 
 |                 tabform = tabulate( | 
 |                     df, headers="keys", tablefmt="pretty", showindex="never" | 
 |                 ) | 
 |                 str_io = io.StringIO() | 
 |                 str_io.write("\n") | 
 |                 str_io.write(f"{get_metric_title(metric)} regressions\n") | 
 |                 str_io.write("~~~\n") | 
 |                 str_io.write(f"{tabform}\n") | 
 |                 str_io.write("~~~\n") | 
 |                 body += str_io.getvalue() | 
 |  | 
 |             if not regressions_present: | 
 |                 body += "No regressions found.\n" | 
 |  | 
 |         comment = generate_dropdown_comment(title, body) | 
 |  | 
 |         with open(f"{self.args.output_dir}/gh_metric_regression.txt", "w") as gh_fh: | 
 |             gh_fh.write(comment) | 
 |  | 
 |  | 
 | class RegressionTracker: | 
 |     """ | 
 |     Plots progress of different metrics over time to detect regressions. | 
 |     """ | 
 |  | 
 |     def __init__(self, args): | 
 |         self.args = args | 
 |         self.suites = self.args.suites | 
 |         self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv") | 
 |         assert os.path.exists(self.lookup_file) | 
 |         self.k = 10 | 
 |  | 
 |     def find_last_k(self): | 
 |         """ | 
 |         Find the last k pairs of (day number, log_path) | 
 |         """ | 
 |         dtype = self.args.dtypes[0] | 
 |         df = pd.read_csv(self.lookup_file, names=("day", "mode", "prec", "path")) | 
 |         df = df[df["mode"] == "performance"] | 
 |         df = df[df["prec"] == dtype] | 
 |         log_infos = [] | 
 |         for day, path in zip(df["day"], df["path"]): | 
 |             log_infos.append(LogInfo(day, path)) | 
 |  | 
 |         assert len(log_infos) >= self.k | 
 |         log_infos = log_infos[len(log_infos) - self.k :] | 
 |         return log_infos | 
 |  | 
 |     def generate_comment(self): | 
 |         title = "## Metrics over time ##\n" | 
 |         str_io = io.StringIO() | 
 |         if not self.args.update_dashboard_test and not self.args.no_graphs: | 
 |             for name in glob.glob(self.args.output_dir + "/*over_time.png"): | 
 |                 output = ( | 
 |                     subprocess.check_output([self.args.dashboard_image_uploader, name]) | 
 |                     .decode("ascii") | 
 |                     .rstrip() | 
 |                 ) | 
 |                 str_io.write(f"\n{name} : \n") | 
 |         comment = generate_dropdown_comment(title, str_io.getvalue()) | 
 |  | 
 |         with open(f"{self.args.output_dir}/gh_regression.txt", "w") as gh_fh: | 
 |             gh_fh.write(comment) | 
 |  | 
 |     def diff(self): | 
 |         log_infos = self.find_last_k() | 
 |  | 
 |         for metric in ["geomean", "passrate", "comp_time", "memory"]: | 
 |             fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5)) | 
 |             for idx, suite in enumerate(self.suites): | 
 |                 dfs = [] | 
 |                 for log_info in log_infos: | 
 |                     dir_path = os.path.join( | 
 |                         self.args.dashboard_archive_path, log_info.dir_path | 
 |                     ) | 
 |                     assert os.path.exists(dir_path) | 
 |                     gmean_filename = os.path.join(dir_path, f"{metric}.csv") | 
 |                     if not os.path.exists(gmean_filename): | 
 |                         continue | 
 |                     df = pd.read_csv(gmean_filename) | 
 |                     if suite not in df: | 
 |                         continue | 
 |                     if metric == "geomean" or metric == "memory": | 
 |                         df[suite] = df[suite].str.replace("x", "").astype(float) | 
 |                     elif metric == "passrate": | 
 |                         df[suite] = df[suite].str.split("%").str[0].astype(float) | 
 |                     df.insert(0, "day", get_date(log_info)) | 
 |                     df = df.pivot(index="day", columns="Compiler", values=suite) | 
 |  | 
 |                     # Interim stage when both inductor_cudagraphs and inductor exist | 
 |                     df = df.rename(columns={"inductor_cudagraphs": "inductor"}) | 
 |                     for col_name in df.columns: | 
 |                         if col_name not in self.args.compilers: | 
 |                             df = df.drop(columns=[col_name]) | 
 |                     dfs.append(df) | 
 |  | 
 |                 df = pd.concat(dfs) | 
 |                 df = df.interpolate(method="linear") | 
 |                 ax = df.plot( | 
 |                     ax=axes[idx], | 
 |                     kind="line", | 
 |                     ylabel=metric, | 
 |                     xlabel="Date", | 
 |                     grid=True, | 
 |                     ylim=0 if metric == "passrate" else 0.8, | 
 |                     title=suite, | 
 |                     style=".-", | 
 |                     legend=False, | 
 |                 ) | 
 |                 ax.legend(loc="lower right", ncol=2) | 
 |  | 
 |             plt.tight_layout() | 
 |             plt.savefig(os.path.join(output_dir, f"{metric}_over_time.png")) | 
 |  | 
 |         self.generate_comment() | 
 |  | 
 |  | 
 | class DashboardUpdater: | 
 |     """ | 
 |     Aggregates the information and makes a comment to Performance Dashboard. | 
 |     https://github.com/pytorch/torchdynamo/issues/681 | 
 |     """ | 
 |  | 
 |     def __init__(self, args): | 
 |         self.args = args | 
 |         self.output_dir = args.output_dir | 
 |         self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv") | 
 |         assert os.path.exists(self.lookup_file) | 
 |         try: | 
 |             if not self.args.update_dashboard_test and not self.args.no_update_archive: | 
 |                 self.update_lookup_file() | 
 |         except subprocess.CalledProcessError: | 
 |             sys.stderr.write("failed to update lookup file\n") | 
 |  | 
 |     def update_lookup_file(self): | 
 |         dtype = self.args.dtypes[0] | 
 |         day, _ = archive_data(self.args.archive_name) | 
 |         target_dir = get_archive_name(self.args, dtype) | 
        # Append this run (day, performance, dtype, archived folder name) to the lookup csv
 |         subprocess.check_call( | 
 |             f'echo "{day},performance,{dtype},{target_dir}" >> {self.lookup_file}', | 
 |             shell=True, | 
 |         ) | 
 |  | 
 |     def archive(self): | 
 |         dtype = self.args.dtypes[0] | 
 |         # Copy the folder to archived location | 
 |         archive( | 
 |             self.output_dir, | 
 |             self.args.dashboard_archive_path, | 
 |             self.args.archive_name, | 
 |             dtype, | 
 |         ) | 
 |  | 
 |     def upload_graphs(self): | 
 |         title = "## Performance graphs ##\n" | 
 |         str_io = io.StringIO() | 
 |         if not self.args.update_dashboard_test and not self.args.no_graphs: | 
 |             for name in glob.glob(self.output_dir + "/*png"): | 
 |                 if "over_time" not in name: | 
 |                     output = ( | 
 |                         subprocess.check_output( | 
 |                             [self.args.dashboard_image_uploader, name] | 
 |                         ) | 
 |                         .decode("ascii") | 
 |                         .rstrip() | 
 |                     ) | 
 |                     str_io.write(f"\n{name} : \n") | 
 |         comment = generate_dropdown_comment(title, str_io.getvalue()) | 
 |  | 
 |         with open(f"{self.output_dir}/gh_graphs.txt", "w") as gh_fh: | 
 |             gh_fh.write(comment) | 
 |  | 
 |     def gen_comment(self): | 
 |         files = [ | 
 |             "gh_title.txt", | 
 |             "gh_executive_summary.txt", | 
 |             "gh_summary_diff.txt", | 
 |             "gh_warnings.txt", | 
 |             "gh_regression.txt", | 
 |             "gh_metric_regression.txt", | 
 |             "gh_training.txt" if self.args.training else "gh_inference.txt", | 
 |             "gh_graphs.txt", | 
 |             "gh_build_summary.txt", | 
 |         ] | 
 |         all_lines = [] | 
 |         for f in files: | 
 |             try: | 
 |                 with open(os.path.join(self.output_dir, f)) as fh: | 
 |                     all_lines.extend(fh.readlines()) | 
 |             except FileNotFoundError: | 
 |                 pass | 
 |  | 
 |         return "\n".join([x.rstrip() for x in all_lines]) | 
 |  | 
 |     def comment_on_gh(self, comment): | 
 |         """ | 
 |         Send a commment to dashboard | 
 |         """ | 
 |         with tempfile.NamedTemporaryFile(mode="w", delete=False) as f: | 
 |             f.write(comment) | 
 |             filename = f.name | 
 |  | 
 |         issue_number = "93794" | 
 |         if self.args.dtypes[0] == "float32": | 
 |             issue_number = "93518" | 
 |  | 
 |         subprocess.check_call( | 
 |             [ | 
 |                 self.args.dashboard_gh_cli_path, | 
 |                 "issue", | 
 |                 "comment", | 
 |                 "--repo=https://github.com/pytorch/pytorch.git", | 
 |                 issue_number, | 
 |                 "-F", | 
 |                 filename, | 
 |             ] | 
 |         ) | 
 |  | 
 |         os.remove(filename) | 
 |  | 
 |     def update(self): | 
 |         self.upload_graphs() | 
 |         if not self.args.no_detect_regressions: | 
 |             SummaryStatDiffer(self.args).generate_comment() | 
 |             RegressionDetector(self.args).generate_comment() | 
 |             try: | 
 |                 RegressionTracker(self.args).diff() | 
            except Exception:
                logging.exception("regression tracker diff failed")
 |                 with open(f"{self.args.output_dir}/gh_regression.txt", "w") as gh_fh: | 
 |                     gh_fh.write("") | 
 |  | 
 |         comment = self.gen_comment() | 
 |         print(comment) | 
 |  | 
 |         if not self.args.update_dashboard_test: | 
 |             if not self.args.no_gh_comment: | 
 |                 self.comment_on_gh(comment) | 
 |             if not self.args.no_update_archive: | 
 |                 self.archive() | 
 |  | 
 |  | 
 | if __name__ == "__main__": | 
 |     args = parse_args() | 
 |  | 
 |     def extract(key): | 
 |         return DEFAULTS[key] if getattr(args, key, None) is None else getattr(args, key) | 
 |  | 
 |     dtypes = extract("dtypes") | 
 |     suites = extract("suites") | 
 |     devices = extract("devices") | 
 |  | 
 |     if args.inference: | 
 |         compilers = DEFAULTS["inference"] if args.compilers is None else args.compilers | 
 |         flag_compilers = ( | 
 |             DEFAULTS["flag_compilers"]["inference"] | 
 |             if args.flag_compilers is None | 
 |             else args.flag_compilers | 
 |         ) | 
 |     else: | 
 |         assert args.training | 
 |         compilers = DEFAULTS["training"] if args.compilers is None else args.compilers | 
 |         flag_compilers = ( | 
 |             DEFAULTS["flag_compilers"]["training"] | 
 |             if args.flag_compilers is None | 
 |             else args.flag_compilers | 
 |         ) | 
 |  | 
 |     output_dir = args.output_dir | 
 |     args.compilers = compilers | 
 |     args.devices = devices | 
 |     args.dtypes = dtypes | 
 |     flag_compilers = list(set(flag_compilers) & set(compilers)) | 
 |     args.flag_compilers = flag_compilers | 
 |     args.suites = suites | 
 |  | 
 |     if args.print_run_commands: | 
 |         generated_file = generate_commands( | 
 |             args, dtypes, suites, devices, compilers, output_dir | 
 |         ) | 
 |         print( | 
 |             f"Running commands are generated in file {generated_file}. Please run (bash {generated_file})." | 
 |         ) | 
 |     elif args.visualize_logs: | 
 |         parse_logs(args, dtypes, suites, devices, compilers, flag_compilers, output_dir) | 
 |     elif args.run: | 
 |         generated_file = generate_commands( | 
 |             args, dtypes, suites, devices, compilers, output_dir | 
 |         ) | 
 |         # generate memoized archive name now so that the date is reflective | 
 |         # of when the run started | 
 |         get_archive_name(args, dtypes[0]) | 
 |         # TODO - Do we need to worry about segfaults | 
 |         try: | 
 |             os.system(f"bash {generated_file}") | 
        except Exception:
            print(
                f"Running commands failed. Please run manually (bash {generated_file}) and inspect the errors."
            )
            raise
 |         if not args.log_operator_inputs: | 
 |             if not args.no_update_archive: | 
 |                 archive( | 
 |                     output_dir, | 
 |                     args.dashboard_archive_path, | 
 |                     args.archive_name, | 
 |                     dtypes[0], | 
 |                 ) | 
 |             parse_logs( | 
 |                 args, dtypes, suites, devices, compilers, flag_compilers, output_dir | 
 |             ) | 
 |             if not args.no_update_archive: | 
 |                 archive( | 
 |                     output_dir, | 
 |                     args.dashboard_archive_path, | 
 |                     args.archive_name, | 
 |                     dtypes[0], | 
 |                 ) | 
 |  | 
 |     if args.update_dashboard: | 
 |         DashboardUpdater(args).update() |