import time
import os
import json

import torch
from torch.profiler import profile, ProfilerActivity


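# `synchronize` is a no-op by default; `dump_chrome_trace` swaps it for
# `torch.cuda.synchronize` when timing on a CUDA device, so the same call
# sites work for both CPU-only and GPU runs.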
def synchronize():
    pass


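# Minimal no-op context manager, used as the default `optimize_ctx`.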
class NullContext:
    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass


def dump_chrome_trace(f, input, trace_filename, optimize_ctx, activities, num_runs=1,
                      devices=None, kwargs_for_f=None, kwargs_for_profiler=None):
    """
    Run f(input, **kwargs_for_f) under [optimize_ctx] [num_runs] times and write the
    resulting chrome trace to [trace_filename].

    [activities] are the activities the profiler records, e.g. ProfilerActivity.CUDA.

    Returns the total runtime of the [num_runs] runs measured without the profiler attached.
    """

    if devices is None:
        devices = ["cuda"]

    global synchronize
    if devices != ["cpu"] and torch.cuda.is_available():
        synchronize = torch.cuda.synchronize

    if kwargs_for_f is None:
        kwargs_for_f = {}
    if kwargs_for_profiler is None:
        kwargs_for_profiler = {}

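    # First, time [num_runs] runs without the profiler: 5 warm-up iterations,
    # a synchronize, then the timed loop bracketed by perf_counter calls.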
    with optimize_ctx:
        torch.manual_seed(1337)
        for _ in range(5):  # warmup runs
            f(input, **kwargs_for_f)
        synchronize()
        torch.manual_seed(1337)
        t0 = time.perf_counter()
        for _ in range(num_runs):
            f(input, **kwargs_for_f)
        synchronize()
        t1 = time.perf_counter()
        timing = t1 - t0

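    # Re-run the same workload under the profiler to capture the chrome trace.
    # The profiled runs are not included in the returned timing.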
    with profile(activities=activities, **kwargs_for_profiler) as prof:
        with optimize_ctx:
            synchronize()
            torch.manual_seed(1337)
            for _ in range(num_runs):
                f(input, **kwargs_for_f)
            synchronize()
    prof.export_chrome_trace(trace_filename)

    return timing


def get_chrome_trace_events(filename):
    with open(filename) as f:
        data = json.load(f)
    return data["traceEvents"]


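# Chrome trace "complete" events have phase "X" and carry "ts"/"dur" timestamps
# in microseconds; an event counts as GPU compute if its pid belongs to one of
# the GPU processes collected in `gpu_pids` by `compute_utilization`.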
def is_gpu_compute_event(event):
    global gpu_pids
    return "pid" in event and event["pid"] in gpu_pids and "ph" in event and event["ph"] == "X"


def get_sorted_gpu_events(events):
    sorted_gpu_events = []
    for event in events:
        if not is_gpu_compute_event(event):
            continue
        sorted_gpu_events.append(event)
    return sorted(sorted_gpu_events, key=lambda x: x["ts"])


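# Sum the "busy" GPU time from a list of events sorted by start timestamp,
# merging overlapping intervals so concurrent kernels are not double-counted.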
def get_duration(sorted_gpu_events):
    if len(sorted_gpu_events) == 0:
        return 0
    event = sorted_gpu_events[0]
    current_end_time = event["ts"] + event["dur"]
    total_duration = event["dur"]
    for event in sorted_gpu_events[1:]:
        start_time = max(event["ts"], current_end_time)
        end_time = event["ts"] + event["dur"]
        total_duration = total_duration + max(end_time - start_time, 0)
        current_end_time = max(current_end_time, end_time)
    return total_duration


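# Heuristic: treat a GPU event as a matmul/convolution kernel if its name
# mentions "gemm", "conv", "cutlass", or "wgrad".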
def get_sorted_gpu_mm_conv_events(events):
    def is_mm_conv_event(event):
        return "name" in event and ("gemm" in event["name"] or "conv" in event["name"]
                                    or "cutlass" in event["name"] or "wgrad" in event["name"])
    gpu_events = get_sorted_gpu_events(events)
    sorted_events = []
    for event in gpu_events:
        if not is_mm_conv_event(event):
            continue
        sorted_events.append(event)
    return sorted_events


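# pids of the GPU processes in the current trace; populated by compute_utilization().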
gpu_pids = []


def compute_utilization(filename: str, total_length: float):
    """
    Process the chrome trace output by the PyTorch profiler to compute GPU utilization
    and the percentage of time spent on matmul and convolution.

    Args:
        filename(str): name of the chrome trace file produced by the PyTorch profiler

        total_length(float): total length of the process without the profiler, in seconds

    Returns:
        tuple: (GPU utilization, percent of time spent on matmul and convolution)
    """
    events = get_chrome_trace_events(filename)

    # get pids of GPU events
    global gpu_pids
    gpu_pids = []
    for event in events:
        if "name" not in event:
            continue
        if event["name"] == 'process_labels' and "GPU" in event["args"]["labels"]:
            gpu_pids.append(event["pid"])

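    # Trace timestamps and durations are in microseconds, so convert the
    # wall-clock length from seconds before dividing.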
    total_length = total_length * 1e6
    sorted_gpu_events = get_sorted_gpu_events(events)
    utilization = get_duration(sorted_gpu_events) / total_length

    sorted_gpu_mm_conv_events = get_sorted_gpu_mm_conv_events(events)
    mm_conv_utilization = get_duration(sorted_gpu_mm_conv_events) / total_length

    return utilization, mm_conv_utilization


def benchmark_utilization(f, input, trace_folder, optimize_ctx=None, trace_file_name="tmp_chrome_trace", num_runs=1):
    """
    Benchmark the GPU utilization and the percentage of time spent on matmul and convolution
    operations when running f(input) under [optimize_ctx] [num_runs] times.
    It produces a chrome trace file at trace_folder/trace_file_name.json.

    Example:

    ```
    def f(a):
        return a.sum()
    a = torch.rand(2**20, device="cuda")
    utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name="tmp_chrome_trace")
    ```

    Args:
        f: function to benchmark

        input: input to :attr:`f`

        trace_folder: name of the folder to store the chrome trace

        optimize_ctx: the context in which f will run

        trace_file_name: name of the dumped chrome trace file, defaults to "tmp_chrome_trace"

        num_runs: number of times to run f, excluding the warm-up runs, defaults to 1

    Returns:
        tuple: (GPU utilization, percent of time spent on matmul and convolution)

    """
    if not os.path.exists(trace_folder):
        os.makedirs(trace_folder)
        print("create folder " + trace_folder)

    if optimize_ctx is None:
        optimize_ctx = NullContext()

    chrome_trace_file_name = os.path.join(trace_folder, trace_file_name + ".json")
    total_length = dump_chrome_trace(f, input, chrome_trace_file_name, optimize_ctx,
                                     [ProfilerActivity.CUDA], num_runs=num_runs, devices=["cuda"])
    utilization, mm_conv_utilization = compute_utilization(chrome_trace_file_name, total_length)

    return utilization, mm_conv_utilization