import time
import os
import json

import torch
from torch.profiler import profile, ProfilerActivity


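# `synchronize` is a no-op by default; `dump_chrome_trace` swaps it for
# `torch.cuda.synchronize` when timing on a CUDA device, so the same call
# sites work for both CPU-only and GPU runs.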
def synchronize():
    pass


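# Minimal no-op context manager, used as the default `optimize_ctx`.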
class NullContext:
    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass


def dump_chrome_trace(f, input, trace_filename, optimize_ctx, activities, num_runs=1,
                      devices=None, kwargs_for_f=None, kwargs_for_profiler=None):
    """
    Run f(input, **kwargs_for_f) under [optimize_ctx] [num_runs] times and write the
    resulting chrome trace to [trace_filename].

    [activities] are the activities the profiler records, e.g. ProfilerActivity.CUDA.

    Returns the total runtime of the [num_runs] runs measured without the profiler attached.
    """

    if devices is None:
        devices = ["cuda"]

    global synchronize
    if devices != ["cpu"] and torch.cuda.is_available():
        synchronize = torch.cuda.synchronize

    if kwargs_for_f is None:
        kwargs_for_f = {}
    if kwargs_for_profiler is None:
        kwargs_for_profiler = {}

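    # First, time [num_runs] runs without the profiler: 5 warm-up iterations,
    # a synchronize, then the timed loop bracketed by perf_counter calls.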
    with optimize_ctx:
        torch.manual_seed(1337)
        for _ in range(5):  # warmup runs
            f(input, **kwargs_for_f)
        synchronize()
        torch.manual_seed(1337)
        t0 = time.perf_counter()
        for _ in range(num_runs):
            f(input, **kwargs_for_f)
        synchronize()
        t1 = time.perf_counter()
        timing = t1 - t0

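    # Re-run the same workload under the profiler to capture the chrome trace.
    # The profiled runs are not included in the returned timing.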
    with profile(activities=activities, **kwargs_for_profiler) as prof:
        with optimize_ctx:
            synchronize()
            torch.manual_seed(1337)
            for _ in range(num_runs):
                f(input, **kwargs_for_f)
            synchronize()
    prof.export_chrome_trace(trace_filename)

    return timing


def get_chrome_trace_events(filename):
    with open(filename) as f:
        data = json.load(f)
    return data["traceEvents"]


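# Chrome trace "complete" events have phase "X" and carry "ts"/"dur" timestamps
# in microseconds; an event counts as GPU compute if its pid belongs to one of
# the GPU processes collected in `gpu_pids` by `compute_utilization`.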
def is_gpu_compute_event(event):
    global gpu_pids
    return "pid" in event and event["pid"] in gpu_pids and "ph" in event and event["ph"] == "X"


def get_sorted_gpu_events(events):
    sorted_gpu_events = []
    for event in events:
        if not is_gpu_compute_event(event):
            continue
        sorted_gpu_events.append(event)
    return sorted(sorted_gpu_events, key=lambda x: x["ts"])


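# Sum the "busy" GPU time from a list of events sorted by start timestamp,
# merging overlapping intervals so concurrent kernels are not double-counted.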
def get_duration(sorted_gpu_events):
    if len(sorted_gpu_events) == 0:
        return 0
    event = sorted_gpu_events[0]
    current_end_time = event["ts"] + event["dur"]
    total_duration = event["dur"]
    for event in sorted_gpu_events[1:]:
        start_time = max(event["ts"], current_end_time)
        end_time = event["ts"] + event["dur"]
        total_duration = total_duration + max(end_time - start_time, 0)
        current_end_time = max(current_end_time, end_time)
    return total_duration


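# Heuristic: treat a GPU event as a matmul/convolution kernel if its name
# mentions "gemm", "conv", "cutlass", or "wgrad".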
def get_sorted_gpu_mm_conv_events(events):
    def is_mm_conv_event(event):
        return "name" in event and ("gemm" in event["name"] or "conv" in event["name"]
                                    or "cutlass" in event["name"] or "wgrad" in event["name"])
    gpu_events = get_sorted_gpu_events(events)
    sorted_events = []
    for event in gpu_events:
        if not is_mm_conv_event(event):
            continue
        sorted_events.append(event)
    return sorted_events


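# pids of the GPU processes in the current trace; populated by compute_utilization().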
gpu_pids = []


def compute_utilization(filename: str, total_length: float):
    """
    Process the chrome trace output by the PyTorch profiler to compute GPU utilization
    and the percentage of time spent on matmul and convolution.

    Args:
        filename(str): name of the chrome trace file produced by the PyTorch profiler

        total_length(float): total length of the process without the profiler, in seconds

    Returns:
        tuple: (GPU utilization, percent of time spent on matmul and convolution)
    """
    events = get_chrome_trace_events(filename)

    # get pids of GPU events
    global gpu_pids
    gpu_pids = []
    for event in events:
        if "name" not in event:
            continue
        if event["name"] == 'process_labels' and "GPU" in event["args"]["labels"]:
            gpu_pids.append(event["pid"])

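    # Trace timestamps and durations are in microseconds, so convert the
    # wall-clock length from seconds before dividing.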
    total_length = total_length * 1e6
    sorted_gpu_events = get_sorted_gpu_events(events)
    utilization = get_duration(sorted_gpu_events) / total_length

    sorted_gpu_mm_conv_events = get_sorted_gpu_mm_conv_events(events)
    mm_conv_utilization = get_duration(sorted_gpu_mm_conv_events) / total_length

    return utilization, mm_conv_utilization


def benchmark_utilization(f, input, trace_folder, optimize_ctx=None, trace_file_name="tmp_chrome_trace", num_runs=1):
    """
    Benchmark the GPU utilization and the percentage of time spent on matmul and convolution
    operations when running f(input) under [optimize_ctx] [num_runs] times.
    It produces a chrome trace file at trace_folder/trace_file_name.json.

    Example:

    ```
    def f(a):
        return a.sum()
    a = torch.rand(2**20, device="cuda")
    utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name="tmp_chrome_trace")
    ```

    Args:
        f: function to benchmark

        input: input to :attr:`f`

        trace_folder: name of the folder to store the chrome trace

        optimize_ctx: the context in which f will run

        trace_file_name: name of the dumped chrome trace file, defaults to "tmp_chrome_trace"

        num_runs: number of times to run f, excluding the warm-up runs, defaults to 1

    Returns:
        tuple: (GPU utilization, percent of time spent on matmul and convolution)

    """
    if not os.path.exists(trace_folder):
        os.makedirs(trace_folder)
        print("create folder " + trace_folder)

    if optimize_ctx is None:
        optimize_ctx = NullContext()

    chrome_trace_file_name = os.path.join(trace_folder, trace_file_name + ".json")
    total_length = dump_chrome_trace(f, input, chrome_trace_file_name, optimize_ctx,
                                     [ProfilerActivity.CUDA], num_runs=num_runs, devices=["cuda"])
    utilization, mm_conv_utilization = compute_utilization(chrome_trace_file_name, total_length)

    return utilization, mm_conv_utilization