import argparse
import sys
import timeit

import torch

from torch.utils.benchmark import Timer

# Number of concurrent tasks forked by parallel_workload.
PARALLEL_TASKS_NUM = 4
# Set from --internal-iter at runtime: matmul iterations per workload call.
INTERNAL_ITER = None


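# Sequential workload: a chain of INTERNAL_ITER dependent matmuls.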
def loop_workload(x):
    for _ in range(INTERNAL_ITER):
        x = torch.mm(x, x)
    return x


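# Parallel workload: split the matmul iterations across PARALLEL_TASKS_NUM
# asynchronous tasks launched with torch.jit._fork.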
def parallel_workload(x):
    def parallel_task(x):
        for _ in range(INTERNAL_ITER // PARALLEL_TASKS_NUM):
            x = torch.mm(x, x)
        return x

    # Fork all tasks first, then block until every one of them has finished.
    futs = [torch.jit._fork(parallel_task, x) for _ in range(PARALLEL_TASKS_NUM)]
    for fut in futs:
        torch.jit._wait(fut)
    # Task results are intentionally discarded; only the timing matters here.
    return x


if __name__ == "__main__":
    # Disable graph executor optimizations so that JIT-ed workloads are not
    # rewritten between runs, which would skew the overhead comparison.
    torch._C._set_graph_executor_optimize(False)
    parser = argparse.ArgumentParser(description="Profiler benchmark")

    parser.add_argument("--with-cuda", "--with_cuda", action="store_true")
    parser.add_argument("--with-stack", "--with_stack", action="store_true")
    parser.add_argument("--use-script", "--use_script", action="store_true")
    parser.add_argument("--use-kineto", "--use_kineto", action="store_true")
    parser.add_argument(
        "--profiling-tensor-size", "--profiling_tensor_size", default=1, type=int
    )
    parser.add_argument("--workload", default="loop", type=str)
    parser.add_argument("--internal-iter", "--internal_iter", default=256, type=int)
    parser.add_argument(
        "--timer-min-run-time", "--timer_min_run_time", default=10, type=int
    )
    parser.add_argument("--cuda-only", "--cuda_only", action="store_true")

    args = parser.parse_args()

    if args.with_cuda and not torch.cuda.is_available():
        print("No CUDA available")
        sys.exit()

    print(
        f"Payload: {args.workload}, {args.internal_iter} iterations; "
        f"timer min. runtime = {args.timer_min_run_time}\n"
    )
    INTERNAL_ITER = args.internal_iter

    for profiling_enabled in [False, True]:
        print(
            f"Profiling {'enabled' if profiling_enabled else 'disabled'}, "
            f"tensor size {args.profiling_tensor_size}x{args.profiling_tensor_size}, "
            f"use cuda: {args.with_cuda}, use kineto: {args.use_kineto}, "
            f"with stacks: {args.with_stack}, use script: {args.use_script}"
        )

        input_x = torch.rand(args.profiling_tensor_size, args.profiling_tensor_size)

        if args.with_cuda:
            input_x = input_x.cuda()

        assert args.workload in ["loop", "parallel"]
        workload = loop_workload if args.workload == "loop" else parallel_workload

        if args.use_script:
            workload = torch.jit.trace(workload, (input_x,))

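        # Build the payload: when profiling is enabled, every call enters and
        # exits a fresh profiler context, so its setup/teardown cost is part
        # of the measurement.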
        if profiling_enabled:

            def payload():
                # Older torch.autograd.profiler interface; newer PyTorch
                # releases expose the equivalent via torch.profiler.profile.
                with torch.autograd.profiler.profile(
                    use_cuda=args.with_cuda,
                    with_stack=args.with_stack,
                    use_kineto=args.use_kineto,
                    use_cpu=not args.cuda_only,
                ):
                    x = workload(input_x)
                return x

        else:

            def payload():
                return workload(input_x)

        t = Timer(
            "payload()",
            globals={"payload": payload},
            timer=timeit.default_timer,
        ).blocked_autorange(min_run_time=args.timer_min_run_time)
        print(t)
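# Example invocations (the file name profiler_bench.py is assumed):
#   python profiler_bench.py --workload loop --internal-iter 256
#   python profiler_bench.py --workload parallel --use-kineto --with-stack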