import argparse
import sys
import timeit

import torch

from torch.utils.benchmark import Timer

# Number of concurrent tasks forked by parallel_workload.
PARALLEL_TASKS_NUM = 4
# Set from --internal-iter at runtime: matmul iterations per workload call.
INTERNAL_ITER = None


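# Sequential workload: a chain of INTERNAL_ITER dependent matmuls.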
def loop_workload(x):
    for _ in range(INTERNAL_ITER):
        x = torch.mm(x, x)
    return x


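# Parallel workload: split the matmul iterations across PARALLEL_TASKS_NUM
# asynchronous tasks launched with torch.jit._fork.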
def parallel_workload(x):
    def parallel_task(x):
        for _ in range(INTERNAL_ITER // PARALLEL_TASKS_NUM):
            x = torch.mm(x, x)
        return x

    # Fork all tasks first, then block until every one of them has finished.
    futs = [torch.jit._fork(parallel_task, x) for _ in range(PARALLEL_TASKS_NUM)]
    for fut in futs:
        torch.jit._wait(fut)
    # Task results are intentionally discarded; only the timing matters here.
    return x


if __name__ == "__main__":
    # Disable graph executor optimizations so that JIT-ed workloads are not
    # rewritten between runs, which would skew the overhead comparison.
    torch._C._set_graph_executor_optimize(False)
    parser = argparse.ArgumentParser(description="Profiler benchmark")

    parser.add_argument("--with-cuda", "--with_cuda", action="store_true")
    parser.add_argument("--with-stack", "--with_stack", action="store_true")
    parser.add_argument("--use-script", "--use_script", action="store_true")
    parser.add_argument("--use-kineto", "--use_kineto", action="store_true")
    parser.add_argument(
        "--profiling-tensor-size", "--profiling_tensor_size", default=1, type=int
    )
    parser.add_argument("--workload", default="loop", type=str)
    parser.add_argument("--internal-iter", "--internal_iter", default=256, type=int)
    parser.add_argument(
        "--timer-min-run-time", "--timer_min_run_time", default=10, type=int
    )
    parser.add_argument("--cuda-only", "--cuda_only", action="store_true")

    args = parser.parse_args()

    if args.with_cuda and not torch.cuda.is_available():
        print("No CUDA available")
        sys.exit()

    print(
        f"Payload: {args.workload}, {args.internal_iter} iterations; "
        f"timer min. runtime = {args.timer_min_run_time}\n"
    )
    INTERNAL_ITER = args.internal_iter

    for profiling_enabled in [False, True]:
        print(
            f"Profiling {'enabled' if profiling_enabled else 'disabled'}, "
            f"tensor size {args.profiling_tensor_size}x{args.profiling_tensor_size}, "
            f"use cuda: {args.with_cuda}, use kineto: {args.use_kineto}, "
            f"with stacks: {args.with_stack}, use script: {args.use_script}"
        )

        input_x = torch.rand(args.profiling_tensor_size, args.profiling_tensor_size)

        if args.with_cuda:
            input_x = input_x.cuda()

        assert args.workload in ["loop", "parallel"]
        workload = loop_workload if args.workload == "loop" else parallel_workload

        if args.use_script:
            workload = torch.jit.trace(workload, (input_x,))

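        # Build the payload: when profiling is enabled, every call enters and
        # exits a fresh profiler context, so its setup/teardown cost is part
        # of the measurement.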
        if profiling_enabled:

            def payload():
                # Older torch.autograd.profiler interface; newer PyTorch
                # releases expose the equivalent via torch.profiler.profile.
                with torch.autograd.profiler.profile(
                    use_cuda=args.with_cuda,
                    with_stack=args.with_stack,
                    use_kineto=args.use_kineto,
                    use_cpu=not args.cuda_only,
                ):
                    x = workload(input_x)
                return x

        else:

            def payload():
                return workload(input_x)

        t = Timer(
            "payload()",
            globals={"payload": payload},
            timer=timeit.default_timer,
        ).blocked_autorange(min_run_time=args.timer_min_run_time)
        print(t)
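# Example invocations (the file name profiler_bench.py is assumed):
#   python profiler_bench.py --workload loop --internal-iter 256
#   python profiler_bench.py --workload parallel --use-kineto --with-stack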