import torch

import torch._inductor.triton_ops
from torch.profiler import profile, ProfilerActivity, record_function

# The flag below controls whether to allow TF32 on matmul. This flag defaults to False
# in PyTorch 1.12 and later.
torch.backends.cuda.matmul.allow_tf32 = True
# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
torch.backends.cudnn.allow_tf32 = True


# Benchmark shape: 32x64x56x56 input, 64 3x3 filters (a ResNet-50 style layer).
(
    BATCH,
    IN_C,
    IN_H,
    IN_W,
    KERNEL_N,
    KERNEL_H,
    KERNEL_W,
    stride,
    padding,
    dilation,
    groups,
    dtype,
) = (32, 64, 56, 56, 64, 3, 3, (1, 1), (0, 0), (1, 1), 1, torch.float32)


def profile_op(
    # provider
    provider,
    # Tensor dimensions
    BATCH,
    IN_C,
    IN_H,
    IN_W,
    KERNEL_N,
    KERNEL_H,
    KERNEL_W,
    # parameters of conv
    stride=(1, 1),
    padding=(0, 0),
    dilation=(1, 1),
    groups=1,
    dtype=torch.float16,
    layout="nhwc",
    warmup=25,
    rep=50,
):
    # allocate inputs, NCHW
    x = torch.randn((BATCH, IN_C, IN_H, IN_W), dtype=dtype, device="cuda")
    w = torch.randn(
        (KERNEL_N, IN_C // groups, KERNEL_H, KERNEL_W), dtype=dtype, device="cuda"
    )
    bias = torch.randn((KERNEL_N,), dtype=dtype, device="cuda")
    if layout == "nhwc":
        x = x.to(memory_format=torch.channels_last)
        w = w.to(memory_format=torch.channels_last)

    if provider == "cublas":
        # baseline: ATen conv2d, which dispatches to cuDNN on CUDA
        def fn():
            return torch.conv2d(x, w, bias, stride, padding, dilation, groups)

    elif provider == "triton":
        # inductor's Triton conv kernel
        def fn():
            return torch._inductor.triton_ops.conv(
                x, w, bias, stride, padding, dilation, False, (0, 0), groups
            )

    else:
        raise ValueError(f"{provider} not supported")
    # warm up
    for _ in range(warmup):
        fn()
    # make sure warm-up kernels have finished before profiling starts
    torch.cuda.synchronize()
    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
        with record_function("model_inference"):
            for _ in range(rep):
                fn()
            # wait for outstanding kernels so they are captured in the trace
            torch.cuda.synchronize()

    print("Profiling", provider)
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))


for provider in ["cublas", "triton"]:
    profile_op(
        # provider
        provider,
        # Tensor dimensions
        BATCH,
        IN_C,
        IN_H,
        IN_W,
        KERNEL_N,
        KERNEL_H,
        KERNEL_W,
        # parameters of conv
        stride,
        padding,
        dilation,
        groups,
        dtype=dtype,
        layout="nhwc",
        warmup=25,
        rep=50,
    )
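

# Optional extra (a minimal sketch, not part of the profiling run above): before
# trusting the timings, it can help to confirm that the Triton kernel and the ATen
# path agree numerically on the same inputs. This assumes the same
# torch._inductor.triton_ops.conv signature used in profile_op; the function name
# and tolerances are illustrative, since TF32 and accumulation order can change
# low-order bits.
def check_conv_correctness(atol=1e-2, rtol=1e-2):
    x = torch.randn((BATCH, IN_C, IN_H, IN_W), dtype=dtype, device="cuda").to(
        memory_format=torch.channels_last
    )
    w = torch.randn(
        (KERNEL_N, IN_C // groups, KERNEL_H, KERNEL_W), dtype=dtype, device="cuda"
    ).to(memory_format=torch.channels_last)
    bias = torch.randn((KERNEL_N,), dtype=dtype, device="cuda")
    ref = torch.conv2d(x, w, bias, stride, padding, dilation, groups)
    out = torch._inductor.triton_ops.conv(
        x, w, bias, stride, padding, dilation, False, (0, 0), groups
    )
    torch.testing.assert_close(out, ref, atol=atol, rtol=rtol)


# check_conv_correctness()  # uncomment to run the sanity check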