# Owner(s): ["module: nn"]

import math
import unittest
from typing import Tuple

import torch
from torch._inductor import config
from torch.testing._internal.common_cuda import SM80OrLater
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_nn import NNTestCase
from torch.testing._internal.common_utils import IS_WINDOWS, parametrize, run_tests
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU

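# Per-dtype tolerances for comparing eager and compiled results. The bfloat16
# entries are infinite on purpose: run_comp_nocomp treats an infinite
# tolerance as "run both paths for coverage, but skip the numeric comparison".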
default_atol = {
    torch.float16: 1e-3,
    torch.bfloat16: float("inf"),
    torch.float32: 1e-5,
}
default_rtol = {
    torch.float16: 1e-3,
    torch.bfloat16: float("inf"),
    torch.float32: 1.3e-6,
}


def rand_math_tensor(
    shape: Tuple[int, ...],
    device: str,
    dtype: torch.dtype,
    requires_grad: bool = False,
) -> torch.Tensor:
    """Creates a random dense tensor with the given shape and dtype.

    Args:
        shape (Tuple[int, ...]): shape of the tensor to construct
        device (str): which device to create the tensor on
        dtype (torch.dtype): the tensor's dtype
        requires_grad (bool, optional): the tensor's grad status. Defaults to False.

    Returns:
        torch.Tensor: a new tensor
    """
    return torch.randn(shape, device=device, dtype=dtype, requires_grad=requires_grad)


def init_tensor(tensor_list, **kwargs) -> torch.Tensor:
    # Build a tensor with exact (non-random) values so that integer dtypes
    # compare exactly between the eager and compiled paths.
    return torch.tensor(tensor_list, **kwargs)


def run_comp_nocomp(function, *inputs, **kwargs):
    """Runs `function` both eagerly and compiled, then compares the results."""
    c_function = torch.compile(function)

    f_res = function(*inputs)
    cf_res = c_function(*inputs)

    # An infinite tolerance (see the bfloat16 defaults above) disables the
    # numeric comparison; both paths still run for coverage.
    if not (math.isinf(kwargs.get("atol", 0.0)) or math.isinf(kwargs.get("rtol", 0.0))):
        torch.testing.assert_close(f_res, cf_res, **kwargs)


# Thin wrappers around the ops under test, shared by several tests below.
def torch_mm(a, b):
    return torch.mm(a, b)


def torch_addmm(add, b, c):
    return torch.addmm(add, b, c)


def torch_bmm(a, b):
    return torch.bmm(a, b)


def torch_baddbmm(add, b, c, alpha, beta):
    return torch.baddbmm(add, b, c, alpha=alpha, beta=beta)


# The shapes we test on: each entry is (m, k, k, n) for an (m x k) @ (k x n)
# matmul.
ts_list = [
    (1, 32, 32, 1),
    (1, 10, 10, 1),
    (1, 3, 3, 1),
    (32, 1, 1, 32),
    (3, 1, 1, 3),
    (4, 1, 1, 9),
    (9, 1, 1, 4),
]


class TestDecomp(NNTestCase):
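    # CUDA-specific leak and stream checks only apply when the GPU backend is
    # actually CUDA.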
    _do_cuda_memory_leak_check = GPU_TYPE == "cuda"
    _do_cuda_non_default_stream = GPU_TYPE == "cuda"

    @unittest.skipIf(not HAS_GPU, "GPU tests require triton")
    @parametrize("dtype", [torch.float, torch.bfloat16])
    def test_simple_mm(self, device, dtype):
        fudge = 10
        rtol = default_rtol[dtype] * fudge
        atol = default_atol[dtype] * fudge

        for t_size in ts_list:
            a1_0, a1_1, a2_0, a2_1 = t_size

            t1 = rand_math_tensor((a1_0, a1_1), dtype=dtype, device=device)
            t2 = rand_math_tensor((a2_0, a2_1), dtype=dtype, device=device)
            tadd = rand_math_tensor((a1_0, a2_1), dtype=dtype, device=device)

            run_comp_nocomp(torch_mm, t1, t2, rtol=rtol, atol=atol)
            run_comp_nocomp(torch_addmm, tadd, t1, t2, rtol=rtol, atol=atol)

    @unittest.skipIf(not HAS_GPU, "GPU tests require triton")
    @parametrize(
        "dtype", [torch.float, torch.bfloat16] if SM80OrLater else [torch.float]
    )
    @parametrize("bs", [1, 2, 4, 10])
    def test_batched_mm(self, device, dtype, bs):
        fudge = 3
        rtol = default_rtol[dtype] * fudge
        atol = default_atol[dtype] * fudge

        for t_size in ts_list:
            a1_0, a1_1, a2_0, a2_1 = t_size

            t1 = rand_math_tensor((bs, a1_0, a1_1), dtype=dtype, device=device)
            t2 = rand_math_tensor((bs, a2_0, a2_1), dtype=dtype, device=device)
            tadd = rand_math_tensor((bs, a1_0, a2_1), dtype=dtype, device=device)

            run_comp_nocomp(torch_bmm, t1, t2, rtol=rtol, atol=atol)

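            # Sweep alpha/beta through zero and negative values, covering
            # baddbmm's scaling special cases (beta == 0, for instance, makes
            # the input tensor irrelevant to the result).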
            for alpha in (0, 1, -1, 0.5, -0.5):
                for beta in (0, 1, -1, 0.5, -0.5):
                    run_comp_nocomp(
                        torch_baddbmm, tadd, t1, t2, alpha, beta, rtol=rtol, atol=atol
                    )

    @unittest.skipIf(not HAS_GPU, "GPU tests require triton")
    @config.patch(coordinate_descent_tuning=True)
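    # Regression test: bmm where the last dimension of the second batch
    # matrix is 1, run with coordinate descent tuning enabled.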
    def test_bmm_batch2_last_dim_size_is_one(self, device):
        fudge = 3
        rtol = default_rtol[torch.float32] * fudge
        atol = default_atol[torch.float32] * fudge

        t1 = torch.randn(1, 32, 2, device=device)
        t2 = torch.randn(1, 2, 1, device=device)

        run_comp_nocomp(torch_bmm, t1, t2, rtol=rtol, atol=atol)

    @unittest.skipIf(not HAS_GPU, "GPU tests require triton")
    @parametrize("dtype", [torch.float, torch.bfloat16, torch.int])
    def test_some(self, device, dtype):
        # torch.int is not fully supported on the GPU today, and we can't use
        # skipIf here because the parametrized values aren't visible to skipIf.
        if device.startswith(GPU_TYPE) and dtype == torch.int:
            return

        run_comp_nocomp(
            torch_mm,
            init_tensor([[1], [2], [3], [4]], dtype=dtype, device=device),
            init_tensor([[1, 2, 3, 4]], dtype=dtype, device=device),
        )
        run_comp_nocomp(
            torch_mm,
            init_tensor([[1, 2, 3, 4]], dtype=dtype, device=device),
            init_tensor([[1], [2], [3], [4]], dtype=dtype, device=device),
        )

    @unittest.skipIf(not HAS_GPU, "GPU tests require triton")
    @parametrize("dtype", [torch.float, torch.bfloat16, torch.int])
    @parametrize("bs", [1, 2, 4, 10])
    def test_some_batched(self, device, dtype, bs):
        # torch.int is not fully supported on the GPU today, and we can't use
        # skipIf here because the parametrized values aren't visible to skipIf.
        if device.startswith(GPU_TYPE) and dtype == torch.int:
            return

        run_comp_nocomp(
            torch_bmm,
            init_tensor([[[1], [2], [3], [4]]] * bs, dtype=dtype, device=device),
            init_tensor([[[1, 2, 3, 4]]] * bs, dtype=dtype, device=device),
        )
        run_comp_nocomp(
            torch_bmm,
            init_tensor([[[1, 2, 3, 4]]] * bs, dtype=dtype, device=device),
            init_tensor([[[1], [2], [3], [4]]] * bs, dtype=dtype, device=device),
        )


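# instantiate_device_type_tests generates a device-specific copy of TestDecomp
# (e.g. TestDecompCPU) for each device type listed below.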
device_types = ("cpu", GPU_TYPE)
instantiate_device_type_tests(TestDecomp, globals(), only_for=device_types)

if __name__ == "__main__":
    # We don't support torch.compile() on Windows
    if not IS_WINDOWS:
        run_tests()