test/inductor/test_ck_backend.py - platform/external/pytorch - Git at Google

 # Owner(s): ["module: inductor"]
 import logging
 import os
 import unittest

 import torch
 from torch._inductor import config
 from torch._inductor.test_case import run_tests, TestCase

 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     parametrize,
 )
 from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA

 torch.set_float32_matmul_precision("high")
 if HAS_CUDA:
     torch.cuda.memory._set_allocator_settings("expandable_segments:False")

 log = logging.getLogger(__name__)


 def _get_path_without_sccache() -> str:
     """
     Get the PATH environment variable without sccache.
     """
     path_envs = os.environ.get("PATH", "").split(":")
     path_envs = [env for env in path_envs if "/opt/cache/bin" not in env]
     return ":".join(path_envs)


 @instantiate_parametrized_tests
 class TestCKBackend(TestCase):
     def setUp(self):
         # The new inductor cache refresh mechanism
         # introduced with https://github.com/pytorch/pytorch/pull/122661
         # interacts badly with persistent subprocesses during
         # autotuning. So we need to disable automatic cache refresh
         # before calling setUp() on the parent class.
         old_disable_fresh_cache_envvar = os.environ.get(
             "INDUCTOR_TEST_DISABLE_FRESH_CACHE", ""
         )

         torch.random.manual_seed(1234)
         try:
             import ck4inductor

             self.ck_dir = os.path.dirname(ck4inductor.__file__)
             os.environ["TORCHINDUCTOR_CK_DIR"] = self.ck_dir
         except ImportError as e:
             raise unittest.SkipTest("Composable Kernel library not installed") from e

         try:
             os.environ["INDUCTOR_TEST_DISABLE_FRESH_CACHE"] = "1"
             super().setUp()
         finally:
             os.environ[
                 "INDUCTOR_TEST_DISABLE_FRESH_CACHE"
             ] = old_disable_fresh_cache_envvar

     @unittest.skipIf(not torch.version.hip, "ROCM only")
     @unittest.skipIf(config.is_fbcode(), "fbcode requires different CK path setup")
     @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
     @parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK"))
     def test_max_autotune_precompile(self, max_autotune_gemm_backends):
         """
         Make sure autotuning mm in subprocesses doesn't crash.
         """

         torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False

         def mm(a, b):
             return a @ b

         tensor_options = {"device": "cuda", "dtype": torch.bfloat16}

         a = torch.randn(2240, 256, **tensor_options)
         b = torch.randn(256, 2048, **tensor_options)

         assert "rocm" in dir(config)

         with config.patch(
             {
                 "max_autotune": True,
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "compile_threads": 2,
                 "rocm.n_max_profiling_configs": 2,
                 "rocm.ck_dir": self.ck_dir,
             }
         ):
             Y_compiled = torch.compile(mm, dynamic=False)(a, b)
             Y = mm(a, b)
             torch.testing.assert_close(Y_compiled, Y)

     @unittest.skipIf(not torch.version.hip, "ROCM only")
     @unittest.skipIf(config.is_fbcode(), "fbcode requires different CK path setup")
     @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
     @parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK"))
     def test_max_autotune_precompile_preselected(self, max_autotune_gemm_backends):
         """
         End to end test for picking preselected ck instances
         """

         torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False

         def mm(a, b):
             return a @ b

         tensor_options = {"device": "cuda", "dtype": torch.float16}

         a = torch.randn(2240, 256, **tensor_options)
         b = torch.randn(2048, 256, **tensor_options).transpose(0, 1)

         assert "rocm" in dir(config)

         with config.patch(
             {
                 "max_autotune": True,
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "compile_threads": 12,
                 "rocm.ck_dir": self.ck_dir,
                 "rocm.use_preselected_instances": True,
             }
         ):
             Y_compiled = torch.compile(mm, dynamic=False)(a, b)
             Y = mm(a, b)
             torch.testing.assert_close(Y_compiled, Y)

     @unittest.skipIf(not torch.version.hip, "ROCM only")
     @unittest.skipIf(config.is_fbcode(), "fbcode requires different CK path setup")
     @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
     @parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK"))
     def test_max_autotune_precompile_non_contiguous(self, max_autotune_gemm_backends):
         """
         Make sure the ck template can work with non-contiguous inputs
         """

         torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False

         tensor_options = {"device": "cuda", "dtype": torch.float16}

         a = torch.empty_strided((50257, 32768), (1, 50304), **tensor_options)
         b = torch.empty_strided((32768, 768), (768, 1), **tensor_options)

         assert "rocm" in dir(config)

         with config.patch(
             {
                 "max_autotune": True,
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "compile_threads": 2,
                 "rocm.ck_dir": self.ck_dir,
                 "rocm.n_max_profiling_configs": 2,
             }
         ):

             @torch.compile(dynamic=False)
             def mm(a, b):
                 return a @ b

             Y_compiled = mm(a, b)
             Y_eager = a @ b
             torch.testing.assert_close(Y_compiled, Y_eager)


 if __name__ == "__main__":
     from torch._inductor.utils import is_big_gpu

     # Set env to make it work in CI.
     if HAS_CUDA and HAS_CPU and is_big_gpu(0):
         run_tests()
	# Owner(s): ["module: inductor"]
	import logging
	import os
	import unittest

	import torch
	from torch._inductor import config
	from torch._inductor.test_case import run_tests, TestCase

	from torch.testing._internal.common_utils import (
	instantiate_parametrized_tests,
	parametrize,
	)
	from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA

	torch.set_float32_matmul_precision("high")
	if HAS_CUDA:
	torch.cuda.memory._set_allocator_settings("expandable_segments:False")

	log = logging.getLogger(__name__)


	def _get_path_without_sccache() -> str:
	"""
	Get the PATH environment variable without sccache.
	"""
	path_envs = os.environ.get("PATH", "").split(":")
	path_envs = [env for env in path_envs if "/opt/cache/bin" not in env]
	return ":".join(path_envs)


	@instantiate_parametrized_tests
	class TestCKBackend(TestCase):
	def setUp(self):
	# The new inductor cache refresh mechanism
	# introduced with https://github.com/pytorch/pytorch/pull/122661
	# interacts badly with persistent subprocesses during
	# autotuning. So we need to disable automatic cache refresh
	# before calling setUp() on the parent class.
	old_disable_fresh_cache_envvar = os.environ.get(
	"INDUCTOR_TEST_DISABLE_FRESH_CACHE", ""
	)

	torch.random.manual_seed(1234)
	try:
	import ck4inductor

	self.ck_dir = os.path.dirname(ck4inductor.__file__)
	os.environ["TORCHINDUCTOR_CK_DIR"] = self.ck_dir
	except ImportError as e:
	raise unittest.SkipTest("Composable Kernel library not installed") from e

	try:
	os.environ["INDUCTOR_TEST_DISABLE_FRESH_CACHE"] = "1"
	super().setUp()
	finally:
	os.environ[
	"INDUCTOR_TEST_DISABLE_FRESH_CACHE"
	] = old_disable_fresh_cache_envvar

	@unittest.skipIf(not torch.version.hip, "ROCM only")
	@unittest.skipIf(config.is_fbcode(), "fbcode requires different CK path setup")
	@unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
	@parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK"))
	def test_max_autotune_precompile(self, max_autotune_gemm_backends):
	"""
	Make sure autotuning mm in subprocesses doesn't crash.
	"""

	torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False

	def mm(a, b):
	return a @ b

	tensor_options = {"device": "cuda", "dtype": torch.bfloat16}

	a = torch.randn(2240, 256, **tensor_options)
	b = torch.randn(256, 2048, **tensor_options)

	assert "rocm" in dir(config)

	with config.patch(
	{
	"max_autotune": True,
	"autotune_in_subproc": True,
	"max_autotune_gemm_backends": max_autotune_gemm_backends,
	"compile_threads": 2,
	"rocm.n_max_profiling_configs": 2,
	"rocm.ck_dir": self.ck_dir,
	}
	):
	Y_compiled = torch.compile(mm, dynamic=False)(a, b)
	Y = mm(a, b)
	torch.testing.assert_close(Y_compiled, Y)

	@unittest.skipIf(not torch.version.hip, "ROCM only")
	@unittest.skipIf(config.is_fbcode(), "fbcode requires different CK path setup")
	@unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
	@parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK"))
	def test_max_autotune_precompile_preselected(self, max_autotune_gemm_backends):
	"""
	End to end test for picking preselected ck instances
	"""

	torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False

	def mm(a, b):
	return a @ b

	tensor_options = {"device": "cuda", "dtype": torch.float16}

	a = torch.randn(2240, 256, **tensor_options)
	b = torch.randn(2048, 256, **tensor_options).transpose(0, 1)

	assert "rocm" in dir(config)

	with config.patch(
	{
	"max_autotune": True,
	"autotune_in_subproc": True,
	"max_autotune_gemm_backends": max_autotune_gemm_backends,
	"compile_threads": 12,
	"rocm.ck_dir": self.ck_dir,
	"rocm.use_preselected_instances": True,
	}
	):
	Y_compiled = torch.compile(mm, dynamic=False)(a, b)
	Y = mm(a, b)
	torch.testing.assert_close(Y_compiled, Y)

	@unittest.skipIf(not torch.version.hip, "ROCM only")
	@unittest.skipIf(config.is_fbcode(), "fbcode requires different CK path setup")
	@unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
	@parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK"))
	def test_max_autotune_precompile_non_contiguous(self, max_autotune_gemm_backends):
	"""
	Make sure the ck template can work with non-contiguous inputs
	"""

	torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False

	tensor_options = {"device": "cuda", "dtype": torch.float16}

	a = torch.empty_strided((50257, 32768), (1, 50304), **tensor_options)
	b = torch.empty_strided((32768, 768), (768, 1), **tensor_options)

	assert "rocm" in dir(config)

	with config.patch(
	{
	"max_autotune": True,
	"autotune_in_subproc": True,
	"max_autotune_gemm_backends": max_autotune_gemm_backends,
	"compile_threads": 2,
	"rocm.ck_dir": self.ck_dir,
	"rocm.n_max_profiling_configs": 2,
	}
	):

	@torch.compile(dynamic=False)
	def mm(a, b):
	return a @ b

	Y_compiled = mm(a, b)
	Y_eager = a @ b
	torch.testing.assert_close(Y_compiled, Y_eager)


	if __name__ == "__main__":
	from torch._inductor.utils import is_big_gpu

	# Set env to make it work in CI.
	if HAS_CUDA and HAS_CPU and is_big_gpu(0):
	run_tests()