| # Owner(s): ["module: inductor"] |
| import logging |
| import os |
| import unittest |
| from typing import Callable, List, Optional |
| from unittest import mock |
| |
| import torch |
| from torch._dynamo.utils import counters |
| from torch._inductor import config |
| from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller |
| from torch._inductor.codegen.cuda.cutlass_utils import get_max_alignment |
| from torch._inductor.ir import ChoiceCaller, FixedLayout |
| from torch._inductor.select_algorithm import NoValidChoicesError |
| from torch._inductor.test_case import run_tests, TestCase |
| from torch._inductor.utils import fresh_inductor_cache |
| from torch.testing._internal.common_cuda import SM75OrLater, SM80OrLater, SM90OrLater |
| from torch.testing._internal.common_utils import ( |
| instantiate_parametrized_tests, |
| parametrize, |
| ) |
| |
| from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA |
| |
| torch.set_float32_matmul_precision("high") |
| if HAS_CUDA: |
| torch.cuda.memory._set_allocator_settings("expandable_segments:False") |
| |
| _CUTLASS_DIR = os.path.join(os.path.dirname(__file__), "../../third_party/cutlass/") |
| |
| log = logging.getLogger(__name__) |
| |
| HAS_CUDA = HAS_CUDA and not torch.version.hip |
| SM75OrLater = SM75OrLater and not torch.version.hip |
| SM80OrLater = SM80OrLater and not torch.version.hip |
| SM90OrLater = SM90OrLater and not torch.version.hip |
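# (The CUTLASS backend is CUDA-only, so the overrides above make every CUTLASS
# test in this file skip cleanly on ROCm builds.)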
| |
| |
| def _get_path_without_sccache() -> str: |
| """ |
| Get the PATH environment variable without sccache. |
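
    Example (hypothetical value): a PATH of
    "/opt/cache/bin:/usr/local/cuda/bin:/usr/bin" becomes
    "/usr/local/cuda/bin:/usr/bin"; /opt/cache/bin is where the sccache
    compiler wrapper lives in CI.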
| """ |
| path_envs = os.environ.get("PATH", "").split(":") |
| path_envs = [env for env in path_envs if "/opt/cache/bin" not in env] |
| return ":".join(path_envs) |
| |
| |
| @instantiate_parametrized_tests |
| class TestCutlassBackend(TestCase): |
| def setUp(self): |
| # The new inductor cache refresh mechanism |
| # introduced with https://github.com/pytorch/pytorch/pull/122661 |
| # interacts badly with persistent subprocesses during |
| # autotuning. So we need to disable automatic cache refresh |
| # before calling setUp() on the parent class. |
| old_disable_fresh_cache_envvar = os.environ.get( |
| "INDUCTOR_TEST_DISABLE_FRESH_CACHE", "" |
| ) |
| try: |
| os.environ["INDUCTOR_TEST_DISABLE_FRESH_CACHE"] = "1" |
| super().setUp() |
| finally: |
| os.environ[ |
| "INDUCTOR_TEST_DISABLE_FRESH_CACHE" |
| ] = old_disable_fresh_cache_envvar |
| torch.random.manual_seed(1234) |
| |
| @unittest.skipIf(not SM75OrLater, "need sm_75") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_max_autotune_cutlass_threshold(self): |
| """ |
| Make sure Cutlass GEMM threshold works as intended. |
| """ |
| |
| if torch.version.hip: |
| return |
| |
| torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False |
| |
| def mm(a, b): |
| return a @ b |
| |
| a = torch.randn(100, 10).cuda().half() |
| b = torch.randn(10, 100).cuda().half() |
| |
| with config.patch( |
| { |
| "max_autotune": True, |
| "autotune_in_subproc": True, |
| "max_autotune_gemm_backends": "CUTLASS,ATen", |
| "compile_threads": 4, |
| "cuda.cutlass_backend_min_gemm_size": 100000, |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 2, |
| } |
| ): |
            from torch._inductor import select_algorithm

            # Wrap the autotuning entry point used by the mm lowering so that the
            # candidate choices can be inspected while real autotuning still runs.
            with mock.patch(
                "torch._inductor.kernel.mm.autotune_select_algorithm",
                wraps=select_algorithm.autotune_select_algorithm,
            ) as mocked_select_algorithm:
                Y_compiled = torch.compile(mm, dynamic=False)(a, b)
                Y = mm(a, b)
                # autotune_select_algorithm(name, choices, ...): the second
                # positional argument is the list of candidate ChoiceCallers.
                passed_choice_callers: List[ChoiceCaller] = (
                    mocked_select_algorithm.call_args[0][1]
                )
                assert all(
                    isinstance(cc, ChoiceCaller) for cc in passed_choice_callers
                ), "The choices passed to autotune_select_algorithm should be ChoiceCaller instances"
                # We expect that no Cutlass kernels are considered, because the
                # GEMM is smaller than cuda.cutlass_backend_min_gemm_size.
                assert all(
                    not isinstance(cc, CUDATemplateCaller)
                    for cc in passed_choice_callers
                ), "Cutlass kernels should have been filtered, GEMM size is too small"
            torch.testing.assert_close(Y_compiled, Y)
| |
| @unittest.skipIf(not SM75OrLater, "need sm_75") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_max_autotune_precompile(self): |
| """ |
        Make sure autotuning mm in subprocesses works without crashes.
| """ |
| |
| if torch.version.hip: |
| return |
| |
| torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False |
| |
| def mm(a, b): |
| return a @ b |
| |
| a = torch.randn(100, 10).cuda().half() |
| b = torch.randn(10, 100).cuda().half() |
| |
| with config.patch( |
| { |
| "max_autotune": True, |
| "autotune_in_subproc": True, |
| "max_autotune_gemm_backends": "CUTLASS,Triton,ATen", |
| "compile_threads": 4, |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 2, |
| } |
| ): |
| Y_compiled = torch.compile(mm, dynamic=False)(a, b) |
| Y = mm(a, b) |
| torch.testing.assert_close(Y_compiled, Y) |
| |
| # TODO: Enable dynamic test cases when dynamic support is added. |
| @unittest.skipIf(not SM75OrLater, "need sm_75") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @parametrize("dynamic", (False, True)) |
| @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen,Triton,CUTLASS")) |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_max_autotune_cutlass_backend_regular_mm( |
| self, dynamic: bool, max_autotune_gemm_backends: str |
| ): |
| """ |
        Make sure autotuning mm with the CUTLASS backend enabled works without crashes (tuning in-process).
| """ |
| |
| if max_autotune_gemm_backends == "CUTLASS" and torch.version.hip: |
| return |
| |
| torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False |
| |
| def mm(a, b): |
| return a @ b |
| |
| a = torch.randn(128, 16).cuda().half() |
| b = torch.randn(16, 128).cuda().half() |
| |
| with config.patch( |
| { |
| "max_autotune": True, |
| "autotune_in_subproc": False, |
| "max_autotune_gemm_backends": max_autotune_gemm_backends, |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 2, |
| } |
| ): |
| Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b) |
| Y = mm(a, b) |
| torch.testing.assert_close(Y_compiled, Y) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_max_autotune_cutlass_backend_regular_mm_streamk( |
| self, dynamic: bool = False, max_autotune_gemm_backends: str = "CUTLASS" |
| ): |
| """ |
        Make sure autotuning mm in subprocesses works without crashes when restricted to stream-k CUTLASS kernels.
| """ |
| |
| if max_autotune_gemm_backends == "CUTLASS" and torch.version.hip: |
| return |
| |
| torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False |
| |
| def mm(a, b): |
| return a @ b |
| |
| a = torch.randn(128, 16).cuda().half() |
| b = torch.randn(16, 128).cuda().half() |
| |
| with config.patch( |
| { |
| "max_autotune": True, |
| "autotune_in_subproc": True, |
| "max_autotune_gemm_backends": max_autotune_gemm_backends, |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 2, |
| "cuda.cutlass_op_allowlist_regex": "stream_k", # only stream-k GEMM Kernels |
| } |
| ): |
            for M, K, N in (
                (128, 16, 128),
                (1024, 256, 1024),
                (16384, 1024, 16384),
                (16384, 1408, 16384),
            ):
| a = torch.randn(M, K).cuda().half() |
| b = torch.randn(K, N).cuda().half() |
| Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b) |
| Y = mm(a, b) |
                # We need relaxed numerical limits due to the sheer size of the
                # matmuls involved: many small addition differences add up.
| torch.testing.assert_close(Y_compiled, Y, atol=0.01, rtol=0.01) |
| |
| def _test_max_autotune_cutlass_backend_epilogue_fusion( |
| self, |
| dynamic: bool = False, |
| max_autotune_gemm_backends: str = "CUTLASS", |
| mixed_precision=False, |
| fp16=True, |
| expected_fuse_count=0, |
        mm: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
| batch_size: Optional[int] = None, |
| ): |
| torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = ( |
| mixed_precision |
| ) |
| |
        # Note: the ops that are available also depend on the alignment of the
        # shapes, so if these shapes don't all align to at least 8 elements it
        # can happen that no Cutlass 3.x op is available that allows fusions.
| if batch_size is None: |
| a = torch.randn(256, 32).cuda() |
| b = torch.randn(32, 256).cuda() |
| else: |
| a = torch.randn(batch_size, 256, 32).cuda() |
| b = torch.randn(batch_size, 32, 256).cuda() |
| if fp16: |
| a = a.half() |
| b = b.half() |
| |
| with config.patch( |
| { |
| "max_autotune": True, |
| "autotune_in_subproc": True, |
| "max_autotune_gemm_backends": max_autotune_gemm_backends, |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 4, |
| "cuda.version": "12.2", # required to enable the Kernels we need |
| } |
| ): |
| counters["inductor"]["cuda_epilogue_fusion_counter"] = 0 |
| Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b) |
| Y = mm(a, b) |
| actual_count = counters["inductor"]["cuda_epilogue_fusion_counter"] |
| assert ( |
| actual_count == expected_fuse_count |
| ), f"Expected fuse count of {expected_fuse_count} but got {actual_count}" |
| torch.testing.assert_close(Y_compiled, Y, atol=1e-2, rtol=1e-2) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(torch.version.hip, "HIP not supported") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| def test_max_autotune_cutlass_backend_simple_fusion_fp16(self): |
| def mm(a, b): |
| return (a @ b) * 3.0 |
| |
| # The pointwise ops seem to be pre-fused into a single Pointwise |
| self._test_max_autotune_cutlass_backend_epilogue_fusion( |
| mixed_precision=False, fp16=True, expected_fuse_count=0, mm=mm |
| ) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(torch.version.hip, "HIP not supported") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| def test_max_autotune_cutlass_backend_simple_fusion_fp16_fp32acc(self): |
| def mm(a, b): |
| return (a @ b) * 3.0 |
| |
| self._test_max_autotune_cutlass_backend_epilogue_fusion( |
| mixed_precision=True, fp16=True, expected_fuse_count=0, mm=mm |
| ) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(torch.version.hip, "HIP not supported") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| def test_max_autotune_cutlass_backend_chained_fusion_fp16(self): |
| def mm(a, b): |
| return (a @ b) * 3.3 - 1.234 |
| |
| # The pointwise ops seem to be pre-fused into a single Pointwise |
| self._test_max_autotune_cutlass_backend_epilogue_fusion( |
| mixed_precision=False, fp16=True, expected_fuse_count=0, mm=mm |
| ) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(torch.version.hip, "HIP not supported") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| def test_max_autotune_cutlass_backend_chained_fusion_fp16_fp32acc(self): |
| def mm(a, b): |
| return (a @ b) * 3.3 - 1.234 |
| |
| self._test_max_autotune_cutlass_backend_epilogue_fusion( |
| mixed_precision=True, fp16=True, expected_fuse_count=0, mm=mm |
| ) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(torch.version.hip, "HIP not supported") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| def test_max_autotune_cutlass_backend_relu_fusion_fp16(self): |
| def mm(a, b): |
| return torch.nn.functional.relu((a @ b) * 3.3 - 1.234) |
| |
| self._test_max_autotune_cutlass_backend_epilogue_fusion( |
| mixed_precision=False, fp16=True, expected_fuse_count=0, mm=mm |
| ) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(torch.version.hip, "HIP not supported") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| def test_max_autotune_cutlass_backend_relu_fusion_fp16_fp32acc(self): |
| def mm(a, b): |
| return torch.nn.functional.relu((a @ b) * 3.3 - 1.234) |
| |
| # The pointwise ops seem to be pre-fused into a single Pointwise |
| self._test_max_autotune_cutlass_backend_epilogue_fusion( |
| mixed_precision=True, fp16=True, expected_fuse_count=0, mm=mm |
| ) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(torch.version.hip, "HIP not supported") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| def test_max_autotune_cutlass_backend_relu6_fusion_fp16_fp32acc(self): |
| def mm(a, b): |
| return torch.clamp(torch.nn.functional.relu(a @ b), max=6.0) |
| |
| # The pointwise ops seem to be pre-fused into a single Pointwise |
| self._test_max_autotune_cutlass_backend_epilogue_fusion( |
| mixed_precision=True, fp16=True, expected_fuse_count=0, mm=mm |
| ) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(torch.version.hip, "HIP not supported") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| def test_max_autotune_cutlass_backend_no_fusion_dtype_mismatch(self): |
| def mm(a, b): |
| # this should not be fused, since the output dtype is different from the matmul dtype |
| return (a @ b).to(torch.float32) * 0.00001 |
| |
| self._test_max_autotune_cutlass_backend_epilogue_fusion( |
| mixed_precision=True, fp16=True, expected_fuse_count=0, mm=mm |
| ) |
| |
    @unittest.skipIf(not SM90OrLater, "need sm_90")
    @unittest.skipIf(torch.version.hip, "HIP not supported")
    @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup")
    def test_max_autotune_cutlass_backend_simple_bmm(self):
| def bmm(a, b): |
| return torch.bmm(a, b) |
| |
| self._test_max_autotune_cutlass_backend_epilogue_fusion( # test bmm |
| mixed_precision=False, |
| fp16=True, |
| expected_fuse_count=0, |
| mm=bmm, |
| batch_size=10, |
| ) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(torch.version.hip, "HIP not supported") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| def test_max_autotune_cutlass_backend_shape_dependent_normalization_fusion(self): |
| def mm(a, b): |
| return (a @ b) / b.size(1) |
| |
| self._test_max_autotune_cutlass_backend_epilogue_fusion( |
| mixed_precision=True, fp16=True, expected_fuse_count=0, mm=mm |
| ) |
| |
| # TODO: Enable dynamic test cases when dynamic support is added. |
| @unittest.skipIf(not SM75OrLater, "need sm_75") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @parametrize("dynamic", (False,)) |
| @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen,Triton,CUTLASS")) |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_max_autotune_cutlass_backend_mm_bias( |
| self, dynamic: bool = False, max_autotune_gemm_backends: str = "CUTLASS" |
| ): |
| """ |
        Make sure autotuning a linear op (mm with bias) in subprocesses works without crashes.
| """ |
| |
| if max_autotune_gemm_backends == "CUTLASS" and torch.version.hip: |
| return |
| |
| torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False |
| |
| def mm(a, b, bias): |
| return torch.nn.functional.linear(a, b, bias) |
| |
| a = torch.randn(2048, 4096).cuda().half() |
| bias = torch.randn(2048).cuda().half() |
| |
| with config.patch( |
| { |
| "max_autotune": True, |
| "autotune_in_subproc": True, |
| "max_autotune_gemm_backends": max_autotune_gemm_backends, |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 2, |
| } |
| ): |
| Y = mm(a, a, bias) |
| Y_compiled = torch.compile(mm, dynamic=dynamic)(a, a, bias) |
| torch.testing.assert_close(Y_compiled, Y, atol=1e-1, rtol=1e-1) |
| |
| @unittest.skipIf(not SM75OrLater, "need sm_75") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @parametrize("dynamic", (False,)) |
| @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen,Triton,CUTLASS")) |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_max_autotune_cutlass_backend_addmm( |
| self, dynamic, max_autotune_gemm_backends |
| ): |
| """ |
        Make sure autotuning addmm in subprocesses works without crashes.
| """ |
| |
| if max_autotune_gemm_backends == "CUTLASS" and torch.version.hip: |
| return |
| |
| torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False |
| |
| def addmm(x, a, b, alpha, beta): |
| return torch.addmm(x, a, b, alpha=alpha, beta=beta) |
| |
| def compare_results( |
| m: int, k: int, n: int, alpha: float, beta: float, x_shape: List[int] |
| ) -> None: |
| x = torch.randn(x_shape).cuda().half() |
| a = torch.randn(m, k).cuda().half() |
| b = torch.randn(k, n).cuda().half() |
| y_expected = addmm(x, a, b, alpha, beta) |
| |
| compiled_fn = torch.compile(addmm, dynamic=dynamic) |
| y = compiled_fn(x, a, b, alpha, beta) |
| torch.testing.assert_close(y, y_expected) |
| |
| with config.patch( |
| { |
| "max_autotune": True, |
| # Some Cutlass Kernels fail with IMA on this example, which leads to unrecoverable CUDA errors |
| # unless we tune in a subproc here. |
| "autotune_in_subproc": True, |
| "max_autotune_gemm_backends": max_autotune_gemm_backends, |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 4, |
| "cuda.cutlass_op_allowlist_regex": "", |
| "cuda.cutlass_op_denylist_regex": "pingpong", # Pingpong Kernels can lead to numerical issues |
| } |
| ): |
| # No broadcast |
| compare_results(4096, 25728, 2048, 2.0, 0.4, [4096, 2048]) |
| # Broadcast first dim. |
| compare_results(4096, 25728, 2048, 2.0, 0.4, [2048]) |
| # Broadcast last dim. |
| compare_results(4096, 25728, 2048, 2.0, 0.4, [4096, 1]) |
| |
| # TODO: Enable dynamic test cases when dynamic support is added. |
| @unittest.skipIf(not SM80OrLater, "need sm_80") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @parametrize("dynamic", (False,)) |
| @parametrize("max_autotune_gemm_backends", ("CUTLASS", "CUTLASS,ATen")) |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_max_autotune_cutlass_backend_int_mm( |
| self, dynamic: bool, max_autotune_gemm_backends: str |
| ): |
| """ |
        Make sure autotuning torch._int_mm in subprocesses works without crashes.
| """ |
| |
| if "CUTLASS" in max_autotune_gemm_backends.upper() and torch.version.hip: |
| return |
| |
| def mm(a, b): |
| return torch._int_mm(a, b) |
| |
| # CUTLASS only supports row-major/column-major combination of |
| # layouts for this operation, thus the transpose of tensor b |
| # (on the other side, Triton at the moment doesn't support |
| # this combination, so it's excluded from the test). Also, |
| # for CUTLASS alignment requirements, number of columns in |
| # both tensors has to be divisible by 16. |
| a = torch.randint(0, 5, (100, 16), dtype=torch.int8).cuda() |
| b = torch.randint(0, 5, (32, 16), dtype=torch.int8).cuda().T |
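        # With these shapes, a is (100, 16) int8 row-major and b is (16, 32) int8
        # column-major, so torch._int_mm produces a (100, 32) int32 result.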
| |
| with config.patch( |
| { |
| "max_autotune": True, |
| "autotune_in_subproc": True, |
| "max_autotune_gemm_backends": max_autotune_gemm_backends, |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 2, |
| } |
| ): |
| Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b) |
| Y = mm(a, b) |
| torch.testing.assert_close(Y_compiled, Y) |
| |
| # TODO: Enable dynamic test cases when dynamic support is added. |
| @unittest.skipIf(not SM80OrLater, "need sm_80") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @parametrize("dynamic", (False,)) |
| @parametrize("max_autotune_gemm_backends", ("CUTLASS", "CUTLASS,Triton,ATen")) |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_max_autotune_cutlass_backend_mixed_mm( |
| self, dynamic: bool, max_autotune_gemm_backends: str |
| ): |
| """ |
        Make sure autotuning a mixed-dtype mm (fp16 x int8) in subprocesses works without crashes.
| """ |
| |
| if max_autotune_gemm_backends == "CUTLASS" and torch.version.hip: |
| return |
| |
| torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False |
| |
| def mm(a, b): |
| return torch.mm(a, b.to(torch.half)) |
| |
| # CUTLASS only supports row-major/column-major combination of |
| # layouts for this operation, thus the transpose of tensor b. |
| # Also, for CUTLASS alignment requirements, number of columns |
| # of the first tensor has to be divisible by 16. |
| a = torch.randn(100, 16).cuda().half() |
| b = torch.randint(0, 5, (100, 16), dtype=torch.int8).cuda().T |
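        # Here a is (100, 16) fp16 row-major and b is (16, 100) int8 column-major;
        # inside mm, b is upcast to fp16 before the matmul.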
| |
| with config.patch( |
| { |
| "max_autotune": True, |
| "autotune_in_subproc": True, |
| "max_autotune_gemm_backends": max_autotune_gemm_backends, |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 2, |
| "use_mixed_mm": True, |
| } |
| ): |
| Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b) |
| Y = mm(a, b) |
| torch.testing.assert_close(Y_compiled, Y) |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_cutlass_backend_op_denylist( |
| self, |
| ): |
| def my_addmm(x, a, b, alpha, beta): |
| return torch.addmm(x, a, b, alpha=beta, beta=alpha) |
| |
| x = torch.randn((128, 128)).cuda().half() |
| a = torch.randn(128, 128).cuda().half() |
| b = torch.randn(128, 128).cuda().half() |
| |
| def select_no_algorithm(*args, **kwargs): |
| raise NoValidChoicesError |
| |
| with fresh_inductor_cache(): |
| with config.patch( |
| { |
| "max_autotune": True, |
                    # autotune_select_algorithm is mocked to raise NoValidChoicesError
                    # below, so no kernels are actually benchmarked and in-process
                    # tuning is safe here.
                    "autotune_in_subproc": False,
| "max_autotune_gemm_backends": "CUTLASS,ATen", |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 2, |
| "cuda.cutlass_op_allowlist_regex": "", |
| "cuda.cutlass_op_denylist_regex": "pingpong", # Pingpong Kernels can lead to numerical issues |
| } |
| ): |
| with mock.patch( |
| "torch._inductor.kernel.mm.autotune_select_algorithm", |
| wraps=select_no_algorithm, |
| ) as sa: |
| torch.compile(my_addmm, dynamic=False)(x, a, b, 1.0, 2.0) |
| args, kwargs = sa.call_args |
| op_name, choices, _, __ = args |
| assert op_name == "addmm" |
| cuda_template_count = 0 |
| for choice in choices: |
| if isinstance(choice, CUDATemplateCaller): |
| choice_info = choice.info_dict() |
| assert ( |
| "pingpong" not in choice_info["op_conf_name"] |
| ), "All pingpong Kernels should have been filtered" |
| cuda_template_count += 1 |
| assert cuda_template_count > 0, "No CUDATemplateCaller choices" |
| |
| @unittest.skipIf(not SM90OrLater, "need sm_90") |
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_cutlass_backend_op_allowlist( |
| self, |
| ): |
| def addmm(x, a, b, alpha, beta): |
| return torch.addmm(x, a, b, alpha=alpha, beta=beta) |
| |
| x = torch.randn((128, 128)).cuda().half() |
| a = torch.randn(128, 128).cuda().half() |
| b = torch.randn(128, 128).cuda().half() |
| |
| def select_no_algorithm(*args, **kwargs): |
| raise NoValidChoicesError |
| |
| with fresh_inductor_cache(): |
| with config.patch( |
| { |
| "max_autotune": True, |
                    # autotune_select_algorithm is mocked to raise NoValidChoicesError
                    # below, so no kernels are actually benchmarked and in-process
                    # tuning is safe here.
                    "autotune_in_subproc": False,
| "max_autotune_gemm_backends": "CUTLASS,ATen", |
| "cuda.cutlass_dir": _CUTLASS_DIR, |
| "cuda.cutlass_max_profiling_configs": 2, |
| "cuda.cutlass_op_allowlist_regex": "pingpong", |
| "cuda.cutlass_op_denylist_regex": None, # Pingpong Kernels can lead to numerical issues |
| } |
| ): |
| with mock.patch( |
| "torch._inductor.kernel.mm.autotune_select_algorithm", |
| wraps=select_no_algorithm, |
| ) as sa: |
| torch.compile(addmm, dynamic=False)(x, a, b, 1.0, 1.0) |
| args, kwargs = sa.call_args |
| op_name, choices, _, __ = args |
| assert op_name == "addmm" |
| cuda_template_count = 0 |
| for choice in choices: |
| if isinstance(choice, CUDATemplateCaller): |
| choice_info = choice.info_dict() |
| assert ( |
| "pingpong" in choice_info["op_conf_name"] |
| ), "Only pingpong Kernels should have been allowed" |
| cuda_template_count += 1 |
| assert cuda_template_count > 0, "No CUDATemplateCaller choices" |
| |
    @unittest.skipIf(not SM80OrLater, "need sm_80")
| @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") |
| @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) |
| def test_get_max_alignment(self): |
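        # get_max_alignment appears to return the largest alignment, in elements
        # (capped by a 16-byte vectorized access for the dtype: 8 for fp16, 4 for
        # fp32), that evenly divides the layout's offset, its nonzero strides and
        # the size of its contiguous dimension; the cases below exercise each of
        # these constraints.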
| l4 = FixedLayout("cpu", torch.half, size=(1, 2, 4), stride=(0, 4, 1)) |
| m4 = get_max_alignment(l4) |
| self.assertEqual( |
| m4, 4, "Wrong max alignment. Should have been 4. (simple, contiguous case)" |
| ) |
| |
| l4_2 = FixedLayout("cpu", torch.half, size=(1, 4, 2), stride=(0, 1, 4)) |
| m4_2 = get_max_alignment(l4_2) |
| self.assertEqual( |
| m4_2, |
| 4, |
| "Wrong max alignment. Should have been 4. Did not deal with strides correctly", |
| ) |
| |
| l1 = FixedLayout("cpu", torch.half, size=(2, 4, 2), stride=(23, 1, 4)) |
| m1 = get_max_alignment(l1) |
| self.assertEqual( |
| m1, |
| 1, |
| "Wrong max alignment. Should have been 1. Did not take stride into account correctly", |
| ) |
| |
| l2 = FixedLayout("cpu", torch.half, size=(1, 2, 4), stride=(0, 4, 1), offset=6) |
| m2 = get_max_alignment(l2) |
| self.assertEqual( |
| m2, 2, "Wrong max alignment. Should have been 2. (due to choice of offset)" |
| ) |
| |
| l8 = FixedLayout( |
| "cpu", torch.half, size=(2, 2, 8), stride=(32, 8, 1), offset=24 |
| ) |
| m8 = get_max_alignment(l8) |
| self.assertEqual(m8, 8, "Wrong max alignment. Should have been 8.") |
| |
| l4 = FixedLayout( |
| "cpu", torch.float32, size=(2, 2, 8), stride=(32, 8, 1), offset=24 |
| ) |
| m4 = get_max_alignment(l4) |
| self.assertEqual( |
| m4, 4, "Wrong max alignment. Should have been 4 (due to float32 dtype )." |
| ) |
| |
| |
| if __name__ == "__main__": |
| from torch._inductor.utils import is_big_gpu |
| |
| # Set env to make it work in CI. |
| if HAS_CUDA and HAS_CPU and is_big_gpu(0): |
| run_tests() |