Add tests to bsr_dense_addmm_meta. Tune bsr_dense_addmm kernel for ViT shapes. (#132646)

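As in the title. In detail:

- Add `test_triton_bsr_dense_addmm_meta` to test/test_sparse_csr.py.
- Add a `_version` argument to `bsr_dense_addmm_meta` so that lookups can
  bypass existing tuning results (used by the new test).
- Run the approximate-parameter search also when the requested sparsity is
  0.5, and rescale `SPLIT_N` so that the `N // SPLIT_N` ratio of the matched
  entry is preserved.
- Emit a `warn_once` message when no tuned parameters are found and the
  default kernel parameters are used.
- Add tuned `bsr_dense_addmm` parameters for the (768, 3072) and (3072, 768)
  shapes and for N=50432.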

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132646
Approved by: https://github.com/cpuhrsch
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index f61136b..14e8a85 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -2,10 +2,12 @@
 
 import torch
 import random
+import io
 import itertools
 import unittest
 import functools
-from torch.testing import make_tensor
+from contextlib import redirect_stderr
+from torch.testing import make_tensor, FileCheck
 from torch.testing._internal.common_cuda import SM53OrLater, SM80OrLater, TEST_CUSPARSE_GENERIC
 from torch.testing._internal.common_utils import \
     (TEST_WITH_TORCHINDUCTOR, TEST_WITH_ROCM, TEST_SCIPY, TEST_NUMPY, TEST_MKL, IS_WINDOWS, TestCase, run_tests,
@@ -4134,6 +4136,76 @@
         result = operation(*args, **dict(meta=meta))
         self.assertEqual(result, expected)
 
+    @onlyCUDA
+    @skipIfRocm
+    @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "Test requires Triton")
+    def test_triton_bsr_dense_addmm_meta(self, device):
+        from torch.sparse._triton_ops import bsr_dense_addmm_meta
+        from torch.sparse._triton_ops_meta import update as update_bsr_dense_addmm_meta
+
+        dtype = torch.float32
+        Ms = Ks = 16
+        beta = 0.0
+        alpha = 1.0
+
+        def get_meta(M, K, N, sparsity=None):
+            return bsr_dense_addmm_meta(M, K, N, Ms, Ks, beta, alpha, dtype=dtype, sparsity=sparsity,
+                                        _version="test_triton_bsr_dense_addmm_meta")
+
+        def update_meta(M, K, N, value, sparsity=0.5):
+            key = (M, K, N, Ms, Ks, beta == 0, beta == 1, alpha == 1)
+            update_bsr_dense_addmm_meta("bsr_dense_addmm", torch.cuda.get_device_name(),
+                                        ("test_triton_bsr_dense_addmm_meta", dtype, sparsity),
+                                        key, value)
+
+        def get_meta_with_checks(M, K, N, warn_count=0, sparsity=None):
+            f = io.StringIO()
+            with redirect_stderr(f):
+                result = get_meta(M, K, N, sparsity=sparsity)
+            msg = f.getvalue()
+            FileCheck().check_count(
+                str=f"UserWarning: bsr_dense_addmm uses non-optimal triton kernel parameters for M={M} K={K} N={N}",
+                count=warn_count, exactly=True
+            ).run(msg)
+            return result
+
+        # Test warn_once when requesting non-existing tuned parameters multiple times
+        f = io.StringIO()
+        with redirect_stderr(f):
+            for i in range(5):
+                get_meta(16, 16, 16)
+            for i in range(5):
+                get_meta(16, 16, 32)
+
+        msg = f.getvalue()
+        FileCheck().check_count(
+            str="UserWarning: bsr_dense_addmm uses non-optimal triton kernel parameters for M=16 K=16 N=16", count=1, exactly=True
+        ).run(msg)
+        FileCheck().check_count(
+            str="UserWarning: bsr_dense_addmm uses non-optimal triton kernel parameters for M=16 K=16 N=32", count=1, exactly=True
+        ).run(msg)
+
+        # Test warn_once when tuned parameters are missing
+        default_meta = dict(GROUP_SIZE_ROW=4, SPLIT_N=2, num_stages=1, num_warps=4)
+        self.assertEqual(get_meta_with_checks(32, 32, 32, warn_count=1), default_meta)
+
+        # Test (no)warn_once when tuned parameters are available
+        update_meta(32, 32, 48, (2, 8, 5, 6))
+        expected_meta = dict(GROUP_SIZE_ROW=2, SPLIT_N=8, num_stages=5, num_warps=6)
+        self.assertEqual(get_meta_with_checks(32, 32, 48, warn_count=0), expected_meta)
+
+        # Test fallback to the parameters tuned for the default sparsity 0.5
+        # when no tuned parameters exist for the requested non-default sparsity
+        self.assertEqual(get_meta_with_checks(32, 32, 48, warn_count=0, sparsity=0.6), expected_meta)
+
+        # Test non-existing tuned parameters while there exist
+        # parameters with a consistent N // SPLIT_N ratio:
+        self.assertEqual(get_meta_with_checks(32, 32, 72, warn_count=0),
+                         dict(GROUP_SIZE_ROW=2, SPLIT_N=12, num_stages=5, num_warps=6))
+        # ... or not:
+        self.assertEqual(get_meta_with_checks(32, 32, 64, warn_count=1),
+                         dict(GROUP_SIZE_ROW=4, SPLIT_N=4, num_stages=1, num_warps=4))
+
 
 # e.g., TestSparseCSRCPU and TestSparseCSRCUDA
 instantiate_device_type_tests(TestSparseCSR, globals())
diff --git a/torch/sparse/_triton_ops.py b/torch/sparse/_triton_ops.py
index ccd29b9..4f7f0dc 100644
--- a/torch/sparse/_triton_ops.py
+++ b/torch/sparse/_triton_ops.py
@@ -7,6 +7,7 @@
 from typing import Optional, Tuple
 
 import torch
+from torch._dynamo.utils import warn_once
 from torch.utils._triton import has_triton
 
 from ._triton_ops_meta import get_meta
@@ -748,8 +749,12 @@
     num_stages=None,
     sparsity=None,
     dtype=None,
+    _version=0,
     **extra,
 ):
+    # Specifying _version is useful for situations when one wants to
+    # discard existing triton kernel tuning results, say, in testing
+    # bsr_dense_addmm_meta functionality.
     if dtype is None:
         dtype = torch.float16
     if sparsity is None:
@@ -758,27 +763,39 @@
         device_name = torch.cuda.get_device_name()
         key = (M, K, N, Ms, Ks, beta == 0, beta == 1, alpha == 1)
         meta = get_meta(
-            "bsr_dense_addmm", key, device_name, version=(0, dtype, sparsity)
+            "bsr_dense_addmm", key, device_name, version=(_version, dtype, sparsity)
         )
         if meta is None and sparsity != 0.5:
             meta = get_meta(
-                "bsr_dense_addmm", key, device_name, version=(0, dtype, 0.5)
+                "bsr_dense_addmm", key, device_name, version=(_version, dtype, 0.5)
             )
-            if meta is None:
-                # find approximate meta such that N % SPLIT_N == 0.
-                matching_meta = get_meta(
-                    "bsr_dense_addmm",
-                    (*key[:2], "*", *key[3:]),
-                    device_name,
-                    version=(0, dtype, 0.5),
-                )
-                for mkey in sorted(matching_meta or {}):
-                    meta_ = matching_meta[mkey]
-                    if N % meta_["SPLIT_N"] == 0 and mkey[2] <= N:
-                        meta = meta_
+        if meta is None:
+            # find approximate meta whose n // SPLIT_N ratio divides N.
+            matching_meta = get_meta(
+                "bsr_dense_addmm",
+                (*key[:2], "*", *key[3:]),
+                device_name,
+                version=(_version, dtype, 0.5),
+            )
+            for mkey in sorted(matching_meta or {}):
+                meta_ = matching_meta[mkey]
+                n = mkey[2]
+                split_n = meta_["SPLIT_N"]
+                c = n // split_n
+                if N % c == 0 and n <= N:
+                    meta = dict(meta_)
+                    meta["SPLIT_N"] = N // c
         if meta is not None:
             meta.update(**extra)
             return meta
+        else:
+            # see [Computing optimal kernel parameters] in
+            # _triton_ops_meta.py for ways to avoid this warning
+            # message
+            warn_once(
+                f"bsr_dense_addmm uses non-optimal triton kernel parameters for {M=} {K=} {N=} {Ms=}, {Ks=} {beta=} {alpha=}"
+            )
+
     SPLIT_N = SPLIT_N or max(N // Ms, 1)
     GROUP_SIZE_ROW = GROUP_SIZE_ROW or 4
     num_stages = num_stages or 1
diff --git a/torch/sparse/_triton_ops_meta.py b/torch/sparse/_triton_ops_meta.py
index ef61a1c..8672e7e 100644
--- a/torch/sparse/_triton_ops_meta.py
+++ b/torch/sparse/_triton_ops_meta.py
@@ -716,9 +716,22 @@
 def main(op="scatter_mm", force=False, dtype=torch.float16, verbose=True):
     import itertools
 
-    sizes_lst = [256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
+    sizes_lst = [
+        256,
+        512,
+        1024,
+        2048,
+        4096,
+        8192,
+        16384,
+        32768,
+        65536,
+        131072,
+        50432,
+    ]
     sizes3_lst = [3 * sz for sz in [64, 128] + sizes_lst if sz <= 2048]
-    shapes_lst = [(sz, sz) for sz in sizes_lst[:-3] + sizes3_lst]
+    shapes_lst = [(sz, sz) for sz in sizes_lst[:-4] + sizes3_lst]
+    shapes_lst.extend([(3072, 768), (768, 3072)])
     blocksize_lst = [(16, 16), (32, 32), (64, 64), (128, 128)]
     sparsity_lst = [0.5, 0.7, 0.3][:1]
     for sparsity in sparsity_lst:
@@ -734,6 +747,8 @@
                         M, K, N, BM, BK, force=force, sparsity=sparsity, dtype=dtype
                     )
                 elif op == "bsr_dense_addmm":
+                    if M == K and N == 50432:
+                        continue
                     print(f"{M, K, N, (BM, BK)=}")
                     for alpha, beta in [(1, 1), (1, 0)]:
                         optimize_bsr_dense_addmm(
@@ -1414,6 +1429,94 @@
         (768, 768, 131072, 64, 64, True, False, True): (2, 512, 3, 4),
         (768, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
         (768, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4),
+        (768, 3072, 256, 16, 16, False, True, True): (3, 8, 6, 1),
+        (768, 3072, 256, 16, 16, True, False, True): (1, 4, 6, 2),
+        (768, 3072, 256, 32, 32, False, True, True): (1, 8, 4, 4),
+        (768, 3072, 256, 32, 32, True, False, True): (3, 4, 6, 4),
+        (768, 3072, 256, 64, 64, False, True, True): (2, 4, 3, 4),
+        (768, 3072, 256, 64, 64, True, False, True): (1, 4, 4, 4),
+        (768, 3072, 256, 128, 128, False, True, True): (2, 2, 3, 8),
+        (768, 3072, 256, 128, 128, True, False, True): (1, 2, 3, 8),
+        (768, 3072, 512, 16, 16, False, True, True): (1, 8, 4, 2),
+        (768, 3072, 512, 16, 16, True, False, True): (1, 8, 5, 2),
+        (768, 3072, 512, 32, 32, False, True, True): (1, 16, 3, 2),
+        (768, 3072, 512, 32, 32, True, False, True): (1, 8, 5, 2),
+        (768, 3072, 512, 64, 64, False, True, True): (1, 8, 3, 4),
+        (768, 3072, 512, 64, 64, True, False, True): (3, 8, 4, 4),
+        (768, 3072, 512, 128, 128, False, True, True): (1, 4, 3, 8),
+        (768, 3072, 512, 128, 128, True, False, True): (2, 4, 3, 8),
+        (768, 3072, 1024, 16, 16, False, True, True): (1, 16, 1, 4),
+        (768, 3072, 1024, 16, 16, True, False, True): (5, 4, 4, 4),
+        (768, 3072, 1024, 32, 32, False, True, True): (3, 8, 3, 4),
+        (768, 3072, 1024, 32, 32, True, False, True): (1, 8, 4, 4),
+        (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 3, 4),
+        (768, 3072, 1024, 64, 64, True, False, True): (2, 16, 4, 4),
+        (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 3, 8),
+        (768, 3072, 1024, 128, 128, True, False, True): (5, 8, 3, 8),
+        (768, 3072, 2048, 16, 16, False, True, True): (3, 16, 1, 2),
+        (768, 3072, 2048, 16, 16, True, False, True): (1, 8, 3, 4),
+        (768, 3072, 2048, 32, 32, False, True, True): (4, 16, 1, 8),
+        (768, 3072, 2048, 32, 32, True, False, True): (3, 8, 3, 4),
+        (768, 3072, 2048, 64, 64, False, True, True): (2, 16, 3, 4),
+        (768, 3072, 2048, 64, 64, True, False, True): (2, 16, 3, 4),
+        (768, 3072, 2048, 128, 128, False, True, True): (3, 16, 3, 8),
+        (768, 3072, 2048, 128, 128, True, False, True): (4, 16, 3, 8),
+        (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 1, 4),
+        (768, 3072, 4096, 16, 16, True, False, True): (1, 16, 3, 1),
+        (768, 3072, 4096, 32, 32, False, True, True): (3, 32, 1, 8),
+        (768, 3072, 4096, 32, 32, True, False, True): (3, 16, 4, 4),
+        (768, 3072, 4096, 64, 64, False, True, True): (2, 32, 3, 4),
+        (768, 3072, 4096, 64, 64, True, False, True): (2, 16, 3, 4),
+        (768, 3072, 4096, 128, 128, False, True, True): (5, 32, 1, 4),
+        (768, 3072, 4096, 128, 128, True, False, True): (9, 32, 3, 8),
+        (768, 3072, 8192, 16, 16, False, True, True): (1, 32, 1, 4),
+        (768, 3072, 8192, 16, 16, True, False, True): (4, 64, 4, 2),
+        (768, 3072, 8192, 32, 32, False, True, True): (1, 64, 1, 8),
+        (768, 3072, 8192, 32, 32, True, False, True): (2, 64, 4, 2),
+        (768, 3072, 8192, 64, 64, False, True, True): (1, 64, 3, 4),
+        (768, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4),
+        (768, 3072, 8192, 128, 128, False, True, True): (2, 64, 3, 8),
+        (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 3, 8),
+        (768, 3072, 16384, 16, 16, False, True, True): (1, 64, 1, 4),
+        (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 4, 1),
+        (768, 3072, 16384, 32, 32, False, True, True): (1, 128, 1, 8),
+        (768, 3072, 16384, 32, 32, True, False, True): (1, 64, 3, 4),
+        (768, 3072, 16384, 64, 64, False, True, True): (1, 128, 3, 4),
+        (768, 3072, 16384, 64, 64, True, False, True): (4, 64, 3, 4),
+        (768, 3072, 16384, 128, 128, False, True, True): (2, 128, 3, 8),
+        (768, 3072, 16384, 128, 128, True, False, True): (2, 128, 3, 8),
+        (768, 3072, 32768, 16, 16, False, True, True): (1, 128, 1, 4),
+        (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 4, 1),
+        (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 1, 8),
+        (768, 3072, 32768, 32, 32, True, False, True): (1, 128, 3, 4),
+        (768, 3072, 32768, 64, 64, False, True, True): (1, 256, 3, 4),
+        (768, 3072, 32768, 64, 64, True, False, True): (1, 128, 3, 4),
+        (768, 3072, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
+        (768, 3072, 32768, 128, 128, True, False, True): (2, 256, 3, 8),
+        (768, 3072, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+        (768, 3072, 50432, 16, 16, True, False, True): (4, 197, 4, 4),
+        (768, 3072, 50432, 32, 32, False, True, True): (1, 197, 1, 4),
+        (768, 3072, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+        (768, 3072, 50432, 64, 64, False, True, True): (1, 394, 3, 4),
+        (768, 3072, 50432, 64, 64, True, False, True): (3, 197, 3, 4),
+        (768, 3072, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+        (768, 3072, 50432, 128, 128, True, False, True): (1, 394, 3, 8),
+        (768, 3072, 65536, 16, 16, False, True, True): (1, 256, 1, 4),
+        (768, 3072, 65536, 16, 16, True, False, True): (5, 256, 4, 1),
+        (768, 3072, 65536, 32, 32, False, True, True): (1, 256, 1, 4),
+        (768, 3072, 65536, 32, 32, True, False, True): (3, 256, 3, 4),
+        (768, 3072, 65536, 64, 64, False, True, True): (2, 512, 3, 4),
+        (768, 3072, 65536, 64, 64, True, False, True): (3, 256, 3, 4),
+        (768, 3072, 65536, 128, 128, False, True, True): (3, 512, 1, 4),
+        (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 3, 8),
+        (768, 3072, 131072, 16, 16, False, True, True): (1, 512, 1, 4),
+        (768, 3072, 131072, 16, 16, True, False, True): (5, 512, 4, 1),
+        (768, 3072, 131072, 32, 32, False, True, True): (1, 512, 1, 4),
+        (768, 3072, 131072, 32, 32, True, False, True): (4, 512, 3, 4),
+        (768, 3072, 131072, 64, 64, False, True, True): (1, 1024, 3, 4),
+        (768, 3072, 131072, 64, 64, True, False, True): (1, 512, 3, 4),
+        (768, 3072, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
+        (768, 3072, 131072, 128, 128, True, False, True): (1, 1024, 3, 8),
         (1024, 1024, 256, 16, 16, False, True, True): (1, 4, 5, 4),
         (1024, 1024, 256, 16, 16, True, False, True): (3, 4, 4, 4),
         (1024, 1024, 256, 32, 32, False, True, True): (4, 4, 5, 2),
@@ -1654,6 +1757,94 @@
         (2048, 2048, 131072, 64, 64, True, False, True): (4, 1024, 3, 4),
         (2048, 2048, 131072, 128, 128, False, True, True): (1, 1024, 1, 4),
         (2048, 2048, 131072, 128, 128, True, False, True): (2, 1024, 1, 4),
+        (3072, 768, 256, 16, 16, False, True, True): (6, 4, 1, 4),
+        (3072, 768, 256, 16, 16, True, False, True): (3, 1, 4, 4),
+        (3072, 768, 256, 32, 32, False, True, True): (6, 8, 1, 2),
+        (3072, 768, 256, 32, 32, True, False, True): (1, 2, 4, 4),
+        (3072, 768, 256, 64, 64, False, True, True): (1, 4, 4, 4),
+        (3072, 768, 256, 64, 64, True, False, True): (4, 2, 4, 4),
+        (3072, 768, 256, 128, 128, False, True, True): (1, 2, 3, 8),
+        (3072, 768, 256, 128, 128, True, False, True): (1, 2, 3, 8),
+        (3072, 768, 512, 16, 16, False, True, True): (2, 4, 1, 4),
+        (3072, 768, 512, 16, 16, True, False, True): (1, 4, 4, 1),
+        (3072, 768, 512, 32, 32, False, True, True): (3, 8, 1, 4),
+        (3072, 768, 512, 32, 32, True, False, True): (1, 2, 3, 4),
+        (3072, 768, 512, 64, 64, False, True, True): (1, 8, 1, 4),
+        (3072, 768, 512, 64, 64, True, False, True): (4, 4, 3, 4),
+        (3072, 768, 512, 128, 128, False, True, True): (1, 4, 3, 8),
+        (3072, 768, 512, 128, 128, True, False, True): (1, 4, 3, 8),
+        (3072, 768, 1024, 16, 16, False, True, True): (1, 8, 1, 4),
+        (3072, 768, 1024, 16, 16, True, False, True): (3, 4, 3, 1),
+        (3072, 768, 1024, 32, 32, False, True, True): (1, 8, 1, 8),
+        (3072, 768, 1024, 32, 32, True, False, True): (1, 4, 4, 4),
+        (3072, 768, 1024, 64, 64, False, True, True): (1, 16, 3, 4),
+        (3072, 768, 1024, 64, 64, True, False, True): (1, 4, 3, 4),
+        (3072, 768, 1024, 128, 128, False, True, True): (1, 8, 3, 8),
+        (3072, 768, 1024, 128, 128, True, False, True): (2, 8, 3, 8),
+        (3072, 768, 2048, 16, 16, False, True, True): (3, 8, 1, 4),
+        (3072, 768, 2048, 16, 16, True, False, True): (2, 8, 3, 4),
+        (3072, 768, 2048, 32, 32, False, True, True): (3, 16, 1, 8),
+        (3072, 768, 2048, 32, 32, True, False, True): (3, 8, 3, 4),
+        (3072, 768, 2048, 64, 64, False, True, True): (1, 16, 1, 4),
+        (3072, 768, 2048, 64, 64, True, False, True): (1, 16, 3, 4),
+        (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 3, 8),
+        (3072, 768, 2048, 128, 128, True, False, True): (2, 16, 2, 4),
+        (3072, 768, 4096, 16, 16, False, True, True): (1, 16, 1, 4),
+        (3072, 768, 4096, 16, 16, True, False, True): (4, 32, 4, 2),
+        (3072, 768, 4096, 32, 32, False, True, True): (2, 32, 1, 8),
+        (3072, 768, 4096, 32, 32, True, False, True): (7, 16, 3, 4),
+        (3072, 768, 4096, 64, 64, False, True, True): (2, 32, 1, 4),
+        (3072, 768, 4096, 64, 64, True, False, True): (2, 16, 2, 4),
+        (3072, 768, 4096, 128, 128, False, True, True): (1, 32, 3, 8),
+        (3072, 768, 4096, 128, 128, True, False, True): (3, 32, 2, 4),
+        (3072, 768, 8192, 16, 16, False, True, True): (2, 32, 1, 4),
+        (3072, 768, 8192, 16, 16, True, False, True): (4, 64, 4, 2),
+        (3072, 768, 8192, 32, 32, False, True, True): (4, 32, 1, 4),
+        (3072, 768, 8192, 32, 32, True, False, True): (4, 32, 3, 4),
+        (3072, 768, 8192, 64, 64, False, True, True): (2, 64, 1, 4),
+        (3072, 768, 8192, 64, 64, True, False, True): (4, 32, 2, 4),
+        (3072, 768, 8192, 128, 128, False, True, True): (3, 64, 1, 4),
+        (3072, 768, 8192, 128, 128, True, False, True): (6, 64, 2, 4),
+        (3072, 768, 16384, 16, 16, False, True, True): (1, 64, 1, 4),
+        (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 1, 1),
+        (3072, 768, 16384, 32, 32, False, True, True): (1, 64, 1, 4),
+        (3072, 768, 16384, 32, 32, True, False, True): (4, 64, 3, 4),
+        (3072, 768, 16384, 64, 64, False, True, True): (4, 128, 1, 4),
+        (3072, 768, 16384, 64, 64, True, False, True): (4, 64, 2, 4),
+        (3072, 768, 16384, 128, 128, False, True, True): (3, 128, 1, 4),
+        (3072, 768, 16384, 128, 128, True, False, True): (4, 128, 2, 4),
+        (3072, 768, 32768, 16, 16, False, True, True): (1, 128, 1, 4),
+        (3072, 768, 32768, 16, 16, True, False, True): (8, 128, 4, 1),
+        (3072, 768, 32768, 32, 32, False, True, True): (1, 128, 1, 4),
+        (3072, 768, 32768, 32, 32, True, False, True): (8, 128, 3, 4),
+        (3072, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4),
+        (3072, 768, 32768, 64, 64, True, False, True): (1, 128, 2, 4),
+        (3072, 768, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
+        (3072, 768, 32768, 128, 128, True, False, True): (8, 256, 2, 4),
+        (3072, 768, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+        (3072, 768, 50432, 16, 16, True, False, True): (7, 197, 4, 1),
+        (3072, 768, 50432, 32, 32, False, True, True): (1, 197, 1, 4),
+        (3072, 768, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+        (3072, 768, 50432, 64, 64, False, True, True): (1, 394, 1, 4),
+        (3072, 768, 50432, 64, 64, True, False, True): (3, 197, 2, 4),
+        (3072, 768, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+        (3072, 768, 50432, 128, 128, True, False, True): (8, 394, 2, 4),
+        (3072, 768, 65536, 16, 16, False, True, True): (1, 256, 1, 4),
+        (3072, 768, 65536, 16, 16, True, False, True): (15, 256, 4, 1),
+        (3072, 768, 65536, 32, 32, False, True, True): (1, 256, 1, 4),
+        (3072, 768, 65536, 32, 32, True, False, True): (15, 256, 3, 4),
+        (3072, 768, 65536, 64, 64, False, True, True): (1, 512, 1, 4),
+        (3072, 768, 65536, 64, 64, True, False, True): (2, 256, 2, 4),
+        (3072, 768, 65536, 128, 128, False, True, True): (3, 512, 1, 4),
+        (3072, 768, 65536, 128, 128, True, False, True): (3, 512, 2, 4),
+        (3072, 768, 131072, 16, 16, False, True, True): (1, 512, 1, 4),
+        (3072, 768, 131072, 16, 16, True, False, True): (15, 512, 4, 1),
+        (3072, 768, 131072, 32, 32, False, True, True): (1, 512, 1, 4),
+        (3072, 768, 131072, 32, 32, True, False, True): (9, 512, 3, 4),
+        (3072, 768, 131072, 64, 64, False, True, True): (1, 1024, 1, 4),
+        (3072, 768, 131072, 64, 64, True, False, True): (3, 512, 2, 4),
+        (3072, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
+        (3072, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4),
         (3072, 3072, 256, 16, 16, False, True, True): (5, 4, 1, 4),
         (3072, 3072, 256, 16, 16, True, False, True): (1, 2, 5, 2),
         (3072, 3072, 256, 32, 32, False, True, True): (5, 4, 1, 8),
@@ -2418,6 +2609,14 @@
         (256, 256, 32768, 64, 64, True, False, True): (2, 256, 1, 4),
         (256, 256, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
         (256, 256, 32768, 128, 128, True, False, True): (2, 256, 1, 4),
+        (256, 256, 50432, 16, 16, False, True, True): (4, 197, 1, 4),
+        (256, 256, 50432, 16, 16, True, False, True): (4, 197, 3, 2),
+        (256, 256, 50432, 32, 32, False, True, True): (1, 394, 1, 2),
+        (256, 256, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+        (256, 256, 50432, 64, 64, False, True, True): (6, 394, 1, 4),
+        (256, 256, 50432, 64, 64, True, False, True): (4, 394, 2, 4),
+        (256, 256, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+        (256, 256, 50432, 128, 128, True, False, True): (1, 394, 2, 4),
         (256, 256, 65536, 16, 16, False, True, True): (1, 256, 3, 2),
         (256, 256, 65536, 16, 16, True, False, True): (1, 256, 3, 2),
         (256, 256, 65536, 32, 32, False, True, True): (1, 512, 3, 2),
@@ -2558,6 +2757,14 @@
         (512, 512, 32768, 64, 64, True, False, True): (1, 256, 3, 4),
         (512, 512, 32768, 128, 128, False, True, True): (5, 256, 1, 4),
         (512, 512, 32768, 128, 128, True, False, True): (1, 256, 1, 4),
+        (512, 512, 50432, 16, 16, False, True, True): (4, 197, 1, 4),
+        (512, 512, 50432, 16, 16, True, False, True): (4, 197, 3, 2),
+        (512, 512, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+        (512, 512, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+        (512, 512, 50432, 64, 64, False, True, True): (2, 394, 1, 4),
+        (512, 512, 50432, 64, 64, True, False, True): (4, 197, 2, 4),
+        (512, 512, 50432, 128, 128, False, True, True): (5, 394, 1, 4),
+        (512, 512, 50432, 128, 128, True, False, True): (6, 394, 2, 4),
         (512, 512, 65536, 16, 16, False, True, True): (1, 256, 3, 2),
         (512, 512, 65536, 16, 16, True, False, True): (1, 256, 3, 1),
         (512, 512, 65536, 32, 32, False, True, True): (1, 512, 3, 2),
@@ -2654,6 +2861,94 @@
         (768, 768, 131072, 64, 64, True, False, True): (3, 512, 3, 4),
         (768, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
         (768, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4),
+        (768, 3072, 256, 16, 16, False, True, True): (1, 8, 5, 2),
+        (768, 3072, 256, 16, 16, True, False, True): (3, 4, 7, 2),
+        (768, 3072, 256, 32, 32, False, True, True): (1, 8, 4, 2),
+        (768, 3072, 256, 32, 32, True, False, True): (1, 4, 5, 4),
+        (768, 3072, 256, 64, 64, False, True, True): (1, 4, 3, 4),
+        (768, 3072, 256, 64, 64, True, False, True): (1, 4, 5, 4),
+        (768, 3072, 256, 128, 128, False, True, True): (2, 2, 3, 8),
+        (768, 3072, 256, 128, 128, True, False, True): (2, 2, 3, 8),
+        (768, 3072, 512, 16, 16, False, True, True): (1, 8, 5, 2),
+        (768, 3072, 512, 16, 16, True, False, True): (1, 8, 5, 2),
+        (768, 3072, 512, 32, 32, False, True, True): (3, 8, 3, 4),
+        (768, 3072, 512, 32, 32, True, False, True): (1, 8, 7, 4),
+        (768, 3072, 512, 64, 64, False, True, True): (3, 8, 3, 4),
+        (768, 3072, 512, 64, 64, True, False, True): (3, 8, 5, 4),
+        (768, 3072, 512, 128, 128, False, True, True): (1, 4, 3, 8),
+        (768, 3072, 512, 128, 128, True, False, True): (1, 4, 3, 8),
+        (768, 3072, 1024, 16, 16, False, True, True): (4, 16, 1, 4),
+        (768, 3072, 1024, 16, 16, True, False, True): (2, 8, 5, 2),
+        (768, 3072, 1024, 32, 32, False, True, True): (1, 16, 6, 2),
+        (768, 3072, 1024, 32, 32, True, False, True): (1, 8, 4, 4),
+        (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 4, 4),
+        (768, 3072, 1024, 64, 64, True, False, True): (2, 16, 4, 4),
+        (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 3, 8),
+        (768, 3072, 1024, 128, 128, True, False, True): (3, 8, 3, 8),
+        (768, 3072, 2048, 16, 16, False, True, True): (1, 16, 1, 2),
+        (768, 3072, 2048, 16, 16, True, False, True): (1, 16, 5, 2),
+        (768, 3072, 2048, 32, 32, False, True, True): (4, 16, 1, 8),
+        (768, 3072, 2048, 32, 32, True, False, True): (2, 8, 3, 4),
+        (768, 3072, 2048, 64, 64, False, True, True): (2, 16, 3, 4),
+        (768, 3072, 2048, 64, 64, True, False, True): (2, 16, 3, 4),
+        (768, 3072, 2048, 128, 128, False, True, True): (3, 16, 3, 8),
+        (768, 3072, 2048, 128, 128, True, False, True): (1, 16, 3, 8),
+        (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 1, 4),
+        (768, 3072, 4096, 16, 16, True, False, True): (1, 16, 3, 1),
+        (768, 3072, 4096, 32, 32, False, True, True): (3, 32, 1, 8),
+        (768, 3072, 4096, 32, 32, True, False, True): (2, 16, 3, 8),
+        (768, 3072, 4096, 64, 64, False, True, True): (2, 32, 3, 4),
+        (768, 3072, 4096, 64, 64, True, False, True): (2, 16, 3, 4),
+        (768, 3072, 4096, 128, 128, False, True, True): (5, 32, 1, 4),
+        (768, 3072, 4096, 128, 128, True, False, True): (4, 32, 3, 8),
+        (768, 3072, 8192, 16, 16, False, True, True): (1, 32, 1, 4),
+        (768, 3072, 8192, 16, 16, True, False, True): (4, 64, 4, 2),
+        (768, 3072, 8192, 32, 32, False, True, True): (1, 64, 1, 8),
+        (768, 3072, 8192, 32, 32, True, False, True): (2, 32, 3, 8),
+        (768, 3072, 8192, 64, 64, False, True, True): (2, 64, 3, 4),
+        (768, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4),
+        (768, 3072, 8192, 128, 128, False, True, True): (1, 64, 3, 8),
+        (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 3, 8),
+        (768, 3072, 16384, 16, 16, False, True, True): (1, 64, 1, 4),
+        (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 4, 1),
+        (768, 3072, 16384, 32, 32, False, True, True): (1, 128, 1, 8),
+        (768, 3072, 16384, 32, 32, True, False, True): (1, 64, 3, 4),
+        (768, 3072, 16384, 64, 64, False, True, True): (1, 128, 3, 4),
+        (768, 3072, 16384, 64, 64, True, False, True): (1, 64, 3, 4),
+        (768, 3072, 16384, 128, 128, False, True, True): (2, 128, 3, 8),
+        (768, 3072, 16384, 128, 128, True, False, True): (1, 128, 3, 8),
+        (768, 3072, 32768, 16, 16, False, True, True): (1, 128, 1, 4),
+        (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 4, 1),
+        (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 1, 8),
+        (768, 3072, 32768, 32, 32, True, False, True): (1, 128, 3, 4),
+        (768, 3072, 32768, 64, 64, False, True, True): (1, 256, 3, 4),
+        (768, 3072, 32768, 64, 64, True, False, True): (1, 128, 3, 4),
+        (768, 3072, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
+        (768, 3072, 32768, 128, 128, True, False, True): (5, 256, 3, 8),
+        (768, 3072, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+        (768, 3072, 50432, 16, 16, True, False, True): (4, 197, 4, 1),
+        (768, 3072, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+        (768, 3072, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+        (768, 3072, 50432, 64, 64, False, True, True): (1, 394, 3, 4),
+        (768, 3072, 50432, 64, 64, True, False, True): (1, 197, 3, 4),
+        (768, 3072, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+        (768, 3072, 50432, 128, 128, True, False, True): (3, 394, 2, 4),
+        (768, 3072, 65536, 16, 16, False, True, True): (1, 256, 1, 4),
+        (768, 3072, 65536, 16, 16, True, False, True): (5, 256, 4, 1),
+        (768, 3072, 65536, 32, 32, False, True, True): (2, 256, 1, 4),
+        (768, 3072, 65536, 32, 32, True, False, True): (3, 256, 3, 4),
+        (768, 3072, 65536, 64, 64, False, True, True): (1, 512, 3, 4),
+        (768, 3072, 65536, 64, 64, True, False, True): (1, 256, 3, 4),
+        (768, 3072, 65536, 128, 128, False, True, True): (3, 512, 1, 4),
+        (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 3, 8),
+        (768, 3072, 131072, 16, 16, False, True, True): (1, 512, 1, 4),
+        (768, 3072, 131072, 16, 16, True, False, True): (5, 512, 4, 1),
+        (768, 3072, 131072, 32, 32, False, True, True): (2, 512, 1, 4),
+        (768, 3072, 131072, 32, 32, True, False, True): (2, 512, 3, 4),
+        (768, 3072, 131072, 64, 64, False, True, True): (1, 1024, 3, 4),
+        (768, 3072, 131072, 64, 64, True, False, True): (2, 512, 3, 4),
+        (768, 3072, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
+        (768, 3072, 131072, 128, 128, True, False, True): (2, 1024, 3, 8),
         (1024, 1024, 256, 16, 16, False, True, True): (3, 4, 5, 4),
         (1024, 1024, 256, 16, 16, True, False, True): (3, 4, 5, 4),
         (1024, 1024, 256, 32, 32, False, True, True): (2, 4, 6, 2),
@@ -2718,6 +3013,14 @@
         (1024, 1024, 32768, 64, 64, True, False, True): (1, 256, 3, 4),
         (1024, 1024, 32768, 128, 128, False, True, True): (7, 256, 1, 4),
         (1024, 1024, 32768, 128, 128, True, False, True): (4, 256, 1, 4),
+        (1024, 1024, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+        (1024, 1024, 50432, 16, 16, True, False, True): (4, 197, 3, 4),
+        (1024, 1024, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+        (1024, 1024, 50432, 32, 32, True, False, True): (1, 197, 3, 4),
+        (1024, 1024, 50432, 64, 64, False, True, True): (2, 394, 1, 4),
+        (1024, 1024, 50432, 64, 64, True, False, True): (1, 197, 2, 4),
+        (1024, 1024, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+        (1024, 1024, 50432, 128, 128, True, False, True): (2, 394, 2, 4),
         (1024, 1024, 65536, 16, 16, False, True, True): (1, 256, 3, 4),
         (1024, 1024, 65536, 16, 16, True, False, True): (1, 256, 3, 1),
         (1024, 1024, 65536, 32, 32, False, True, True): (1, 512, 3, 2),
@@ -2878,6 +3181,14 @@
         (2048, 2048, 32768, 64, 64, True, False, True): (8, 256, 3, 4),
         (2048, 2048, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
         (2048, 2048, 32768, 128, 128, True, False, True): (1, 256, 1, 4),
+        (2048, 2048, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+        (2048, 2048, 50432, 16, 16, True, False, True): (4, 197, 4, 1),
+        (2048, 2048, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+        (2048, 2048, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+        (2048, 2048, 50432, 64, 64, False, True, True): (2, 394, 3, 4),
+        (2048, 2048, 50432, 64, 64, True, False, True): (4, 197, 2, 4),
+        (2048, 2048, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+        (2048, 2048, 50432, 128, 128, True, False, True): (4, 394, 2, 4),
         (2048, 2048, 65536, 16, 16, False, True, True): (9, 256, 3, 2),
         (2048, 2048, 65536, 16, 16, True, False, True): (9, 256, 4, 4),
         (2048, 2048, 65536, 32, 32, False, True, True): (7, 256, 3, 4),
@@ -2894,6 +3205,94 @@
         (2048, 2048, 131072, 64, 64, True, False, True): (2, 1024, 3, 4),
         (2048, 2048, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
         (2048, 2048, 131072, 128, 128, True, False, True): (1, 1024, 1, 4),
+        (3072, 768, 256, 16, 16, False, True, True): (6, 4, 1, 4),
+        (3072, 768, 256, 16, 16, True, False, True): (2, 1, 5, 2),
+        (3072, 768, 256, 32, 32, False, True, True): (1, 4, 1, 8),
+        (3072, 768, 256, 32, 32, True, False, True): (4, 2, 4, 4),
+        (3072, 768, 256, 64, 64, False, True, True): (1, 2, 3, 4),
+        (3072, 768, 256, 64, 64, True, False, True): (3, 4, 3, 4),
+        (3072, 768, 256, 128, 128, False, True, True): (1, 2, 3, 8),
+        (3072, 768, 256, 128, 128, True, False, True): (3, 2, 3, 8),
+        (3072, 768, 512, 16, 16, False, True, True): (1, 4, 1, 4),
+        (3072, 768, 512, 16, 16, True, False, True): (3, 4, 4, 1),
+        (3072, 768, 512, 32, 32, False, True, True): (5, 8, 1, 4),
+        (3072, 768, 512, 32, 32, True, False, True): (3, 4, 4, 2),
+        (3072, 768, 512, 64, 64, False, True, True): (1, 8, 1, 4),
+        (3072, 768, 512, 64, 64, True, False, True): (1, 4, 3, 4),
+        (3072, 768, 512, 128, 128, False, True, True): (3, 4, 3, 8),
+        (3072, 768, 512, 128, 128, True, False, True): (1, 4, 3, 8),
+        (3072, 768, 1024, 16, 16, False, True, True): (1, 8, 1, 4),
+        (3072, 768, 1024, 16, 16, True, False, True): (3, 4, 3, 1),
+        (3072, 768, 1024, 32, 32, False, True, True): (1, 16, 1, 4),
+        (3072, 768, 1024, 32, 32, True, False, True): (1, 4, 3, 8),
+        (3072, 768, 1024, 64, 64, False, True, True): (8, 16, 3, 2),
+        (3072, 768, 1024, 64, 64, True, False, True): (1, 4, 3, 4),
+        (3072, 768, 1024, 128, 128, False, True, True): (2, 8, 3, 8),
+        (3072, 768, 1024, 128, 128, True, False, True): (3, 8, 2, 4),
+        (3072, 768, 2048, 16, 16, False, True, True): (1, 8, 1, 4),
+        (3072, 768, 2048, 16, 16, True, False, True): (6, 8, 4, 4),
+        (3072, 768, 2048, 32, 32, False, True, True): (1, 16, 1, 8),
+        (3072, 768, 2048, 32, 32, True, False, True): (6, 8, 3, 4),
+        (3072, 768, 2048, 64, 64, False, True, True): (8, 16, 3, 4),
+        (3072, 768, 2048, 64, 64, True, False, True): (3, 16, 3, 4),
+        (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 3, 8),
+        (3072, 768, 2048, 128, 128, True, False, True): (2, 16, 2, 4),
+        (3072, 768, 4096, 16, 16, False, True, True): (1, 16, 1, 4),
+        (3072, 768, 4096, 16, 16, True, False, True): (4, 32, 4, 2),
+        (3072, 768, 4096, 32, 32, False, True, True): (1, 32, 1, 8),
+        (3072, 768, 4096, 32, 32, True, False, True): (4, 16, 3, 4),
+        (3072, 768, 4096, 64, 64, False, True, True): (2, 32, 1, 4),
+        (3072, 768, 4096, 64, 64, True, False, True): (2, 16, 2, 4),
+        (3072, 768, 4096, 128, 128, False, True, True): (2, 32, 1, 16),
+        (3072, 768, 4096, 128, 128, True, False, True): (3, 32, 2, 4),
+        (3072, 768, 8192, 16, 16, False, True, True): (2, 32, 1, 4),
+        (3072, 768, 8192, 16, 16, True, False, True): (4, 64, 4, 2),
+        (3072, 768, 8192, 32, 32, False, True, True): (2, 32, 1, 4),
+        (3072, 768, 8192, 32, 32, True, False, True): (6, 32, 3, 4),
+        (3072, 768, 8192, 64, 64, False, True, True): (2, 64, 1, 4),
+        (3072, 768, 8192, 64, 64, True, False, True): (2, 32, 2, 4),
+        (3072, 768, 8192, 128, 128, False, True, True): (3, 64, 1, 4),
+        (3072, 768, 8192, 128, 128, True, False, True): (2, 64, 2, 4),
+        (3072, 768, 16384, 16, 16, False, True, True): (1, 64, 1, 4),
+        (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 1, 1),
+        (3072, 768, 16384, 32, 32, False, True, True): (2, 64, 1, 4),
+        (3072, 768, 16384, 32, 32, True, False, True): (4, 64, 3, 4),
+        (3072, 768, 16384, 64, 64, False, True, True): (2, 128, 1, 4),
+        (3072, 768, 16384, 64, 64, True, False, True): (4, 64, 2, 4),
+        (3072, 768, 16384, 128, 128, False, True, True): (3, 128, 1, 4),
+        (3072, 768, 16384, 128, 128, True, False, True): (1, 128, 2, 4),
+        (3072, 768, 32768, 16, 16, False, True, True): (1, 128, 1, 4),
+        (3072, 768, 32768, 16, 16, True, False, True): (8, 256, 3, 2),
+        (3072, 768, 32768, 32, 32, False, True, True): (2, 128, 1, 4),
+        (3072, 768, 32768, 32, 32, True, False, True): (8, 128, 3, 4),
+        (3072, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4),
+        (3072, 768, 32768, 64, 64, True, False, True): (8, 128, 2, 4),
+        (3072, 768, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
+        (3072, 768, 32768, 128, 128, True, False, True): (3, 256, 2, 4),
+        (3072, 768, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+        (3072, 768, 50432, 16, 16, True, False, True): (7, 197, 4, 1),
+        (3072, 768, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+        (3072, 768, 50432, 32, 32, True, False, True): (10, 197, 3, 4),
+        (3072, 768, 50432, 64, 64, False, True, True): (1, 394, 1, 4),
+        (3072, 768, 50432, 64, 64, True, False, True): (3, 197, 2, 4),
+        (3072, 768, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+        (3072, 768, 50432, 128, 128, True, False, True): (2, 394, 2, 4),
+        (3072, 768, 65536, 16, 16, False, True, True): (1, 256, 1, 4),
+        (3072, 768, 65536, 16, 16, True, False, True): (15, 256, 4, 1),
+        (3072, 768, 65536, 32, 32, False, True, True): (2, 256, 1, 4),
+        (3072, 768, 65536, 32, 32, True, False, True): (10, 256, 3, 4),
+        (3072, 768, 65536, 64, 64, False, True, True): (1, 512, 1, 4),
+        (3072, 768, 65536, 64, 64, True, False, True): (3, 256, 2, 4),
+        (3072, 768, 65536, 128, 128, False, True, True): (3, 512, 1, 4),
+        (3072, 768, 65536, 128, 128, True, False, True): (3, 512, 2, 4),
+        (3072, 768, 131072, 16, 16, False, True, True): (1, 512, 1, 4),
+        (3072, 768, 131072, 16, 16, True, False, True): (15, 512, 4, 1),
+        (3072, 768, 131072, 32, 32, False, True, True): (2, 512, 1, 4),
+        (3072, 768, 131072, 32, 32, True, False, True): (9, 512, 3, 4),
+        (3072, 768, 131072, 64, 64, False, True, True): (1, 1024, 1, 4),
+        (3072, 768, 131072, 64, 64, True, False, True): (3, 512, 2, 4),
+        (3072, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
+        (3072, 768, 131072, 128, 128, True, False, True): (3, 1024, 2, 4),
         (3072, 3072, 256, 16, 16, False, True, True): (5, 4, 1, 4),
         (3072, 3072, 256, 16, 16, True, False, True): (1, 2, 5, 2),
         (3072, 3072, 256, 32, 32, False, True, True): (1, 4, 1, 8),
@@ -3038,6 +3437,14 @@
         (4096, 4096, 32768, 64, 64, True, False, True): (3, 256, 3, 4),
         (4096, 4096, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
         (4096, 4096, 32768, 128, 128, True, False, True): (1, 256, 1, 4),
+        (4096, 4096, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+        (4096, 4096, 50432, 16, 16, True, False, True): (4, 197, 4, 1),
+        (4096, 4096, 50432, 32, 32, False, True, True): (1, 197, 1, 4),
+        (4096, 4096, 50432, 32, 32, True, False, True): (2, 197, 3, 4),
+        (4096, 4096, 50432, 64, 64, False, True, True): (1, 394, 3, 4),
+        (4096, 4096, 50432, 64, 64, True, False, True): (1, 197, 2, 4),
+        (4096, 4096, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+        (4096, 4096, 50432, 128, 128, True, False, True): (1, 394, 2, 4),
         (4096, 4096, 65536, 16, 16, False, True, True): (5, 256, 4, 4),
         (4096, 4096, 65536, 16, 16, True, False, True): (5, 256, 4, 4),
         (4096, 4096, 65536, 32, 32, False, True, True): (4, 256, 4, 8),
@@ -3198,6 +3605,11 @@
         (8192, 8192, 32768, 64, 64, True, False, True): (2, 128, 3, 8),
         (8192, 8192, 32768, 128, 128, False, True, True): (6, 256, 1, 4),
         (8192, 8192, 32768, 128, 128, True, False, True): (4, 256, 1, 4),
+        (8192, 8192, 50432, 16, 16, False, True, True): (1, 197, 1, 1),
+        (8192, 8192, 50432, 16, 16, True, False, True): (3, 197, 4, 1),
+        (8192, 8192, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+        (8192, 8192, 50432, 32, 32, True, False, True): (2, 197, 3, 4),
+        (8192, 8192, 50432, 64, 64, False, True, True): (2, 394, 3, 4),
         (8192, 8192, 65536, 16, 16, False, True, True): (3, 256, 4, 4),
         (8192, 8192, 65536, 16, 16, True, False, True): (4, 256, 4, 4),
         (8192, 8192, 65536, 32, 32, False, True, True): (2, 256, 4, 8),
@@ -3440,6 +3852,7 @@
         (24576, 24576, 32768, 128, 128, True, False, True): (2, 256, 3, 8),
         (24576, 24576, 65536, 16, 16, False, True, True): (2, 512, 1, 2),
         (24576, 24576, 65536, 16, 16, True, False, True): (1, 256, 4, 4),
+        (32768, 32768, 256, 16, 16, False, True, True): (4, 2, 1, 2),
     },
     ("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.float16, 0.56)): {
         (192, 192, 256, 64, 64, False, True, True): (1, 4, 3, 4),
@@ -4044,10 +4457,94 @@
         (768, 768, 131072, 64, 64, True, False, True): (1, 2048, 3, 4),
         (768, 768, 131072, 128, 128, False, True, True): (1, 1024, 1, 32),
         (768, 768, 131072, 128, 128, True, False, True): (1, 1024, 1, 32),
+        (768, 3072, 256, 16, 16, False, True, True): (1, 2, 4, 4),
+        (768, 3072, 256, 16, 16, True, False, True): (1, 4, 3, 4),
+        (768, 3072, 256, 32, 32, False, True, True): (1, 4, 3, 4),
+        (768, 3072, 256, 32, 32, True, False, True): (3, 4, 3, 4),
+        (768, 3072, 256, 64, 64, False, True, True): (1, 4, 3, 8),
+        (768, 3072, 256, 64, 64, True, False, True): (1, 4, 3, 8),
+        (768, 3072, 256, 128, 128, False, True, True): (2, 2, 2, 32),
+        (768, 3072, 256, 128, 128, True, False, True): (2, 2, 1, 32),
+        (768, 3072, 512, 16, 16, False, True, True): (2, 4, 3, 4),
+        (768, 3072, 512, 16, 16, True, False, True): (1, 8, 3, 2),
+        (768, 3072, 512, 32, 32, False, True, True): (3, 8, 4, 4),
+        (768, 3072, 512, 32, 32, True, False, True): (3, 8, 3, 4),
+        (768, 3072, 512, 64, 64, False, True, True): (1, 8, 4, 8),
+        (768, 3072, 512, 64, 64, True, False, True): (1, 8, 3, 8),
+        (768, 3072, 512, 128, 128, False, True, True): (1, 4, 2, 32),
+        (768, 3072, 512, 128, 128, True, False, True): (1, 4, 1, 32),
+        (768, 3072, 1024, 16, 16, False, True, True): (4, 16, 3, 2),
+        (768, 3072, 1024, 16, 16, True, False, True): (4, 16, 3, 2),
+        (768, 3072, 1024, 32, 32, False, True, True): (4, 16, 5, 4),
+        (768, 3072, 1024, 32, 32, True, False, True): (4, 16, 5, 4),
+        (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 3, 8),
+        (768, 3072, 1024, 64, 64, True, False, True): (2, 16, 3, 8),
+        (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 32),
+        (768, 3072, 1024, 128, 128, True, False, True): (1, 8, 1, 32),
+        (768, 3072, 2048, 16, 16, False, True, True): (2, 16, 3, 4),
+        (768, 3072, 2048, 16, 16, True, False, True): (2, 16, 3, 4),
+        (768, 3072, 2048, 32, 32, False, True, True): (4, 32, 5, 4),
+        (768, 3072, 2048, 32, 32, True, False, True): (2, 32, 3, 4),
+        (768, 3072, 2048, 64, 64, False, True, True): (2, 32, 3, 8),
+        (768, 3072, 2048, 64, 64, True, False, True): (2, 32, 3, 8),
+        (768, 3072, 2048, 128, 128, False, True, True): (1, 16, 1, 32),
+        (768, 3072, 2048, 128, 128, True, False, True): (2, 16, 1, 32),
+        (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 5, 4),
+        (768, 3072, 4096, 16, 16, True, False, True): (3, 64, 3, 2),
+        (768, 3072, 4096, 32, 32, False, True, True): (5, 64, 3, 4),
+        (768, 3072, 4096, 32, 32, True, False, True): (5, 64, 3, 4),
+        (768, 3072, 4096, 64, 64, False, True, True): (1, 64, 3, 8),
+        (768, 3072, 4096, 64, 64, True, False, True): (5, 64, 3, 4),
+        (768, 3072, 4096, 128, 128, False, True, True): (1, 32, 1, 32),
+        (768, 3072, 4096, 128, 128, True, False, True): (1, 32, 1, 32),
+        (768, 3072, 8192, 16, 16, False, True, True): (1, 128, 3, 2),
+        (768, 3072, 8192, 16, 16, True, False, True): (1, 128, 3, 2),
+        (768, 3072, 8192, 32, 32, False, True, True): (1, 128, 3, 4),
+        (768, 3072, 8192, 32, 32, True, False, True): (1, 64, 3, 4),
+        (768, 3072, 8192, 64, 64, False, True, True): (3, 128, 3, 4),
+        (768, 3072, 8192, 64, 64, True, False, True): (3, 128, 3, 4),
+        (768, 3072, 8192, 128, 128, False, True, True): (4, 64, 2, 32),
+        (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 1, 32),
+        (768, 3072, 16384, 16, 16, False, True, True): (1, 256, 2, 2),
+        (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 3, 4),
+        (768, 3072, 16384, 32, 32, False, True, True): (8, 128, 3, 4),
+        (768, 3072, 16384, 32, 32, True, False, True): (1, 128, 3, 4),
+        (768, 3072, 16384, 64, 64, False, True, True): (1, 256, 3, 4),
+        (768, 3072, 16384, 64, 64, True, False, True): (3, 256, 3, 4),
+        (768, 3072, 16384, 128, 128, False, True, True): (3, 128, 1, 32),
+        (768, 3072, 16384, 128, 128, True, False, True): (2, 128, 2, 32),
+        (768, 3072, 32768, 16, 16, False, True, True): (1, 512, 3, 1),
+        (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 3, 4),
+        (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 3, 4),
+        (768, 3072, 32768, 32, 32, True, False, True): (1, 256, 3, 4),
+        (768, 3072, 32768, 64, 64, False, True, True): (2, 512, 3, 4),
+        (768, 3072, 32768, 64, 64, True, False, True): (1, 512, 3, 4),
+        (768, 3072, 32768, 128, 128, False, True, True): (1, 256, 1, 32),
+        (768, 3072, 32768, 128, 128, True, False, True): (2, 256, 2, 32),
         (768, 3072, 50432, 16, 16, False, True, True): (1, 197, 3, 4),
+        (768, 3072, 50432, 16, 16, True, False, True): (1, 197, 3, 4),
         (768, 3072, 50432, 32, 32, False, True, True): (1, 788, 2, 4),
+        (768, 3072, 50432, 32, 32, True, False, True): (1, 394, 3, 4),
         (768, 3072, 50432, 64, 64, False, True, True): (1, 788, 3, 4),
+        (768, 3072, 50432, 64, 64, True, False, True): (2, 788, 3, 4),
         (768, 3072, 50432, 128, 128, False, True, True): (1, 394, 1, 32),
+        (768, 3072, 50432, 128, 128, True, False, True): (2, 394, 2, 32),
+        (768, 3072, 65536, 16, 16, False, True, True): (1, 1024, 3, 1),
+        (768, 3072, 65536, 16, 16, True, False, True): (1, 256, 3, 4),
+        (768, 3072, 65536, 32, 32, False, True, True): (1, 512, 3, 4),
+        (768, 3072, 65536, 32, 32, True, False, True): (1, 512, 3, 4),
+        (768, 3072, 65536, 64, 64, False, True, True): (2, 1024, 3, 4),
+        (768, 3072, 65536, 64, 64, True, False, True): (5, 1024, 3, 4),
+        (768, 3072, 65536, 128, 128, False, True, True): (1, 512, 1, 32),
+        (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 2, 32),
+        (768, 3072, 131072, 16, 16, False, True, True): (1, 2048, 3, 1),
+        (768, 3072, 131072, 16, 16, True, False, True): (1, 512, 3, 4),
+        (768, 3072, 131072, 32, 32, False, True, True): (1, 1024, 3, 4),
+        (768, 3072, 131072, 32, 32, True, False, True): (1, 1024, 3, 4),
+        (768, 3072, 131072, 64, 64, False, True, True): (1, 2048, 3, 4),
+        (768, 3072, 131072, 64, 64, True, False, True): (2, 2048, 3, 4),
+        (768, 3072, 131072, 128, 128, False, True, True): (1, 1024, 1, 32),
+        (768, 3072, 131072, 128, 128, True, False, True): (1, 1024, 2, 32),
         (1024, 1024, 256, 16, 16, False, True, True): (4, 8, 3, 2),
         (1024, 1024, 256, 16, 16, True, False, True): (2, 8, 3, 2),
         (1024, 1024, 256, 32, 32, False, True, True): (1, 8, 3, 4),
@@ -4288,6 +4785,94 @@
         (2048, 2048, 131072, 64, 64, True, False, True): (1, 2048, 2, 4),
         (2048, 2048, 131072, 128, 128, False, True, True): (1, 1024, 1, 32),
         (2048, 2048, 131072, 128, 128, True, False, True): (4, 1024, 1, 32),
+        (3072, 768, 256, 16, 16, False, True, True): (4, 4, 3, 2),
+        (3072, 768, 256, 16, 16, True, False, True): (1, 2, 6, 4),
+        (3072, 768, 256, 32, 32, False, True, True): (1, 4, 6, 4),
+        (3072, 768, 256, 32, 32, True, False, True): (5, 4, 3, 4),
+        (3072, 768, 256, 64, 64, False, True, True): (4, 4, 3, 8),
+        (3072, 768, 256, 64, 64, True, False, True): (4, 4, 3, 8),
+        (3072, 768, 256, 128, 128, False, True, True): (1, 2, 1, 32),
+        (3072, 768, 256, 128, 128, True, False, True): (5, 2, 1, 32),
+        (3072, 768, 512, 16, 16, False, True, True): (4, 4, 3, 4),
+        (3072, 768, 512, 16, 16, True, False, True): (1, 4, 3, 4),
+        (3072, 768, 512, 32, 32, False, True, True): (3, 8, 3, 4),
+        (3072, 768, 512, 32, 32, True, False, True): (3, 8, 3, 4),
+        (3072, 768, 512, 64, 64, False, True, True): (2, 8, 3, 8),
+        (3072, 768, 512, 64, 64, True, False, True): (2, 8, 3, 8),
+        (3072, 768, 512, 128, 128, False, True, True): (1, 4, 2, 32),
+        (3072, 768, 512, 128, 128, True, False, True): (1, 4, 1, 32),
+        (3072, 768, 1024, 16, 16, False, True, True): (1, 16, 3, 2),
+        (3072, 768, 1024, 16, 16, True, False, True): (3, 16, 3, 2),
+        (3072, 768, 1024, 32, 32, False, True, True): (1, 16, 3, 4),
+        (3072, 768, 1024, 32, 32, True, False, True): (3, 16, 3, 4),
+        (3072, 768, 1024, 64, 64, False, True, True): (4, 16, 3, 8),
+        (3072, 768, 1024, 64, 64, True, False, True): (4, 16, 3, 4),
+        (3072, 768, 1024, 128, 128, False, True, True): (5, 8, 1, 32),
+        (3072, 768, 1024, 128, 128, True, False, True): (5, 8, 1, 32),
+        (3072, 768, 2048, 16, 16, False, True, True): (4, 32, 3, 2),
+        (3072, 768, 2048, 16, 16, True, False, True): (1, 32, 3, 2),
+        (3072, 768, 2048, 32, 32, False, True, True): (1, 32, 3, 4),
+        (3072, 768, 2048, 32, 32, True, False, True): (1, 32, 2, 4),
+        (3072, 768, 2048, 64, 64, False, True, True): (2, 32, 3, 4),
+        (3072, 768, 2048, 64, 64, True, False, True): (4, 32, 3, 4),
+        (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 1, 32),
+        (3072, 768, 2048, 128, 128, True, False, True): (1, 16, 1, 32),
+        (3072, 768, 4096, 16, 16, False, True, True): (3, 64, 3, 2),
+        (3072, 768, 4096, 16, 16, True, False, True): (1, 64, 3, 2),
+        (3072, 768, 4096, 32, 32, False, True, True): (1, 64, 3, 4),
+        (3072, 768, 4096, 32, 32, True, False, True): (1, 32, 3, 4),
+        (3072, 768, 4096, 64, 64, False, True, True): (2, 64, 3, 4),
+        (3072, 768, 4096, 64, 64, True, False, True): (2, 64, 3, 4),
+        (3072, 768, 4096, 128, 128, False, True, True): (1, 32, 1, 32),
+        (3072, 768, 4096, 128, 128, True, False, True): (1, 32, 1, 32),
+        (3072, 768, 8192, 16, 16, False, True, True): (4, 128, 3, 1),
+        (3072, 768, 8192, 16, 16, True, False, True): (1, 32, 3, 4),
+        (3072, 768, 8192, 32, 32, False, True, True): (1, 64, 3, 4),
+        (3072, 768, 8192, 32, 32, True, False, True): (1, 64, 3, 4),
+        (3072, 768, 8192, 64, 64, False, True, True): (2, 128, 3, 4),
+        (3072, 768, 8192, 64, 64, True, False, True): (2, 128, 3, 4),
+        (3072, 768, 8192, 128, 128, False, True, True): (1, 64, 1, 32),
+        (3072, 768, 8192, 128, 128, True, False, True): (1, 64, 1, 32),
+        (3072, 768, 16384, 16, 16, False, True, True): (4, 256, 3, 1),
+        (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 3, 4),
+        (3072, 768, 16384, 32, 32, False, True, True): (1, 128, 3, 4),
+        (3072, 768, 16384, 32, 32, True, False, True): (1, 128, 3, 4),
+        (3072, 768, 16384, 64, 64, False, True, True): (2, 256, 3, 4),
+        (3072, 768, 16384, 64, 64, True, False, True): (2, 256, 3, 4),
+        (3072, 768, 16384, 128, 128, False, True, True): (1, 128, 1, 32),
+        (3072, 768, 16384, 128, 128, True, False, True): (1, 128, 1, 32),
+        (3072, 768, 32768, 16, 16, False, True, True): (4, 512, 3, 1),
+        (3072, 768, 32768, 16, 16, True, False, True): (1, 128, 3, 4),
+        (3072, 768, 32768, 32, 32, False, True, True): (1, 256, 3, 4),
+        (3072, 768, 32768, 32, 32, True, False, True): (1, 256, 3, 4),
+        (3072, 768, 32768, 64, 64, False, True, True): (2, 512, 3, 4),
+        (3072, 768, 32768, 64, 64, True, False, True): (2, 512, 3, 4),
+        (3072, 768, 32768, 128, 128, False, True, True): (1, 256, 1, 32),
+        (3072, 768, 32768, 128, 128, True, False, True): (1, 256, 1, 32),
+        (3072, 768, 50432, 16, 16, False, True, True): (4, 788, 3, 1),
+        (3072, 768, 50432, 16, 16, True, False, True): (1, 197, 3, 4),
+        (3072, 768, 50432, 32, 32, False, True, True): (1, 394, 3, 4),
+        (3072, 768, 50432, 32, 32, True, False, True): (1, 394, 3, 4),
+        (3072, 768, 50432, 64, 64, False, True, True): (1, 788, 3, 4),
+        (3072, 768, 50432, 64, 64, True, False, True): (2, 788, 3, 4),
+        (3072, 768, 50432, 128, 128, False, True, True): (1, 394, 1, 32),
+        (3072, 768, 50432, 128, 128, True, False, True): (1, 394, 1, 32),
+        (3072, 768, 65536, 16, 16, False, True, True): (4, 1024, 3, 1),
+        (3072, 768, 65536, 16, 16, True, False, True): (1, 256, 3, 4),
+        (3072, 768, 65536, 32, 32, False, True, True): (1, 512, 3, 4),
+        (3072, 768, 65536, 32, 32, True, False, True): (1, 512, 3, 4),
+        (3072, 768, 65536, 64, 64, False, True, True): (2, 1024, 3, 4),
+        (3072, 768, 65536, 64, 64, True, False, True): (2, 1024, 3, 4),
+        (3072, 768, 65536, 128, 128, False, True, True): (1, 512, 1, 32),
+        (3072, 768, 65536, 128, 128, True, False, True): (1, 512, 1, 32),
+        (3072, 768, 131072, 16, 16, False, True, True): (4, 2048, 3, 1),
+        (3072, 768, 131072, 16, 16, True, False, True): (1, 512, 3, 4),
+        (3072, 768, 131072, 32, 32, False, True, True): (1, 1024, 3, 4),
+        (3072, 768, 131072, 32, 32, True, False, True): (1, 1024, 3, 4),
+        (3072, 768, 131072, 64, 64, False, True, True): (2, 2048, 3, 4),
+        (3072, 768, 131072, 64, 64, True, False, True): (2, 2048, 3, 4),
+        (3072, 768, 131072, 128, 128, False, True, True): (1, 1024, 1, 32),
+        (3072, 768, 131072, 128, 128, True, False, True): (1, 1024, 1, 32),
         (3072, 3072, 256, 16, 16, False, True, True): (1, 4, 5, 2),
         (3072, 3072, 256, 16, 16, True, False, True): (1, 4, 3, 2),
         (3072, 3072, 256, 32, 32, False, True, True): (1, 4, 4, 4),