Add tests to bsr_dense_addmm_meta. Tune bsr_dense_addmm kernel for ViT shapes. (#132646)
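This change:
- adds test_triton_bsr_dense_addmm_meta, which checks the warn-once behavior and the approximate-parameter lookup of bsr_dense_addmm_meta;
- adds a `_version` argument to bsr_dense_addmm_meta so that tests can bypass existing tuning results;
- makes the approximate lookup rescale SPLIT_N so that the tuned N // SPLIT_N ratio is preserved, and warns once when falling back to default (non-optimal) kernel parameters;
- adds tuned bsr_dense_addmm parameters for the ViT shapes (768, 3072), (3072, 768) and N=50432.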
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132646
Approved by: https://github.com/cpuhrsch
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index f61136b..14e8a85 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -2,10 +2,12 @@
import torch
import random
+import io
import itertools
import unittest
import functools
-from torch.testing import make_tensor
+from contextlib import redirect_stderr
+from torch.testing import make_tensor, FileCheck
from torch.testing._internal.common_cuda import SM53OrLater, SM80OrLater, TEST_CUSPARSE_GENERIC
from torch.testing._internal.common_utils import \
(TEST_WITH_TORCHINDUCTOR, TEST_WITH_ROCM, TEST_SCIPY, TEST_NUMPY, TEST_MKL, IS_WINDOWS, TestCase, run_tests,
@@ -4134,6 +4136,76 @@
result = operation(*args, **dict(meta=meta))
self.assertEqual(result, expected)
+ @onlyCUDA
+ @skipIfRocm
+ @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "Test requires Triton")
+ def test_triton_bsr_dense_addmm_meta(self, device):
+ from torch.sparse._triton_ops import bsr_dense_addmm_meta
+ from torch.sparse._triton_ops_meta import update as update_bsr_dense_addmm_meta
+
+ dtype = torch.float32
+ Ms = Ks = 16
+ beta = 0.0
+ alpha = 1.0
+
+ def get_meta(M, K, N, sparsity=None):
+ return bsr_dense_addmm_meta(M, K, N, Ms, Ks, beta, alpha, dtype=dtype, sparsity=sparsity,
+ _version="test_triton_bsr_dense_addmm_meta")
+
+ def update_meta(M, K, N, value, sparsity=0.5):
+ key = (M, K, N, Ms, Ks, beta == 0, beta == 1, alpha == 1)
+ update_bsr_dense_addmm_meta("bsr_dense_addmm", torch.cuda.get_device_name(),
+ ("test_triton_bsr_dense_addmm_meta", dtype, sparsity),
+ key, value)
+
+ def get_meta_with_checks(M, K, N, warn_count=0, sparsity=None):
+ f = io.StringIO()
+ with redirect_stderr(f):
+ result = get_meta(M, K, N, sparsity=sparsity)
+ msg = f.getvalue()
+ FileCheck().check_count(
+ str=f"UserWarning: bsr_dense_addmm uses non-optimal triton kernel parameters for M={M} K={K} N={N}",
+ count=warn_count, exactly=True
+ ).run(msg)
+ return result
+
+ # Test warn_once when requesting non-existing tuned parameters multiple times
+ f = io.StringIO()
+ with redirect_stderr(f):
+ for i in range(5):
+ get_meta(16, 16, 16)
+ for i in range(5):
+ get_meta(16, 16, 32)
+
+ msg = f.getvalue()
+ FileCheck().check_count(
+ str="UserWarning: bsr_dense_addmm uses non-optimal triton kernel parameters for M=16 K=16 N=16", count=1, exactly=True
+ ).run(msg)
+ FileCheck().check_count(
+ str="UserWarning: bsr_dense_addmm uses non-optimal triton kernel parameters for M=16 K=16 N=32", count=1, exactly=True
+ ).run(msg)
+
+ # Test warn_once when tuned parameters are missing
+ default_meta = dict(GROUP_SIZE_ROW=4, SPLIT_N=2, num_stages=1, num_warps=4)
+ self.assertEqual(get_meta_with_checks(32, 32, 32, warn_count=1), default_meta)
+
+ # Test (no)warn_once when tuned parameters are available
+ update_meta(32, 32, 48, (2, 8, 5, 6))
+ expected_meta = dict(GROUP_SIZE_ROW=2, SPLIT_N=8, num_stages=5, num_warps=6)
+ self.assertEqual(get_meta_with_checks(32, 32, 48, warn_count=0), expected_meta)
+
+ # Test non-existing tuned parameters with non-default sparsity
+ # while for default sparsity 0.5 the parameters are available
+ self.assertEqual(get_meta_with_checks(32, 32, 48, warn_count=0, sparsity=0.6), expected_meta)
+
+ # Test non-existing tuned parameters while there exist
+ # parameters with a consistent N // SPLIT_N ratio:
+ self.assertEqual(get_meta_with_checks(32, 32, 72, warn_count=0),
+ dict(GROUP_SIZE_ROW=2, SPLIT_N=12, num_stages=5, num_warps=6))
+ # ... or not:
+ self.assertEqual(get_meta_with_checks(32, 32, 64, warn_count=1),
+ dict(GROUP_SIZE_ROW=4, SPLIT_N=4, num_stages=1, num_warps=4))
+
# e.g., TestSparseCSRCPU and TestSparseCSRCUDA
instantiate_device_type_tests(TestSparseCSR, globals())
diff --git a/torch/sparse/_triton_ops.py b/torch/sparse/_triton_ops.py
index ccd29b9..4f7f0dc 100644
--- a/torch/sparse/_triton_ops.py
+++ b/torch/sparse/_triton_ops.py
@@ -7,6 +7,7 @@
from typing import Optional, Tuple
import torch
+from torch._dynamo.utils import warn_once
from torch.utils._triton import has_triton
from ._triton_ops_meta import get_meta
@@ -748,8 +749,12 @@
num_stages=None,
sparsity=None,
dtype=None,
+ _version=0,
**extra,
):
+ # Specifying _version is useful for situations when one wants to
+ # discard existing triton kernel tuning results, say, in testing
+ # bsr_dense_addmm_meta functionality.
if dtype is None:
dtype = torch.float16
if sparsity is None:
@@ -758,27 +763,39 @@
device_name = torch.cuda.get_device_name()
key = (M, K, N, Ms, Ks, beta == 0, beta == 1, alpha == 1)
meta = get_meta(
- "bsr_dense_addmm", key, device_name, version=(0, dtype, sparsity)
+ "bsr_dense_addmm", key, device_name, version=(_version, dtype, sparsity)
)
if meta is None and sparsity != 0.5:
meta = get_meta(
- "bsr_dense_addmm", key, device_name, version=(0, dtype, 0.5)
+ "bsr_dense_addmm", key, device_name, version=(_version, dtype, 0.5)
)
- if meta is None:
- # find approximate meta such that N % SPLIT_N == 0.
- matching_meta = get_meta(
- "bsr_dense_addmm",
- (*key[:2], "*", *key[3:]),
- device_name,
- version=(0, dtype, 0.5),
- )
- for mkey in sorted(matching_meta or {}):
- meta_ = matching_meta[mkey]
- if N % meta_["SPLIT_N"] == 0 and mkey[2] <= N:
- meta = meta_
+ if meta is None:
+ # find approximate meta whose n // SPLIT_N divides N and rescale SPLIT_N to N // (n // SPLIT_N).
+ matching_meta = get_meta(
+ "bsr_dense_addmm",
+ (*key[:2], "*", *key[3:]),
+ device_name,
+ version=(_version, dtype, 0.5),
+ )
+ for mkey in sorted(matching_meta or {}):
+ meta_ = matching_meta[mkey]
+ n = mkey[2]
+ split_n = meta_["SPLIT_N"]
+ c = n // split_n
+ if N % c == 0 and n <= N:
+ meta = dict(meta_)
+ meta["SPLIT_N"] = N // c
if meta is not None:
meta.update(**extra)
return meta
+ else:
+ # see [Computing optimal kernel parameters] in
+ # _triton_ops_meta.py for ways to avoid this warning
+ # message
+ warn_once(
+ f"bsr_dense_addmm uses non-optimal triton kernel parameters for {M=} {K=} {N=} {Ms=}, {Ks=} {beta=} {alpha=}"
+ )
+
SPLIT_N = SPLIT_N or max(N // Ms, 1)
GROUP_SIZE_ROW = GROUP_SIZE_ROW or 4
num_stages = num_stages or 1
diff --git a/torch/sparse/_triton_ops_meta.py b/torch/sparse/_triton_ops_meta.py
index ef61a1c..8672e7e 100644
--- a/torch/sparse/_triton_ops_meta.py
+++ b/torch/sparse/_triton_ops_meta.py
@@ -716,9 +716,22 @@
def main(op="scatter_mm", force=False, dtype=torch.float16, verbose=True):
import itertools
- sizes_lst = [256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
+ sizes_lst = [
+ 256,
+ 512,
+ 1024,
+ 2048,
+ 4096,
+ 8192,
+ 16384,
+ 32768,
+ 65536,
+ 131072,
+ 50432,
+ ]
sizes3_lst = [3 * sz for sz in [64, 128] + sizes_lst if sz <= 2048]
- shapes_lst = [(sz, sz) for sz in sizes_lst[:-3] + sizes3_lst]
+ shapes_lst = [(sz, sz) for sz in sizes_lst[:-4] + sizes3_lst]
+ shapes_lst.extend([(3072, 768), (768, 3072)])
blocksize_lst = [(16, 16), (32, 32), (64, 64), (128, 128)]
sparsity_lst = [0.5, 0.7, 0.3][:1]
for sparsity in sparsity_lst:
@@ -734,6 +747,8 @@
M, K, N, BM, BK, force=force, sparsity=sparsity, dtype=dtype
)
elif op == "bsr_dense_addmm":
+ if M == K and N == 50432:
+ continue
print(f"{M, K, N, (BM, BK)=}")
for alpha, beta in [(1, 1), (1, 0)]:
optimize_bsr_dense_addmm(
@@ -1414,6 +1429,94 @@
(768, 768, 131072, 64, 64, True, False, True): (2, 512, 3, 4),
(768, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
(768, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4),
+ (768, 3072, 256, 16, 16, False, True, True): (3, 8, 6, 1),
+ (768, 3072, 256, 16, 16, True, False, True): (1, 4, 6, 2),
+ (768, 3072, 256, 32, 32, False, True, True): (1, 8, 4, 4),
+ (768, 3072, 256, 32, 32, True, False, True): (3, 4, 6, 4),
+ (768, 3072, 256, 64, 64, False, True, True): (2, 4, 3, 4),
+ (768, 3072, 256, 64, 64, True, False, True): (1, 4, 4, 4),
+ (768, 3072, 256, 128, 128, False, True, True): (2, 2, 3, 8),
+ (768, 3072, 256, 128, 128, True, False, True): (1, 2, 3, 8),
+ (768, 3072, 512, 16, 16, False, True, True): (1, 8, 4, 2),
+ (768, 3072, 512, 16, 16, True, False, True): (1, 8, 5, 2),
+ (768, 3072, 512, 32, 32, False, True, True): (1, 16, 3, 2),
+ (768, 3072, 512, 32, 32, True, False, True): (1, 8, 5, 2),
+ (768, 3072, 512, 64, 64, False, True, True): (1, 8, 3, 4),
+ (768, 3072, 512, 64, 64, True, False, True): (3, 8, 4, 4),
+ (768, 3072, 512, 128, 128, False, True, True): (1, 4, 3, 8),
+ (768, 3072, 512, 128, 128, True, False, True): (2, 4, 3, 8),
+ (768, 3072, 1024, 16, 16, False, True, True): (1, 16, 1, 4),
+ (768, 3072, 1024, 16, 16, True, False, True): (5, 4, 4, 4),
+ (768, 3072, 1024, 32, 32, False, True, True): (3, 8, 3, 4),
+ (768, 3072, 1024, 32, 32, True, False, True): (1, 8, 4, 4),
+ (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 3, 4),
+ (768, 3072, 1024, 64, 64, True, False, True): (2, 16, 4, 4),
+ (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 3, 8),
+ (768, 3072, 1024, 128, 128, True, False, True): (5, 8, 3, 8),
+ (768, 3072, 2048, 16, 16, False, True, True): (3, 16, 1, 2),
+ (768, 3072, 2048, 16, 16, True, False, True): (1, 8, 3, 4),
+ (768, 3072, 2048, 32, 32, False, True, True): (4, 16, 1, 8),
+ (768, 3072, 2048, 32, 32, True, False, True): (3, 8, 3, 4),
+ (768, 3072, 2048, 64, 64, False, True, True): (2, 16, 3, 4),
+ (768, 3072, 2048, 64, 64, True, False, True): (2, 16, 3, 4),
+ (768, 3072, 2048, 128, 128, False, True, True): (3, 16, 3, 8),
+ (768, 3072, 2048, 128, 128, True, False, True): (4, 16, 3, 8),
+ (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 1, 4),
+ (768, 3072, 4096, 16, 16, True, False, True): (1, 16, 3, 1),
+ (768, 3072, 4096, 32, 32, False, True, True): (3, 32, 1, 8),
+ (768, 3072, 4096, 32, 32, True, False, True): (3, 16, 4, 4),
+ (768, 3072, 4096, 64, 64, False, True, True): (2, 32, 3, 4),
+ (768, 3072, 4096, 64, 64, True, False, True): (2, 16, 3, 4),
+ (768, 3072, 4096, 128, 128, False, True, True): (5, 32, 1, 4),
+ (768, 3072, 4096, 128, 128, True, False, True): (9, 32, 3, 8),
+ (768, 3072, 8192, 16, 16, False, True, True): (1, 32, 1, 4),
+ (768, 3072, 8192, 16, 16, True, False, True): (4, 64, 4, 2),
+ (768, 3072, 8192, 32, 32, False, True, True): (1, 64, 1, 8),
+ (768, 3072, 8192, 32, 32, True, False, True): (2, 64, 4, 2),
+ (768, 3072, 8192, 64, 64, False, True, True): (1, 64, 3, 4),
+ (768, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4),
+ (768, 3072, 8192, 128, 128, False, True, True): (2, 64, 3, 8),
+ (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 3, 8),
+ (768, 3072, 16384, 16, 16, False, True, True): (1, 64, 1, 4),
+ (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 4, 1),
+ (768, 3072, 16384, 32, 32, False, True, True): (1, 128, 1, 8),
+ (768, 3072, 16384, 32, 32, True, False, True): (1, 64, 3, 4),
+ (768, 3072, 16384, 64, 64, False, True, True): (1, 128, 3, 4),
+ (768, 3072, 16384, 64, 64, True, False, True): (4, 64, 3, 4),
+ (768, 3072, 16384, 128, 128, False, True, True): (2, 128, 3, 8),
+ (768, 3072, 16384, 128, 128, True, False, True): (2, 128, 3, 8),
+ (768, 3072, 32768, 16, 16, False, True, True): (1, 128, 1, 4),
+ (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 4, 1),
+ (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 1, 8),
+ (768, 3072, 32768, 32, 32, True, False, True): (1, 128, 3, 4),
+ (768, 3072, 32768, 64, 64, False, True, True): (1, 256, 3, 4),
+ (768, 3072, 32768, 64, 64, True, False, True): (1, 128, 3, 4),
+ (768, 3072, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
+ (768, 3072, 32768, 128, 128, True, False, True): (2, 256, 3, 8),
+ (768, 3072, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+ (768, 3072, 50432, 16, 16, True, False, True): (4, 197, 4, 4),
+ (768, 3072, 50432, 32, 32, False, True, True): (1, 197, 1, 4),
+ (768, 3072, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+ (768, 3072, 50432, 64, 64, False, True, True): (1, 394, 3, 4),
+ (768, 3072, 50432, 64, 64, True, False, True): (3, 197, 3, 4),
+ (768, 3072, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+ (768, 3072, 50432, 128, 128, True, False, True): (1, 394, 3, 8),
+ (768, 3072, 65536, 16, 16, False, True, True): (1, 256, 1, 4),
+ (768, 3072, 65536, 16, 16, True, False, True): (5, 256, 4, 1),
+ (768, 3072, 65536, 32, 32, False, True, True): (1, 256, 1, 4),
+ (768, 3072, 65536, 32, 32, True, False, True): (3, 256, 3, 4),
+ (768, 3072, 65536, 64, 64, False, True, True): (2, 512, 3, 4),
+ (768, 3072, 65536, 64, 64, True, False, True): (3, 256, 3, 4),
+ (768, 3072, 65536, 128, 128, False, True, True): (3, 512, 1, 4),
+ (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 3, 8),
+ (768, 3072, 131072, 16, 16, False, True, True): (1, 512, 1, 4),
+ (768, 3072, 131072, 16, 16, True, False, True): (5, 512, 4, 1),
+ (768, 3072, 131072, 32, 32, False, True, True): (1, 512, 1, 4),
+ (768, 3072, 131072, 32, 32, True, False, True): (4, 512, 3, 4),
+ (768, 3072, 131072, 64, 64, False, True, True): (1, 1024, 3, 4),
+ (768, 3072, 131072, 64, 64, True, False, True): (1, 512, 3, 4),
+ (768, 3072, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
+ (768, 3072, 131072, 128, 128, True, False, True): (1, 1024, 3, 8),
(1024, 1024, 256, 16, 16, False, True, True): (1, 4, 5, 4),
(1024, 1024, 256, 16, 16, True, False, True): (3, 4, 4, 4),
(1024, 1024, 256, 32, 32, False, True, True): (4, 4, 5, 2),
@@ -1654,6 +1757,94 @@
(2048, 2048, 131072, 64, 64, True, False, True): (4, 1024, 3, 4),
(2048, 2048, 131072, 128, 128, False, True, True): (1, 1024, 1, 4),
(2048, 2048, 131072, 128, 128, True, False, True): (2, 1024, 1, 4),
+ (3072, 768, 256, 16, 16, False, True, True): (6, 4, 1, 4),
+ (3072, 768, 256, 16, 16, True, False, True): (3, 1, 4, 4),
+ (3072, 768, 256, 32, 32, False, True, True): (6, 8, 1, 2),
+ (3072, 768, 256, 32, 32, True, False, True): (1, 2, 4, 4),
+ (3072, 768, 256, 64, 64, False, True, True): (1, 4, 4, 4),
+ (3072, 768, 256, 64, 64, True, False, True): (4, 2, 4, 4),
+ (3072, 768, 256, 128, 128, False, True, True): (1, 2, 3, 8),
+ (3072, 768, 256, 128, 128, True, False, True): (1, 2, 3, 8),
+ (3072, 768, 512, 16, 16, False, True, True): (2, 4, 1, 4),
+ (3072, 768, 512, 16, 16, True, False, True): (1, 4, 4, 1),
+ (3072, 768, 512, 32, 32, False, True, True): (3, 8, 1, 4),
+ (3072, 768, 512, 32, 32, True, False, True): (1, 2, 3, 4),
+ (3072, 768, 512, 64, 64, False, True, True): (1, 8, 1, 4),
+ (3072, 768, 512, 64, 64, True, False, True): (4, 4, 3, 4),
+ (3072, 768, 512, 128, 128, False, True, True): (1, 4, 3, 8),
+ (3072, 768, 512, 128, 128, True, False, True): (1, 4, 3, 8),
+ (3072, 768, 1024, 16, 16, False, True, True): (1, 8, 1, 4),
+ (3072, 768, 1024, 16, 16, True, False, True): (3, 4, 3, 1),
+ (3072, 768, 1024, 32, 32, False, True, True): (1, 8, 1, 8),
+ (3072, 768, 1024, 32, 32, True, False, True): (1, 4, 4, 4),
+ (3072, 768, 1024, 64, 64, False, True, True): (1, 16, 3, 4),
+ (3072, 768, 1024, 64, 64, True, False, True): (1, 4, 3, 4),
+ (3072, 768, 1024, 128, 128, False, True, True): (1, 8, 3, 8),
+ (3072, 768, 1024, 128, 128, True, False, True): (2, 8, 3, 8),
+ (3072, 768, 2048, 16, 16, False, True, True): (3, 8, 1, 4),
+ (3072, 768, 2048, 16, 16, True, False, True): (2, 8, 3, 4),
+ (3072, 768, 2048, 32, 32, False, True, True): (3, 16, 1, 8),
+ (3072, 768, 2048, 32, 32, True, False, True): (3, 8, 3, 4),
+ (3072, 768, 2048, 64, 64, False, True, True): (1, 16, 1, 4),
+ (3072, 768, 2048, 64, 64, True, False, True): (1, 16, 3, 4),
+ (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 3, 8),
+ (3072, 768, 2048, 128, 128, True, False, True): (2, 16, 2, 4),
+ (3072, 768, 4096, 16, 16, False, True, True): (1, 16, 1, 4),
+ (3072, 768, 4096, 16, 16, True, False, True): (4, 32, 4, 2),
+ (3072, 768, 4096, 32, 32, False, True, True): (2, 32, 1, 8),
+ (3072, 768, 4096, 32, 32, True, False, True): (7, 16, 3, 4),
+ (3072, 768, 4096, 64, 64, False, True, True): (2, 32, 1, 4),
+ (3072, 768, 4096, 64, 64, True, False, True): (2, 16, 2, 4),
+ (3072, 768, 4096, 128, 128, False, True, True): (1, 32, 3, 8),
+ (3072, 768, 4096, 128, 128, True, False, True): (3, 32, 2, 4),
+ (3072, 768, 8192, 16, 16, False, True, True): (2, 32, 1, 4),
+ (3072, 768, 8192, 16, 16, True, False, True): (4, 64, 4, 2),
+ (3072, 768, 8192, 32, 32, False, True, True): (4, 32, 1, 4),
+ (3072, 768, 8192, 32, 32, True, False, True): (4, 32, 3, 4),
+ (3072, 768, 8192, 64, 64, False, True, True): (2, 64, 1, 4),
+ (3072, 768, 8192, 64, 64, True, False, True): (4, 32, 2, 4),
+ (3072, 768, 8192, 128, 128, False, True, True): (3, 64, 1, 4),
+ (3072, 768, 8192, 128, 128, True, False, True): (6, 64, 2, 4),
+ (3072, 768, 16384, 16, 16, False, True, True): (1, 64, 1, 4),
+ (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 1, 1),
+ (3072, 768, 16384, 32, 32, False, True, True): (1, 64, 1, 4),
+ (3072, 768, 16384, 32, 32, True, False, True): (4, 64, 3, 4),
+ (3072, 768, 16384, 64, 64, False, True, True): (4, 128, 1, 4),
+ (3072, 768, 16384, 64, 64, True, False, True): (4, 64, 2, 4),
+ (3072, 768, 16384, 128, 128, False, True, True): (3, 128, 1, 4),
+ (3072, 768, 16384, 128, 128, True, False, True): (4, 128, 2, 4),
+ (3072, 768, 32768, 16, 16, False, True, True): (1, 128, 1, 4),
+ (3072, 768, 32768, 16, 16, True, False, True): (8, 128, 4, 1),
+ (3072, 768, 32768, 32, 32, False, True, True): (1, 128, 1, 4),
+ (3072, 768, 32768, 32, 32, True, False, True): (8, 128, 3, 4),
+ (3072, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4),
+ (3072, 768, 32768, 64, 64, True, False, True): (1, 128, 2, 4),
+ (3072, 768, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
+ (3072, 768, 32768, 128, 128, True, False, True): (8, 256, 2, 4),
+ (3072, 768, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+ (3072, 768, 50432, 16, 16, True, False, True): (7, 197, 4, 1),
+ (3072, 768, 50432, 32, 32, False, True, True): (1, 197, 1, 4),
+ (3072, 768, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+ (3072, 768, 50432, 64, 64, False, True, True): (1, 394, 1, 4),
+ (3072, 768, 50432, 64, 64, True, False, True): (3, 197, 2, 4),
+ (3072, 768, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+ (3072, 768, 50432, 128, 128, True, False, True): (8, 394, 2, 4),
+ (3072, 768, 65536, 16, 16, False, True, True): (1, 256, 1, 4),
+ (3072, 768, 65536, 16, 16, True, False, True): (15, 256, 4, 1),
+ (3072, 768, 65536, 32, 32, False, True, True): (1, 256, 1, 4),
+ (3072, 768, 65536, 32, 32, True, False, True): (15, 256, 3, 4),
+ (3072, 768, 65536, 64, 64, False, True, True): (1, 512, 1, 4),
+ (3072, 768, 65536, 64, 64, True, False, True): (2, 256, 2, 4),
+ (3072, 768, 65536, 128, 128, False, True, True): (3, 512, 1, 4),
+ (3072, 768, 65536, 128, 128, True, False, True): (3, 512, 2, 4),
+ (3072, 768, 131072, 16, 16, False, True, True): (1, 512, 1, 4),
+ (3072, 768, 131072, 16, 16, True, False, True): (15, 512, 4, 1),
+ (3072, 768, 131072, 32, 32, False, True, True): (1, 512, 1, 4),
+ (3072, 768, 131072, 32, 32, True, False, True): (9, 512, 3, 4),
+ (3072, 768, 131072, 64, 64, False, True, True): (1, 1024, 1, 4),
+ (3072, 768, 131072, 64, 64, True, False, True): (3, 512, 2, 4),
+ (3072, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
+ (3072, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4),
(3072, 3072, 256, 16, 16, False, True, True): (5, 4, 1, 4),
(3072, 3072, 256, 16, 16, True, False, True): (1, 2, 5, 2),
(3072, 3072, 256, 32, 32, False, True, True): (5, 4, 1, 8),
@@ -2418,6 +2609,14 @@
(256, 256, 32768, 64, 64, True, False, True): (2, 256, 1, 4),
(256, 256, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
(256, 256, 32768, 128, 128, True, False, True): (2, 256, 1, 4),
+ (256, 256, 50432, 16, 16, False, True, True): (4, 197, 1, 4),
+ (256, 256, 50432, 16, 16, True, False, True): (4, 197, 3, 2),
+ (256, 256, 50432, 32, 32, False, True, True): (1, 394, 1, 2),
+ (256, 256, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+ (256, 256, 50432, 64, 64, False, True, True): (6, 394, 1, 4),
+ (256, 256, 50432, 64, 64, True, False, True): (4, 394, 2, 4),
+ (256, 256, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+ (256, 256, 50432, 128, 128, True, False, True): (1, 394, 2, 4),
(256, 256, 65536, 16, 16, False, True, True): (1, 256, 3, 2),
(256, 256, 65536, 16, 16, True, False, True): (1, 256, 3, 2),
(256, 256, 65536, 32, 32, False, True, True): (1, 512, 3, 2),
@@ -2558,6 +2757,14 @@
(512, 512, 32768, 64, 64, True, False, True): (1, 256, 3, 4),
(512, 512, 32768, 128, 128, False, True, True): (5, 256, 1, 4),
(512, 512, 32768, 128, 128, True, False, True): (1, 256, 1, 4),
+ (512, 512, 50432, 16, 16, False, True, True): (4, 197, 1, 4),
+ (512, 512, 50432, 16, 16, True, False, True): (4, 197, 3, 2),
+ (512, 512, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+ (512, 512, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+ (512, 512, 50432, 64, 64, False, True, True): (2, 394, 1, 4),
+ (512, 512, 50432, 64, 64, True, False, True): (4, 197, 2, 4),
+ (512, 512, 50432, 128, 128, False, True, True): (5, 394, 1, 4),
+ (512, 512, 50432, 128, 128, True, False, True): (6, 394, 2, 4),
(512, 512, 65536, 16, 16, False, True, True): (1, 256, 3, 2),
(512, 512, 65536, 16, 16, True, False, True): (1, 256, 3, 1),
(512, 512, 65536, 32, 32, False, True, True): (1, 512, 3, 2),
@@ -2654,6 +2861,94 @@
(768, 768, 131072, 64, 64, True, False, True): (3, 512, 3, 4),
(768, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
(768, 768, 131072, 128, 128, True, False, True): (1, 1024, 2, 4),
+ (768, 3072, 256, 16, 16, False, True, True): (1, 8, 5, 2),
+ (768, 3072, 256, 16, 16, True, False, True): (3, 4, 7, 2),
+ (768, 3072, 256, 32, 32, False, True, True): (1, 8, 4, 2),
+ (768, 3072, 256, 32, 32, True, False, True): (1, 4, 5, 4),
+ (768, 3072, 256, 64, 64, False, True, True): (1, 4, 3, 4),
+ (768, 3072, 256, 64, 64, True, False, True): (1, 4, 5, 4),
+ (768, 3072, 256, 128, 128, False, True, True): (2, 2, 3, 8),
+ (768, 3072, 256, 128, 128, True, False, True): (2, 2, 3, 8),
+ (768, 3072, 512, 16, 16, False, True, True): (1, 8, 5, 2),
+ (768, 3072, 512, 16, 16, True, False, True): (1, 8, 5, 2),
+ (768, 3072, 512, 32, 32, False, True, True): (3, 8, 3, 4),
+ (768, 3072, 512, 32, 32, True, False, True): (1, 8, 7, 4),
+ (768, 3072, 512, 64, 64, False, True, True): (3, 8, 3, 4),
+ (768, 3072, 512, 64, 64, True, False, True): (3, 8, 5, 4),
+ (768, 3072, 512, 128, 128, False, True, True): (1, 4, 3, 8),
+ (768, 3072, 512, 128, 128, True, False, True): (1, 4, 3, 8),
+ (768, 3072, 1024, 16, 16, False, True, True): (4, 16, 1, 4),
+ (768, 3072, 1024, 16, 16, True, False, True): (2, 8, 5, 2),
+ (768, 3072, 1024, 32, 32, False, True, True): (1, 16, 6, 2),
+ (768, 3072, 1024, 32, 32, True, False, True): (1, 8, 4, 4),
+ (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 4, 4),
+ (768, 3072, 1024, 64, 64, True, False, True): (2, 16, 4, 4),
+ (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 3, 8),
+ (768, 3072, 1024, 128, 128, True, False, True): (3, 8, 3, 8),
+ (768, 3072, 2048, 16, 16, False, True, True): (1, 16, 1, 2),
+ (768, 3072, 2048, 16, 16, True, False, True): (1, 16, 5, 2),
+ (768, 3072, 2048, 32, 32, False, True, True): (4, 16, 1, 8),
+ (768, 3072, 2048, 32, 32, True, False, True): (2, 8, 3, 4),
+ (768, 3072, 2048, 64, 64, False, True, True): (2, 16, 3, 4),
+ (768, 3072, 2048, 64, 64, True, False, True): (2, 16, 3, 4),
+ (768, 3072, 2048, 128, 128, False, True, True): (3, 16, 3, 8),
+ (768, 3072, 2048, 128, 128, True, False, True): (1, 16, 3, 8),
+ (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 1, 4),
+ (768, 3072, 4096, 16, 16, True, False, True): (1, 16, 3, 1),
+ (768, 3072, 4096, 32, 32, False, True, True): (3, 32, 1, 8),
+ (768, 3072, 4096, 32, 32, True, False, True): (2, 16, 3, 8),
+ (768, 3072, 4096, 64, 64, False, True, True): (2, 32, 3, 4),
+ (768, 3072, 4096, 64, 64, True, False, True): (2, 16, 3, 4),
+ (768, 3072, 4096, 128, 128, False, True, True): (5, 32, 1, 4),
+ (768, 3072, 4096, 128, 128, True, False, True): (4, 32, 3, 8),
+ (768, 3072, 8192, 16, 16, False, True, True): (1, 32, 1, 4),
+ (768, 3072, 8192, 16, 16, True, False, True): (4, 64, 4, 2),
+ (768, 3072, 8192, 32, 32, False, True, True): (1, 64, 1, 8),
+ (768, 3072, 8192, 32, 32, True, False, True): (2, 32, 3, 8),
+ (768, 3072, 8192, 64, 64, False, True, True): (2, 64, 3, 4),
+ (768, 3072, 8192, 64, 64, True, False, True): (2, 32, 3, 4),
+ (768, 3072, 8192, 128, 128, False, True, True): (1, 64, 3, 8),
+ (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 3, 8),
+ (768, 3072, 16384, 16, 16, False, True, True): (1, 64, 1, 4),
+ (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 4, 1),
+ (768, 3072, 16384, 32, 32, False, True, True): (1, 128, 1, 8),
+ (768, 3072, 16384, 32, 32, True, False, True): (1, 64, 3, 4),
+ (768, 3072, 16384, 64, 64, False, True, True): (1, 128, 3, 4),
+ (768, 3072, 16384, 64, 64, True, False, True): (1, 64, 3, 4),
+ (768, 3072, 16384, 128, 128, False, True, True): (2, 128, 3, 8),
+ (768, 3072, 16384, 128, 128, True, False, True): (1, 128, 3, 8),
+ (768, 3072, 32768, 16, 16, False, True, True): (1, 128, 1, 4),
+ (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 4, 1),
+ (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 1, 8),
+ (768, 3072, 32768, 32, 32, True, False, True): (1, 128, 3, 4),
+ (768, 3072, 32768, 64, 64, False, True, True): (1, 256, 3, 4),
+ (768, 3072, 32768, 64, 64, True, False, True): (1, 128, 3, 4),
+ (768, 3072, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
+ (768, 3072, 32768, 128, 128, True, False, True): (5, 256, 3, 8),
+ (768, 3072, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+ (768, 3072, 50432, 16, 16, True, False, True): (4, 197, 4, 1),
+ (768, 3072, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+ (768, 3072, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+ (768, 3072, 50432, 64, 64, False, True, True): (1, 394, 3, 4),
+ (768, 3072, 50432, 64, 64, True, False, True): (1, 197, 3, 4),
+ (768, 3072, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+ (768, 3072, 50432, 128, 128, True, False, True): (3, 394, 2, 4),
+ (768, 3072, 65536, 16, 16, False, True, True): (1, 256, 1, 4),
+ (768, 3072, 65536, 16, 16, True, False, True): (5, 256, 4, 1),
+ (768, 3072, 65536, 32, 32, False, True, True): (2, 256, 1, 4),
+ (768, 3072, 65536, 32, 32, True, False, True): (3, 256, 3, 4),
+ (768, 3072, 65536, 64, 64, False, True, True): (1, 512, 3, 4),
+ (768, 3072, 65536, 64, 64, True, False, True): (1, 256, 3, 4),
+ (768, 3072, 65536, 128, 128, False, True, True): (3, 512, 1, 4),
+ (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 3, 8),
+ (768, 3072, 131072, 16, 16, False, True, True): (1, 512, 1, 4),
+ (768, 3072, 131072, 16, 16, True, False, True): (5, 512, 4, 1),
+ (768, 3072, 131072, 32, 32, False, True, True): (2, 512, 1, 4),
+ (768, 3072, 131072, 32, 32, True, False, True): (2, 512, 3, 4),
+ (768, 3072, 131072, 64, 64, False, True, True): (1, 1024, 3, 4),
+ (768, 3072, 131072, 64, 64, True, False, True): (2, 512, 3, 4),
+ (768, 3072, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
+ (768, 3072, 131072, 128, 128, True, False, True): (2, 1024, 3, 8),
(1024, 1024, 256, 16, 16, False, True, True): (3, 4, 5, 4),
(1024, 1024, 256, 16, 16, True, False, True): (3, 4, 5, 4),
(1024, 1024, 256, 32, 32, False, True, True): (2, 4, 6, 2),
@@ -2718,6 +3013,14 @@
(1024, 1024, 32768, 64, 64, True, False, True): (1, 256, 3, 4),
(1024, 1024, 32768, 128, 128, False, True, True): (7, 256, 1, 4),
(1024, 1024, 32768, 128, 128, True, False, True): (4, 256, 1, 4),
+ (1024, 1024, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+ (1024, 1024, 50432, 16, 16, True, False, True): (4, 197, 3, 4),
+ (1024, 1024, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+ (1024, 1024, 50432, 32, 32, True, False, True): (1, 197, 3, 4),
+ (1024, 1024, 50432, 64, 64, False, True, True): (2, 394, 1, 4),
+ (1024, 1024, 50432, 64, 64, True, False, True): (1, 197, 2, 4),
+ (1024, 1024, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+ (1024, 1024, 50432, 128, 128, True, False, True): (2, 394, 2, 4),
(1024, 1024, 65536, 16, 16, False, True, True): (1, 256, 3, 4),
(1024, 1024, 65536, 16, 16, True, False, True): (1, 256, 3, 1),
(1024, 1024, 65536, 32, 32, False, True, True): (1, 512, 3, 2),
@@ -2878,6 +3181,14 @@
(2048, 2048, 32768, 64, 64, True, False, True): (8, 256, 3, 4),
(2048, 2048, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
(2048, 2048, 32768, 128, 128, True, False, True): (1, 256, 1, 4),
+ (2048, 2048, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+ (2048, 2048, 50432, 16, 16, True, False, True): (4, 197, 4, 1),
+ (2048, 2048, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+ (2048, 2048, 50432, 32, 32, True, False, True): (4, 197, 3, 4),
+ (2048, 2048, 50432, 64, 64, False, True, True): (2, 394, 3, 4),
+ (2048, 2048, 50432, 64, 64, True, False, True): (4, 197, 2, 4),
+ (2048, 2048, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+ (2048, 2048, 50432, 128, 128, True, False, True): (4, 394, 2, 4),
(2048, 2048, 65536, 16, 16, False, True, True): (9, 256, 3, 2),
(2048, 2048, 65536, 16, 16, True, False, True): (9, 256, 4, 4),
(2048, 2048, 65536, 32, 32, False, True, True): (7, 256, 3, 4),
@@ -2894,6 +3205,94 @@
(2048, 2048, 131072, 64, 64, True, False, True): (2, 1024, 3, 4),
(2048, 2048, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
(2048, 2048, 131072, 128, 128, True, False, True): (1, 1024, 1, 4),
+ (3072, 768, 256, 16, 16, False, True, True): (6, 4, 1, 4),
+ (3072, 768, 256, 16, 16, True, False, True): (2, 1, 5, 2),
+ (3072, 768, 256, 32, 32, False, True, True): (1, 4, 1, 8),
+ (3072, 768, 256, 32, 32, True, False, True): (4, 2, 4, 4),
+ (3072, 768, 256, 64, 64, False, True, True): (1, 2, 3, 4),
+ (3072, 768, 256, 64, 64, True, False, True): (3, 4, 3, 4),
+ (3072, 768, 256, 128, 128, False, True, True): (1, 2, 3, 8),
+ (3072, 768, 256, 128, 128, True, False, True): (3, 2, 3, 8),
+ (3072, 768, 512, 16, 16, False, True, True): (1, 4, 1, 4),
+ (3072, 768, 512, 16, 16, True, False, True): (3, 4, 4, 1),
+ (3072, 768, 512, 32, 32, False, True, True): (5, 8, 1, 4),
+ (3072, 768, 512, 32, 32, True, False, True): (3, 4, 4, 2),
+ (3072, 768, 512, 64, 64, False, True, True): (1, 8, 1, 4),
+ (3072, 768, 512, 64, 64, True, False, True): (1, 4, 3, 4),
+ (3072, 768, 512, 128, 128, False, True, True): (3, 4, 3, 8),
+ (3072, 768, 512, 128, 128, True, False, True): (1, 4, 3, 8),
+ (3072, 768, 1024, 16, 16, False, True, True): (1, 8, 1, 4),
+ (3072, 768, 1024, 16, 16, True, False, True): (3, 4, 3, 1),
+ (3072, 768, 1024, 32, 32, False, True, True): (1, 16, 1, 4),
+ (3072, 768, 1024, 32, 32, True, False, True): (1, 4, 3, 8),
+ (3072, 768, 1024, 64, 64, False, True, True): (8, 16, 3, 2),
+ (3072, 768, 1024, 64, 64, True, False, True): (1, 4, 3, 4),
+ (3072, 768, 1024, 128, 128, False, True, True): (2, 8, 3, 8),
+ (3072, 768, 1024, 128, 128, True, False, True): (3, 8, 2, 4),
+ (3072, 768, 2048, 16, 16, False, True, True): (1, 8, 1, 4),
+ (3072, 768, 2048, 16, 16, True, False, True): (6, 8, 4, 4),
+ (3072, 768, 2048, 32, 32, False, True, True): (1, 16, 1, 8),
+ (3072, 768, 2048, 32, 32, True, False, True): (6, 8, 3, 4),
+ (3072, 768, 2048, 64, 64, False, True, True): (8, 16, 3, 4),
+ (3072, 768, 2048, 64, 64, True, False, True): (3, 16, 3, 4),
+ (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 3, 8),
+ (3072, 768, 2048, 128, 128, True, False, True): (2, 16, 2, 4),
+ (3072, 768, 4096, 16, 16, False, True, True): (1, 16, 1, 4),
+ (3072, 768, 4096, 16, 16, True, False, True): (4, 32, 4, 2),
+ (3072, 768, 4096, 32, 32, False, True, True): (1, 32, 1, 8),
+ (3072, 768, 4096, 32, 32, True, False, True): (4, 16, 3, 4),
+ (3072, 768, 4096, 64, 64, False, True, True): (2, 32, 1, 4),
+ (3072, 768, 4096, 64, 64, True, False, True): (2, 16, 2, 4),
+ (3072, 768, 4096, 128, 128, False, True, True): (2, 32, 1, 16),
+ (3072, 768, 4096, 128, 128, True, False, True): (3, 32, 2, 4),
+ (3072, 768, 8192, 16, 16, False, True, True): (2, 32, 1, 4),
+ (3072, 768, 8192, 16, 16, True, False, True): (4, 64, 4, 2),
+ (3072, 768, 8192, 32, 32, False, True, True): (2, 32, 1, 4),
+ (3072, 768, 8192, 32, 32, True, False, True): (6, 32, 3, 4),
+ (3072, 768, 8192, 64, 64, False, True, True): (2, 64, 1, 4),
+ (3072, 768, 8192, 64, 64, True, False, True): (2, 32, 2, 4),
+ (3072, 768, 8192, 128, 128, False, True, True): (3, 64, 1, 4),
+ (3072, 768, 8192, 128, 128, True, False, True): (2, 64, 2, 4),
+ (3072, 768, 16384, 16, 16, False, True, True): (1, 64, 1, 4),
+ (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 1, 1),
+ (3072, 768, 16384, 32, 32, False, True, True): (2, 64, 1, 4),
+ (3072, 768, 16384, 32, 32, True, False, True): (4, 64, 3, 4),
+ (3072, 768, 16384, 64, 64, False, True, True): (2, 128, 1, 4),
+ (3072, 768, 16384, 64, 64, True, False, True): (4, 64, 2, 4),
+ (3072, 768, 16384, 128, 128, False, True, True): (3, 128, 1, 4),
+ (3072, 768, 16384, 128, 128, True, False, True): (1, 128, 2, 4),
+ (3072, 768, 32768, 16, 16, False, True, True): (1, 128, 1, 4),
+ (3072, 768, 32768, 16, 16, True, False, True): (8, 256, 3, 2),
+ (3072, 768, 32768, 32, 32, False, True, True): (2, 128, 1, 4),
+ (3072, 768, 32768, 32, 32, True, False, True): (8, 128, 3, 4),
+ (3072, 768, 32768, 64, 64, False, True, True): (1, 256, 1, 4),
+ (3072, 768, 32768, 64, 64, True, False, True): (8, 128, 2, 4),
+ (3072, 768, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
+ (3072, 768, 32768, 128, 128, True, False, True): (3, 256, 2, 4),
+ (3072, 768, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+ (3072, 768, 50432, 16, 16, True, False, True): (7, 197, 4, 1),
+ (3072, 768, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+ (3072, 768, 50432, 32, 32, True, False, True): (10, 197, 3, 4),
+ (3072, 768, 50432, 64, 64, False, True, True): (1, 394, 1, 4),
+ (3072, 768, 50432, 64, 64, True, False, True): (3, 197, 2, 4),
+ (3072, 768, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+ (3072, 768, 50432, 128, 128, True, False, True): (2, 394, 2, 4),
+ (3072, 768, 65536, 16, 16, False, True, True): (1, 256, 1, 4),
+ (3072, 768, 65536, 16, 16, True, False, True): (15, 256, 4, 1),
+ (3072, 768, 65536, 32, 32, False, True, True): (2, 256, 1, 4),
+ (3072, 768, 65536, 32, 32, True, False, True): (10, 256, 3, 4),
+ (3072, 768, 65536, 64, 64, False, True, True): (1, 512, 1, 4),
+ (3072, 768, 65536, 64, 64, True, False, True): (3, 256, 2, 4),
+ (3072, 768, 65536, 128, 128, False, True, True): (3, 512, 1, 4),
+ (3072, 768, 65536, 128, 128, True, False, True): (3, 512, 2, 4),
+ (3072, 768, 131072, 16, 16, False, True, True): (1, 512, 1, 4),
+ (3072, 768, 131072, 16, 16, True, False, True): (15, 512, 4, 1),
+ (3072, 768, 131072, 32, 32, False, True, True): (2, 512, 1, 4),
+ (3072, 768, 131072, 32, 32, True, False, True): (9, 512, 3, 4),
+ (3072, 768, 131072, 64, 64, False, True, True): (1, 1024, 1, 4),
+ (3072, 768, 131072, 64, 64, True, False, True): (3, 512, 2, 4),
+ (3072, 768, 131072, 128, 128, False, True, True): (3, 1024, 1, 4),
+ (3072, 768, 131072, 128, 128, True, False, True): (3, 1024, 2, 4),
(3072, 3072, 256, 16, 16, False, True, True): (5, 4, 1, 4),
(3072, 3072, 256, 16, 16, True, False, True): (1, 2, 5, 2),
(3072, 3072, 256, 32, 32, False, True, True): (1, 4, 1, 8),
@@ -3038,6 +3437,14 @@
(4096, 4096, 32768, 64, 64, True, False, True): (3, 256, 3, 4),
(4096, 4096, 32768, 128, 128, False, True, True): (3, 256, 1, 4),
(4096, 4096, 32768, 128, 128, True, False, True): (1, 256, 1, 4),
+ (4096, 4096, 50432, 16, 16, False, True, True): (1, 197, 1, 4),
+ (4096, 4096, 50432, 16, 16, True, False, True): (4, 197, 4, 1),
+ (4096, 4096, 50432, 32, 32, False, True, True): (1, 197, 1, 4),
+ (4096, 4096, 50432, 32, 32, True, False, True): (2, 197, 3, 4),
+ (4096, 4096, 50432, 64, 64, False, True, True): (1, 394, 3, 4),
+ (4096, 4096, 50432, 64, 64, True, False, True): (1, 197, 2, 4),
+ (4096, 4096, 50432, 128, 128, False, True, True): (3, 394, 1, 4),
+ (4096, 4096, 50432, 128, 128, True, False, True): (1, 394, 2, 4),
(4096, 4096, 65536, 16, 16, False, True, True): (5, 256, 4, 4),
(4096, 4096, 65536, 16, 16, True, False, True): (5, 256, 4, 4),
(4096, 4096, 65536, 32, 32, False, True, True): (4, 256, 4, 8),
@@ -3198,6 +3605,11 @@
(8192, 8192, 32768, 64, 64, True, False, True): (2, 128, 3, 8),
(8192, 8192, 32768, 128, 128, False, True, True): (6, 256, 1, 4),
(8192, 8192, 32768, 128, 128, True, False, True): (4, 256, 1, 4),
+ (8192, 8192, 50432, 16, 16, False, True, True): (1, 197, 1, 1),
+ (8192, 8192, 50432, 16, 16, True, False, True): (3, 197, 4, 1),
+ (8192, 8192, 50432, 32, 32, False, True, True): (2, 197, 1, 4),
+ (8192, 8192, 50432, 32, 32, True, False, True): (2, 197, 3, 4),
+ (8192, 8192, 50432, 64, 64, False, True, True): (2, 394, 3, 4),
(8192, 8192, 65536, 16, 16, False, True, True): (3, 256, 4, 4),
(8192, 8192, 65536, 16, 16, True, False, True): (4, 256, 4, 4),
(8192, 8192, 65536, 32, 32, False, True, True): (2, 256, 4, 8),
@@ -3440,6 +3852,7 @@
(24576, 24576, 32768, 128, 128, True, False, True): (2, 256, 3, 8),
(24576, 24576, 65536, 16, 16, False, True, True): (2, 512, 1, 2),
(24576, 24576, 65536, 16, 16, True, False, True): (1, 256, 4, 4),
+ (32768, 32768, 256, 16, 16, False, True, True): (4, 2, 1, 2),
},
("bsr_dense_addmm", "NVIDIA A100-SXM4-80GB", (0, torch.float16, 0.56)): {
(192, 192, 256, 64, 64, False, True, True): (1, 4, 3, 4),
@@ -4044,10 +4457,94 @@
(768, 768, 131072, 64, 64, True, False, True): (1, 2048, 3, 4),
(768, 768, 131072, 128, 128, False, True, True): (1, 1024, 1, 32),
(768, 768, 131072, 128, 128, True, False, True): (1, 1024, 1, 32),
+ (768, 3072, 256, 16, 16, False, True, True): (1, 2, 4, 4),
+ (768, 3072, 256, 16, 16, True, False, True): (1, 4, 3, 4),
+ (768, 3072, 256, 32, 32, False, True, True): (1, 4, 3, 4),
+ (768, 3072, 256, 32, 32, True, False, True): (3, 4, 3, 4),
+ (768, 3072, 256, 64, 64, False, True, True): (1, 4, 3, 8),
+ (768, 3072, 256, 64, 64, True, False, True): (1, 4, 3, 8),
+ (768, 3072, 256, 128, 128, False, True, True): (2, 2, 2, 32),
+ (768, 3072, 256, 128, 128, True, False, True): (2, 2, 1, 32),
+ (768, 3072, 512, 16, 16, False, True, True): (2, 4, 3, 4),
+ (768, 3072, 512, 16, 16, True, False, True): (1, 8, 3, 2),
+ (768, 3072, 512, 32, 32, False, True, True): (3, 8, 4, 4),
+ (768, 3072, 512, 32, 32, True, False, True): (3, 8, 3, 4),
+ (768, 3072, 512, 64, 64, False, True, True): (1, 8, 4, 8),
+ (768, 3072, 512, 64, 64, True, False, True): (1, 8, 3, 8),
+ (768, 3072, 512, 128, 128, False, True, True): (1, 4, 2, 32),
+ (768, 3072, 512, 128, 128, True, False, True): (1, 4, 1, 32),
+ (768, 3072, 1024, 16, 16, False, True, True): (4, 16, 3, 2),
+ (768, 3072, 1024, 16, 16, True, False, True): (4, 16, 3, 2),
+ (768, 3072, 1024, 32, 32, False, True, True): (4, 16, 5, 4),
+ (768, 3072, 1024, 32, 32, True, False, True): (4, 16, 5, 4),
+ (768, 3072, 1024, 64, 64, False, True, True): (2, 16, 3, 8),
+ (768, 3072, 1024, 64, 64, True, False, True): (2, 16, 3, 8),
+ (768, 3072, 1024, 128, 128, False, True, True): (1, 8, 1, 32),
+ (768, 3072, 1024, 128, 128, True, False, True): (1, 8, 1, 32),
+ (768, 3072, 2048, 16, 16, False, True, True): (2, 16, 3, 4),
+ (768, 3072, 2048, 16, 16, True, False, True): (2, 16, 3, 4),
+ (768, 3072, 2048, 32, 32, False, True, True): (4, 32, 5, 4),
+ (768, 3072, 2048, 32, 32, True, False, True): (2, 32, 3, 4),
+ (768, 3072, 2048, 64, 64, False, True, True): (2, 32, 3, 8),
+ (768, 3072, 2048, 64, 64, True, False, True): (2, 32, 3, 8),
+ (768, 3072, 2048, 128, 128, False, True, True): (1, 16, 1, 32),
+ (768, 3072, 2048, 128, 128, True, False, True): (2, 16, 1, 32),
+ (768, 3072, 4096, 16, 16, False, True, True): (1, 32, 5, 4),
+ (768, 3072, 4096, 16, 16, True, False, True): (3, 64, 3, 2),
+ (768, 3072, 4096, 32, 32, False, True, True): (5, 64, 3, 4),
+ (768, 3072, 4096, 32, 32, True, False, True): (5, 64, 3, 4),
+ (768, 3072, 4096, 64, 64, False, True, True): (1, 64, 3, 8),
+ (768, 3072, 4096, 64, 64, True, False, True): (5, 64, 3, 4),
+ (768, 3072, 4096, 128, 128, False, True, True): (1, 32, 1, 32),
+ (768, 3072, 4096, 128, 128, True, False, True): (1, 32, 1, 32),
+ (768, 3072, 8192, 16, 16, False, True, True): (1, 128, 3, 2),
+ (768, 3072, 8192, 16, 16, True, False, True): (1, 128, 3, 2),
+ (768, 3072, 8192, 32, 32, False, True, True): (1, 128, 3, 4),
+ (768, 3072, 8192, 32, 32, True, False, True): (1, 64, 3, 4),
+ (768, 3072, 8192, 64, 64, False, True, True): (3, 128, 3, 4),
+ (768, 3072, 8192, 64, 64, True, False, True): (3, 128, 3, 4),
+ (768, 3072, 8192, 128, 128, False, True, True): (4, 64, 2, 32),
+ (768, 3072, 8192, 128, 128, True, False, True): (2, 64, 1, 32),
+ (768, 3072, 16384, 16, 16, False, True, True): (1, 256, 2, 2),
+ (768, 3072, 16384, 16, 16, True, False, True): (1, 64, 3, 4),
+ (768, 3072, 16384, 32, 32, False, True, True): (8, 128, 3, 4),
+ (768, 3072, 16384, 32, 32, True, False, True): (1, 128, 3, 4),
+ (768, 3072, 16384, 64, 64, False, True, True): (1, 256, 3, 4),
+ (768, 3072, 16384, 64, 64, True, False, True): (3, 256, 3, 4),
+ (768, 3072, 16384, 128, 128, False, True, True): (3, 128, 1, 32),
+ (768, 3072, 16384, 128, 128, True, False, True): (2, 128, 2, 32),
+ (768, 3072, 32768, 16, 16, False, True, True): (1, 512, 3, 1),
+ (768, 3072, 32768, 16, 16, True, False, True): (1, 128, 3, 4),
+ (768, 3072, 32768, 32, 32, False, True, True): (1, 256, 3, 4),
+ (768, 3072, 32768, 32, 32, True, False, True): (1, 256, 3, 4),
+ (768, 3072, 32768, 64, 64, False, True, True): (2, 512, 3, 4),
+ (768, 3072, 32768, 64, 64, True, False, True): (1, 512, 3, 4),
+ (768, 3072, 32768, 128, 128, False, True, True): (1, 256, 1, 32),
+ (768, 3072, 32768, 128, 128, True, False, True): (2, 256, 2, 32),
(768, 3072, 50432, 16, 16, False, True, True): (1, 197, 3, 4),
+ (768, 3072, 50432, 16, 16, True, False, True): (1, 197, 3, 4),
(768, 3072, 50432, 32, 32, False, True, True): (1, 788, 2, 4),
+ (768, 3072, 50432, 32, 32, True, False, True): (1, 394, 3, 4),
(768, 3072, 50432, 64, 64, False, True, True): (1, 788, 3, 4),
+ (768, 3072, 50432, 64, 64, True, False, True): (2, 788, 3, 4),
(768, 3072, 50432, 128, 128, False, True, True): (1, 394, 1, 32),
+ (768, 3072, 50432, 128, 128, True, False, True): (2, 394, 2, 32),
+ (768, 3072, 65536, 16, 16, False, True, True): (1, 1024, 3, 1),
+ (768, 3072, 65536, 16, 16, True, False, True): (1, 256, 3, 4),
+ (768, 3072, 65536, 32, 32, False, True, True): (1, 512, 3, 4),
+ (768, 3072, 65536, 32, 32, True, False, True): (1, 512, 3, 4),
+ (768, 3072, 65536, 64, 64, False, True, True): (2, 1024, 3, 4),
+ (768, 3072, 65536, 64, 64, True, False, True): (5, 1024, 3, 4),
+ (768, 3072, 65536, 128, 128, False, True, True): (1, 512, 1, 32),
+ (768, 3072, 65536, 128, 128, True, False, True): (2, 512, 2, 32),
+ (768, 3072, 131072, 16, 16, False, True, True): (1, 2048, 3, 1),
+ (768, 3072, 131072, 16, 16, True, False, True): (1, 512, 3, 4),
+ (768, 3072, 131072, 32, 32, False, True, True): (1, 1024, 3, 4),
+ (768, 3072, 131072, 32, 32, True, False, True): (1, 1024, 3, 4),
+ (768, 3072, 131072, 64, 64, False, True, True): (1, 2048, 3, 4),
+ (768, 3072, 131072, 64, 64, True, False, True): (2, 2048, 3, 4),
+ (768, 3072, 131072, 128, 128, False, True, True): (1, 1024, 1, 32),
+ (768, 3072, 131072, 128, 128, True, False, True): (1, 1024, 2, 32),
(1024, 1024, 256, 16, 16, False, True, True): (4, 8, 3, 2),
(1024, 1024, 256, 16, 16, True, False, True): (2, 8, 3, 2),
(1024, 1024, 256, 32, 32, False, True, True): (1, 8, 3, 4),
@@ -4288,6 +4785,94 @@
(2048, 2048, 131072, 64, 64, True, False, True): (1, 2048, 2, 4),
(2048, 2048, 131072, 128, 128, False, True, True): (1, 1024, 1, 32),
(2048, 2048, 131072, 128, 128, True, False, True): (4, 1024, 1, 32),
+ (3072, 768, 256, 16, 16, False, True, True): (4, 4, 3, 2),
+ (3072, 768, 256, 16, 16, True, False, True): (1, 2, 6, 4),
+ (3072, 768, 256, 32, 32, False, True, True): (1, 4, 6, 4),
+ (3072, 768, 256, 32, 32, True, False, True): (5, 4, 3, 4),
+ (3072, 768, 256, 64, 64, False, True, True): (4, 4, 3, 8),
+ (3072, 768, 256, 64, 64, True, False, True): (4, 4, 3, 8),
+ (3072, 768, 256, 128, 128, False, True, True): (1, 2, 1, 32),
+ (3072, 768, 256, 128, 128, True, False, True): (5, 2, 1, 32),
+ (3072, 768, 512, 16, 16, False, True, True): (4, 4, 3, 4),
+ (3072, 768, 512, 16, 16, True, False, True): (1, 4, 3, 4),
+ (3072, 768, 512, 32, 32, False, True, True): (3, 8, 3, 4),
+ (3072, 768, 512, 32, 32, True, False, True): (3, 8, 3, 4),
+ (3072, 768, 512, 64, 64, False, True, True): (2, 8, 3, 8),
+ (3072, 768, 512, 64, 64, True, False, True): (2, 8, 3, 8),
+ (3072, 768, 512, 128, 128, False, True, True): (1, 4, 2, 32),
+ (3072, 768, 512, 128, 128, True, False, True): (1, 4, 1, 32),
+ (3072, 768, 1024, 16, 16, False, True, True): (1, 16, 3, 2),
+ (3072, 768, 1024, 16, 16, True, False, True): (3, 16, 3, 2),
+ (3072, 768, 1024, 32, 32, False, True, True): (1, 16, 3, 4),
+ (3072, 768, 1024, 32, 32, True, False, True): (3, 16, 3, 4),
+ (3072, 768, 1024, 64, 64, False, True, True): (4, 16, 3, 8),
+ (3072, 768, 1024, 64, 64, True, False, True): (4, 16, 3, 4),
+ (3072, 768, 1024, 128, 128, False, True, True): (5, 8, 1, 32),
+ (3072, 768, 1024, 128, 128, True, False, True): (5, 8, 1, 32),
+ (3072, 768, 2048, 16, 16, False, True, True): (4, 32, 3, 2),
+ (3072, 768, 2048, 16, 16, True, False, True): (1, 32, 3, 2),
+ (3072, 768, 2048, 32, 32, False, True, True): (1, 32, 3, 4),
+ (3072, 768, 2048, 32, 32, True, False, True): (1, 32, 2, 4),
+ (3072, 768, 2048, 64, 64, False, True, True): (2, 32, 3, 4),
+ (3072, 768, 2048, 64, 64, True, False, True): (4, 32, 3, 4),
+ (3072, 768, 2048, 128, 128, False, True, True): (1, 16, 1, 32),
+ (3072, 768, 2048, 128, 128, True, False, True): (1, 16, 1, 32),
+ (3072, 768, 4096, 16, 16, False, True, True): (3, 64, 3, 2),
+ (3072, 768, 4096, 16, 16, True, False, True): (1, 64, 3, 2),
+ (3072, 768, 4096, 32, 32, False, True, True): (1, 64, 3, 4),
+ (3072, 768, 4096, 32, 32, True, False, True): (1, 32, 3, 4),
+ (3072, 768, 4096, 64, 64, False, True, True): (2, 64, 3, 4),
+ (3072, 768, 4096, 64, 64, True, False, True): (2, 64, 3, 4),
+ (3072, 768, 4096, 128, 128, False, True, True): (1, 32, 1, 32),
+ (3072, 768, 4096, 128, 128, True, False, True): (1, 32, 1, 32),
+ (3072, 768, 8192, 16, 16, False, True, True): (4, 128, 3, 1),
+ (3072, 768, 8192, 16, 16, True, False, True): (1, 32, 3, 4),
+ (3072, 768, 8192, 32, 32, False, True, True): (1, 64, 3, 4),
+ (3072, 768, 8192, 32, 32, True, False, True): (1, 64, 3, 4),
+ (3072, 768, 8192, 64, 64, False, True, True): (2, 128, 3, 4),
+ (3072, 768, 8192, 64, 64, True, False, True): (2, 128, 3, 4),
+ (3072, 768, 8192, 128, 128, False, True, True): (1, 64, 1, 32),
+ (3072, 768, 8192, 128, 128, True, False, True): (1, 64, 1, 32),
+ (3072, 768, 16384, 16, 16, False, True, True): (4, 256, 3, 1),
+ (3072, 768, 16384, 16, 16, True, False, True): (1, 64, 3, 4),
+ (3072, 768, 16384, 32, 32, False, True, True): (1, 128, 3, 4),
+ (3072, 768, 16384, 32, 32, True, False, True): (1, 128, 3, 4),
+ (3072, 768, 16384, 64, 64, False, True, True): (2, 256, 3, 4),
+ (3072, 768, 16384, 64, 64, True, False, True): (2, 256, 3, 4),
+ (3072, 768, 16384, 128, 128, False, True, True): (1, 128, 1, 32),
+ (3072, 768, 16384, 128, 128, True, False, True): (1, 128, 1, 32),
+ (3072, 768, 32768, 16, 16, False, True, True): (4, 512, 3, 1),
+ (3072, 768, 32768, 16, 16, True, False, True): (1, 128, 3, 4),
+ (3072, 768, 32768, 32, 32, False, True, True): (1, 256, 3, 4),
+ (3072, 768, 32768, 32, 32, True, False, True): (1, 256, 3, 4),
+ (3072, 768, 32768, 64, 64, False, True, True): (2, 512, 3, 4),
+ (3072, 768, 32768, 64, 64, True, False, True): (2, 512, 3, 4),
+ (3072, 768, 32768, 128, 128, False, True, True): (1, 256, 1, 32),
+ (3072, 768, 32768, 128, 128, True, False, True): (1, 256, 1, 32),
+ (3072, 768, 50432, 16, 16, False, True, True): (4, 788, 3, 1),
+ (3072, 768, 50432, 16, 16, True, False, True): (1, 197, 3, 4),
+ (3072, 768, 50432, 32, 32, False, True, True): (1, 394, 3, 4),
+ (3072, 768, 50432, 32, 32, True, False, True): (1, 394, 3, 4),
+ (3072, 768, 50432, 64, 64, False, True, True): (1, 788, 3, 4),
+ (3072, 768, 50432, 64, 64, True, False, True): (2, 788, 3, 4),
+ (3072, 768, 50432, 128, 128, False, True, True): (1, 394, 1, 32),
+ (3072, 768, 50432, 128, 128, True, False, True): (1, 394, 1, 32),
+ (3072, 768, 65536, 16, 16, False, True, True): (4, 1024, 3, 1),
+ (3072, 768, 65536, 16, 16, True, False, True): (1, 256, 3, 4),
+ (3072, 768, 65536, 32, 32, False, True, True): (1, 512, 3, 4),
+ (3072, 768, 65536, 32, 32, True, False, True): (1, 512, 3, 4),
+ (3072, 768, 65536, 64, 64, False, True, True): (2, 1024, 3, 4),
+ (3072, 768, 65536, 64, 64, True, False, True): (2, 1024, 3, 4),
+ (3072, 768, 65536, 128, 128, False, True, True): (1, 512, 1, 32),
+ (3072, 768, 65536, 128, 128, True, False, True): (1, 512, 1, 32),
+ (3072, 768, 131072, 16, 16, False, True, True): (4, 2048, 3, 1),
+ (3072, 768, 131072, 16, 16, True, False, True): (1, 512, 3, 4),
+ (3072, 768, 131072, 32, 32, False, True, True): (1, 1024, 3, 4),
+ (3072, 768, 131072, 32, 32, True, False, True): (1, 1024, 3, 4),
+ (3072, 768, 131072, 64, 64, False, True, True): (2, 2048, 3, 4),
+ (3072, 768, 131072, 64, 64, True, False, True): (2, 2048, 3, 4),
+ (3072, 768, 131072, 128, 128, False, True, True): (1, 1024, 1, 32),
+ (3072, 768, 131072, 128, 128, True, False, True): (1, 1024, 1, 32),
(3072, 3072, 256, 16, 16, False, True, True): (1, 4, 5, 2),
(3072, 3072, 256, 16, 16, True, False, True): (1, 4, 3, 2),
(3072, 3072, 256, 32, 32, False, True, True): (1, 4, 4, 4),