create benchmark example tensors with correct sizes (#106238)

We need to account for the node's layout offset when creating benchmark
example tensors; the new test_cat_addmm test covers this case. Otherwise,
applying torch.as_strided to the returned tensor fails.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/106238
Approved by: https://github.com/jansel
diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py
index ad343f0..1c20897 100644
--- a/test/inductor/test_max_autotune.py
+++ b/test/inductor/test_max_autotune.py
@@ -165,6 +165,31 @@
         with config.patch({"max_autotune": True, "autotune_in_subproc": True}):
             torch.compile(addmm, dynamic=dynamic)(x, a, b)
 
+    def test_cat_addmm(self):  # compiled cat-of-two-addmm must match uncompiled fn
+        def fn(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor):
+            return torch.cat(
+                [
+                    torch.addmm(a, b, c),
+                    torch.addmm(b, c, a),
+                ],
+                1,  # concatenate the two addmm results along dim 1
+            )
+
+        args = [  # three random 4x4 CUDA tensors, passed to fn as (a, b, c)
+            torch.randn(4, 4, device="cuda"),
+            torch.randn(4, 4, device="cuda"),
+            torch.randn(4, 4, device="cuda"),
+        ]
+        with config.patch(
+            {
+                "max_autotune": True,
+                "max_autotune_gemm_backends": "Triton",
+            }
+        ):
+            expected = fn(*args)  # reference: fn called directly, not compiled
+            actual = torch.compile(fn)(*args)  # same inputs through torch.compile
+            torch.testing.assert_close(actual, expected, atol=1e-2, rtol=1e-2)
+
     def test_triton_template_with_epilogues_and_dynamic_shape(self):
         def fn(
             x: torch.Tensor, w: torch.Tensor, bias: torch.Tensor, mul: torch.Tensor
diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py
index 7cd12b2..3b128e5 100644
--- a/torch/_inductor/select_algorithm.py
+++ b/torch/_inductor/select_algorithm.py
@@ -931,6 +931,7 @@
             V.graph.sizevars.size_hints(node.get_stride()),
             device=node.get_device(),
             dtype=node.get_dtype(),
+            extra_size=node.layout.offset,
         )
 
     @staticmethod