Fix torchvision failing test case test_classification_model on slow_conv2d

Pull Request resolved: https://github.com/pytorch/pytorch/pull/77347

Approved by: https://github.com/datumbox, https://github.com/frank-wei
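
The slow (THNN) conv backends assume the weight tensor is contiguous in the
backend memory format, but previously only the input was made contiguous
before dispatching. With a channels_last weight the kernels could read the
weight with the wrong strides and produce incorrect results. This change
makes the weight contiguous as well, in both the forward and backward paths,
and extends test_conv_thnn_nhwc to cover both weight memory formats.

A minimal repro sketch (shapes and sizes are illustrative, not taken from
the original torchvision report):

    import torch
    import torch.nn as nn

    with torch.backends.mkldnn.flags(enabled=False):
        x = torch.randn(2, 8, 4, 4).to(memory_format=torch.channels_last)
        # channels_last weight reaching the slow CPU conv2d path
        conv = nn.Conv2d(8, 4, 3).to(memory_format=torch.channels_last)
        ref = nn.Conv2d(8, 4, 3)  # reference with contiguous weights
        ref.load_state_dict(conv.state_dict())
        # failed before this fix, passes after
        torch.testing.assert_close(conv(x), ref(x.contiguous()))
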
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 2bfa197..b4387f1 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -1407,6 +1407,7 @@
     case ConvBackend::SlowTranspose2d:
     case ConvBackend::SlowTranspose3d:
       input = input.contiguous(backend_memory_format);
+      weight = weight.contiguous(backend_memory_format);
       if (params.groups == 1) {
         output = _convolution_nogroup_backend(input, weight, bias, backend, params);
       } else {
@@ -1905,6 +1906,7 @@
     case ConvBackend::SlowTranspose3d:
     {
       input = input.contiguous(backend_memory_format);
+      weight = weight.contiguous(backend_memory_format);
       if (params.groups == 1) {
         std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) =
           _convolution_backward_nogroup_backend(
diff --git a/test/test_nn.py b/test/test_nn.py
index 91c5b70..3768ff2 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -19165,12 +19165,12 @@
     @onlyCPU
     @dtypes(torch.float, torch.double)
     def test_conv_thnn_nhwc(self, device, dtype):
-        def helper(n, c, h, w, out_channels, kernel_size, dilation, groups):
+        def helper(n, c, h, w, out_channels, kernel_size, dilation, groups, weight_memory_format):
             input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\
                 .to(memory_format=torch.channels_last)
             input.requires_grad_()
             conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups)\
-                .to(device='cpu', dtype=dtype, memory_format=torch.channels_last)
+                .to(device='cpu', dtype=dtype, memory_format=weight_memory_format)
             for p in conv.parameters():
                 p.data = torch.randint_like(p, -3, 3)
 
@@ -19197,15 +19197,16 @@
             self.assertEqual(input.grad, ref_input.grad, exact_dtype=False)
 
         with torch.backends.mkldnn.flags(enabled=False):
-            # non-dilated conv: thnn_conv2d normal path (with im2col)
-            helper(2, 8, 4, 4, out_channels=4, kernel_size=3, dilation=1, groups=1)
-            helper(2, 8, 4, 4, out_channels=8, kernel_size=3, dilation=1, groups=8)
-            # non-dilated conv: thnn_conv2d fast path (skip im2col)
-            helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=1)
-            helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=16)
-            # dilated conv: slow_conv_dilated2d
-            helper(2, 8, 11, 13, out_channels=16, kernel_size=3, dilation=2, groups=1)
-            helper(2, 16, 11, 13, out_channels=32, kernel_size=3, dilation=2, groups=16)
+            for mf in [torch.contiguous_format, torch.channels_last]:
+                # non-dilated conv: thnn_conv2d normal path (with im2col)
+                helper(2, 8, 4, 4, out_channels=4, kernel_size=3, dilation=1, groups=1, weight_memory_format=mf)
+                helper(2, 8, 4, 4, out_channels=8, kernel_size=3, dilation=1, groups=8, weight_memory_format=mf)
+                # non-dilated conv: thnn_conv2d fast path (skip im2col)
+                helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=1, weight_memory_format=mf)
+                helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=16, weight_memory_format=mf)
+                # dilated conv: slow_conv_dilated2d
+                helper(2, 8, 11, 13, out_channels=16, kernel_size=3, dilation=2, groups=1, weight_memory_format=mf)
+                helper(2, 16, 11, 13, out_channels=32, kernel_size=3, dilation=2, groups=16, weight_memory_format=mf)
 
     @onlyCUDA
     @skipCUDAIfRocmVersionLessThan((4, 3))