Fixed an issue where a user-specified default device clashed with the device placement of the RNG (#114560)

This PR now ignores the user-specified default device, allocates the tensor on the CPU, and then moves the tensor to the device of the input tensor. This was more or less already the standard procedure in case the default device wasn't set.

Fixes #114536.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/114560
Approved by: https://github.com/soulitzer
diff --git a/test/test_autograd.py b/test/test_autograd.py
index f91324b..0c8fe14 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -8743,6 +8743,18 @@
         self.assertEqual(y, y2)
         self.assertEqual(y_expected, y2_expected)
 
+    @unittest.skipIf(not TEST_CUDA, "test requires CUDA")
+    def test_gradcheck_default_device_placement_context(self):
+        # During gradcheck with fast_mode=True, we create a random vector on the CPU device using a CPU generator.
+        # This test ensures that this still works when the default device is set to something else by the user.
+        with torch.device('cuda'):
+            x = torch.randn(3, dtype=torch.double, requires_grad=True)
+
+            def func(inp):
+                return inp ** 2.0
+
+            self.assertTrue(gradcheck(func, x, fast_mode=True))
+
 def index_perm_variable(shape, max_indices):
     if not isinstance(shape, tuple):
         shape = (shape,)
diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py
index dff8321..d4650e9 100644
--- a/torch/autograd/gradcheck.py
+++ b/torch/autograd/gradcheck.py
@@ -1668,7 +1668,7 @@
             .view(x_values.shape)
         )
         values /= values.norm()
-        vec = torch.sparse_coo_tensor(x._indices(), values, x.size())
+        vec = torch.sparse_coo_tensor(x._indices(), values, x.size(), device=x.device)
     elif _is_sparse_compressed_tensor(x):
         if x.layout in {torch.sparse_csr, torch.sparse_bsr}:
             compressed_indices, plain_indices = x.crow_indices(), x.col_indices()
@@ -1683,7 +1683,12 @@
         )
         values /= values.norm()
         vec = torch.sparse_compressed_tensor(
-            compressed_indices, plain_indices, values, x.size(), layout=x.layout
+            compressed_indices,
+            plain_indices,
+            values,
+            x.size(),
+            layout=x.layout,
+            device=x.device,
         )
     else:
         dtype = _to_real_dtype(x.dtype) if downcast_complex else x.dtype
@@ -1785,13 +1790,20 @@
 def _make_vectors(inp_tensors, outputs, *, use_forward_ad):
     # Use our own generator to avoid messing with the user's RNG state
     g_cpu = torch.Generator()
+
+    def _vec_from_tensor_cpu(*args):
+        # Default allocate all tensors on CPU, so they are on the same device as the generator
+        # even if the user specified a default device
+        with torch.device("cpu"):
+            return _vec_from_tensor(*args)
+
     all_u = []
     all_u_dense = []
     for inp in inp_tensors:
-        ur = _vec_from_tensor(inp, g_cpu, True)
+        ur = _vec_from_tensor_cpu(inp, g_cpu, True)
         ur_dense = _to_flat_dense_if_sparse(ur)
         if inp.is_complex():
-            ui = _vec_from_tensor(inp, g_cpu, True)
+            ui = _vec_from_tensor_cpu(inp, g_cpu, True)
             all_u.append((ur, ui))
             ui_dense = _to_flat_dense_if_sparse(ui)
             all_u_dense.append((ur_dense, ui_dense))
@@ -1799,7 +1811,9 @@
             all_u.append(ur)
             all_u_dense.append(ur_dense)
     all_v = (
-        None if use_forward_ad else [_vec_from_tensor(out, g_cpu) for out in outputs]
+        None
+        if use_forward_ad
+        else [_vec_from_tensor_cpu(out, g_cpu) for out in outputs]
     )
     return all_v, all_u, all_u_dense