relax tol for flaky nansum_out_dtype_cuda_float32 test (#121550)

TestReductionsCUDA.test_nansum_out_dtype_cuda_float32 would pass or fail depending on the random inputs. This was observed by ROCm internal QA testing, but the same problematic random inputs also break the test on CUDA, verified on a V100.

There is precedent in another test in the same file for relaxing the tolerance.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121550
Approved by: https://github.com/albanD
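
For reference, a minimal standalone sketch (not part of the patch; shape and tolerance values borrowed from the test below) of why the default tolerance is too tight here: torch and numpy may accumulate a float32 sum in different orders, so the results can drift apart slightly for some random inputs.

    import numpy as np
    import torch

    # Random input, roughly matching the shapes the test generates.
    x = torch.rand(10, 5, 7)
    torch_out = torch.nansum(x, dtype=torch.float32)
    np_out = np.nansum(x.numpy(), dtype=np.float32)
    # With an unlucky input the difference can exceed the default tolerance;
    # the relaxed atol/rtol absorb the accumulation-order drift.
    torch.testing.assert_close(torch_out.item(), float(np_out), atol=7e-05, rtol=3e-06)
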
diff --git a/test/test_reductions.py b/test/test_reductions.py
index b1d043f..bb43d1a 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -1713,6 +1713,20 @@
         self._test_reduction_function_with_numpy(torch.count_nonzero, np.count_nonzero, device, dtype)
         self._test_reduction_function_with_numpy(torch.count_nonzero, np.count_nonzero, device, dtype, True)
 
+    # TODO: Investigate why the output is not close to numpy.
+    def _get_relaxed_tolerances_for(self, dtype):
+        if dtype == torch.float16:
+            atol = 0.4
+            rtol = 1e-2
+        elif dtype == torch.float32:
+            atol = 7e-05
+            rtol = 3e-06
+        else:
+            # Default values
+            atol = None
+            rtol = None
+        return atol, rtol
+
     def _test_sum_reduction_vs_numpy(self, torch_fn, np_fn, device, dtype, with_keepdim=False, with_extremal=False):
         def is_integral(dtype):
             return dtype in integral_types()
@@ -1731,16 +1745,7 @@
             exact_dtype = False
 
         # TODO: Investigate why the output is not close to numpy.
-        if dtype == torch.float16:
-            atol = 0.4
-            rtol = 1e-2
-        elif dtype == torch.float32:
-            atol = 7e-05
-            rtol = 3e-06
-        else:
-            # Default values
-            atol = None
-            rtol = None
+        atol, rtol = self._get_relaxed_tolerances_for(dtype)
         self._test_reduction_function_with_numpy(torch_fn, np_fn, device, dtype,
                                                  atol=atol, rtol=rtol, exact_dtype=exact_dtype,
                                                  with_keepdim=with_keepdim, with_extremal=with_extremal)
@@ -1771,12 +1776,14 @@
         out_dtype = dtype
         inp_dtypes = all_types_and(torch.half) if out_dtype.is_floating_point else integral_types()
         for inp_dtype in inp_dtypes:
+            # TODO: Investigate why the output is not close to numpy.
+            atol, rtol = self._get_relaxed_tolerances_for(dtype)
             shape = _rand_shape(random.randint(2, 5), min_size=5, max_size=10)
             x = _generate_input(shape, inp_dtype, device, with_extremal=False)
             torch_fn = partial(torch.nansum, dtype=out_dtype)
             np_out_dtype = torch_to_numpy_dtype_dict[out_dtype]
             np_fn = partial(np.nansum, dtype=np_out_dtype)
-            self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None)
+            self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None, atol=atol, rtol=rtol)
 
     @dtypes(*all_types_and(torch.half))
     def test_argminmax_multiple(self, device, dtype):
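
For quick reference, a self-contained sketch (a hypothetical free-function form of the helper added above, not part of the patch) that can be run outside the test class to sanity-check the tolerance table:

    import torch

    def relaxed_tolerances_for(dtype):
        # Mirrors _get_relaxed_tolerances_for: looser tolerances for
        # low-precision floats; (None, None) means use the comparison defaults.
        if dtype == torch.float16:
            return 0.4, 1e-2
        if dtype == torch.float32:
            return 7e-05, 3e-06
        return None, None

    assert relaxed_tolerances_for(torch.float32) == (7e-05, 3e-06)
    assert relaxed_tolerances_for(torch.int64) == (None, None)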