prepare removal of deprecated functionality in torch.testing (#87969)

_Redo of #86586 with all BC-breaking changes split granularly into separate commits._

---

Per title. The deprecation happened on Feb 25, 2022 in commit c6f1bbc0ac33be0c8ad9956e3fc15e78ddb6cb95 and made it into the 1.12 release. Since it is now 245 days later and the next release will be 1.14, the removals later in this stack comply with the [BC policy](https://github.com/pytorch/pytorch/wiki/PyTorch's-Python-Frontend-Backward-and-Forward-Compatibility-Policy#minimizing-the-disruption-of-bc-breaking-changes).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87969
Approved by: https://github.com/mruberry
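
For context, the main behavioral differences callers hit when moving from `torch.testing.assert_allclose` to `torch.testing.assert_close` are the stricter defaults. A minimal illustrative sketch (the example tensors are made up and not taken from this diff):

```python
import torch

# assert_allclose compared across dtypes and treated NaNs as equal by default;
# assert_close checks dtypes (check_dtype=True) and treats NaNs as unequal
# (equal_nan=False) unless told otherwise.
torch.testing.assert_close(
    torch.ones(3, dtype=torch.float32),
    torch.ones(3, dtype=torch.float64),
    check_dtype=False,  # would raise with the default check_dtype=True
)
torch.testing.assert_close(
    torch.tensor(float("nan")),
    torch.tensor(float("nan")),
    equal_nan=True,  # would raise with the default equal_nan=False
)
```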
diff --git a/caffe2/python/operator_test/_utils.py b/caffe2/python/operator_test/_utils.py
new file mode 100644
index 0000000..3ee1def
--- /dev/null
+++ b/caffe2/python/operator_test/_utils.py
@@ -0,0 +1,50 @@
+"""
+This file only exists because `torch.testing.assert_allclose` is deprecated but is still used extensively throughout
+the tests in this package. The replacement `torch.testing.assert_close` doesn't support one feature that is needed
+here: comparison between numpy arrays and torch tensors. See https://github.com/pytorch/pytorch/issues/61844 for the
+reasoning behind removing that feature.
+"""
+
+import torch
+from typing import Tuple, Any, Optional
+
+_DTYPE_PRECISIONS = {
+    torch.float16: (1e-3, 1e-3),
+    torch.float32: (1e-4, 1e-5),
+    torch.float64: (1e-5, 1e-8),
+}
+
+
+def _get_default_rtol_and_atol(actual: torch.Tensor, expected: torch.Tensor) -> Tuple[float, float]:
+    actual_rtol, actual_atol = _DTYPE_PRECISIONS.get(actual.dtype, (0.0, 0.0))
+    expected_rtol, expected_atol = _DTYPE_PRECISIONS.get(expected.dtype, (0.0, 0.0))
+    return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol)
+
+
+def assert_allclose(
+    actual: Any,
+    expected: Any,
+    rtol: Optional[float] = None,
+    atol: Optional[float] = None,
+    equal_nan: bool = True,
+    msg: str = "",
+) -> None:
+    if not isinstance(actual, torch.Tensor):
+        actual = torch.tensor(actual)
+    if not isinstance(expected, torch.Tensor):
+        expected = torch.tensor(expected, dtype=actual.dtype)
+
+    if rtol is None and atol is None:
+        rtol, atol = _get_default_rtol_and_atol(actual, expected)
+
+    torch.testing.assert_close(
+        actual,
+        expected,
+        rtol=rtol,
+        atol=atol,
+        equal_nan=equal_nan,
+        check_device=True,
+        check_dtype=False,
+        check_stride=False,
+        msg=msg or None,
+    )
\ No newline at end of file
diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py
index 32a2511..31ba78b 100644
--- a/caffe2/python/operator_test/layer_norm_op_test.py
+++ b/caffe2/python/operator_test/layer_norm_op_test.py
@@ -18,6 +18,8 @@
 
 import unittest
 
+from ._utils import assert_allclose
+
 
 def _layer_norm_ref(axis, epsilon, X):
     left = int(np.prod(X.shape[:axis]))
@@ -254,10 +256,9 @@
         actual_mean = self.ws.fetch_blob('mean')
         actual_std = self.ws.fetch_blob('std')
 
-        torch.testing.assert_allclose(
-            expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
-        torch.testing.assert_allclose(expected_mean, actual_mean)
-        torch.testing.assert_allclose(expected_std, actual_std)
+        assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
+        assert_allclose(expected_mean, actual_mean)
+        assert_allclose(expected_std, actual_std)
 
     @given(X=hu.tensor(min_dim=2),
            eps=st.floats(1e-5, 1e-3),
@@ -280,10 +281,9 @@
             actual_norm, actual_mean, actual_std = torch.ops._caffe2.LayerNorm(
                 torch.tensor(X), None, None, axis, eps)
 
-        torch.testing.assert_allclose(
-            expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
-        torch.testing.assert_allclose(expected_mean, actual_mean)
-        torch.testing.assert_allclose(expected_std, actual_std)
+        assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
+        assert_allclose(expected_mean, actual_mean)
+        assert_allclose(expected_std, actual_std)
 
     # Test case is using workspace.has_cuda_support and not
     # workspace.has_gpu_support to exclude it from HIP because tensor interop
@@ -313,10 +313,9 @@
             actual_norm, actual_mean, actual_std = torch.ops._caffe2.LayerNorm(
                 torch.tensor(X).cuda(), None, None, axis, eps)
 
-        torch.testing.assert_allclose(
-            expected_norm, actual_norm.cpu(), rtol=1e-4, atol=1e-4)
-        torch.testing.assert_allclose(expected_mean, actual_mean.cpu())
-        torch.testing.assert_allclose(expected_std, actual_std.cpu())
+        assert_allclose(expected_norm, actual_norm.cpu(), rtol=1e-4, atol=1e-4)
+        assert_allclose(expected_mean, actual_mean.cpu())
+        assert_allclose(expected_std, actual_std.cpu())
 
     @given(X=hu.tensor(min_dim=2),
            eps=st.floats(1e-5, 1e-3),
@@ -352,10 +351,9 @@
             actual_norm, actual_mean, actual_std = jit_layer_norm(
                 torch.tensor(X), None, None, axis, eps, elementwise_affine)
 
-        torch.testing.assert_allclose(
-            expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
-        torch.testing.assert_allclose(expected_mean, actual_mean)
-        torch.testing.assert_allclose(expected_std, actual_std)
+        assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
+        assert_allclose(expected_mean, actual_mean)
+        assert_allclose(expected_std, actual_std)
 
     @given(X=hu.tensor(min_dim=2), **hu.gcs)
     def test_layer_norm_brew_wrapper(self, X, gc, dc):
diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py
index f99a616..d143e01 100644
--- a/caffe2/python/operator_test/torch_integration_test.py
+++ b/caffe2/python/operator_test/torch_integration_test.py
@@ -11,6 +11,8 @@
 from hypothesis import given, settings
 from scipy.stats import norm
 
+from ._utils import assert_allclose
+
 
 def generate_rois(roi_counts, im_dims):
     assert len(roi_counts) == len(im_dims)
@@ -172,7 +174,7 @@
             legacy_plus_one=True,
         )
 
-        torch.testing.assert_allclose(box_out, a)
+        assert_allclose(box_out, a)
 
     @given(
         roi_counts=st.lists(st.integers(0, 5), min_size=1, max_size=10),
@@ -268,7 +270,7 @@
         )
 
         for o, o_ref in zip(outputs, output_refs):
-            torch.testing.assert_allclose(o, o_ref)
+            assert_allclose(o, o_ref)
 
     @given(
         dim_1=st.integers(min_value=10, max_value=10),
@@ -314,7 +316,7 @@
             mask=mask,
         )
 
-        torch.testing.assert_allclose(output, a)
+        assert_allclose(output, a)
 
         # Testing return_presence_mask = True
         output, presence_mask = sparse_to_dense_mask_ref(return_presence_mask=True)
@@ -330,8 +332,8 @@
             return_presence_mask=True,
         )
 
-        torch.testing.assert_allclose(output, a)
-        torch.testing.assert_allclose(presence_mask, b)
+        assert_allclose(output, a)
+        assert_allclose(presence_mask, b)
 
     @given(
         A=st.integers(min_value=4, max_value=4),
@@ -382,8 +384,8 @@
             1.0,
             legacy_plus_one=True,
         )
-        torch.testing.assert_allclose(rois, a)
-        torch.testing.assert_allclose(rois_probs, b)
+        assert_allclose(rois, a)
+        assert_allclose(rois_probs, b)
 
     @given(
         bsz=st.integers(1, 5),
@@ -461,9 +463,9 @@
         a, b, c = torch.ops._caffe2.InferenceLSTM(
             lstm_in, num_layers, has_biases, batch_first, is_bidirectional
         )
-        torch.testing.assert_allclose(output, a)
-        torch.testing.assert_allclose(hidden, b)
-        torch.testing.assert_allclose(cell, c)
+        assert_allclose(output, a)
+        assert_allclose(hidden, b)
+        assert_allclose(cell, c)
 
     # Test case is using workspace.has_cuda_support and not workspace.has_gpu_support
     # to exclude it from HIP because tensor interop doesn't work for HIP tensors yet
@@ -517,8 +519,8 @@
             1.0,
             legacy_plus_one=True,
         )
-        torch.testing.assert_allclose(rois, a.cpu())
-        torch.testing.assert_allclose(rois_probs, b.cpu())
+        assert_allclose(rois, a.cpu())
+        assert_allclose(rois_probs, b.cpu())
 
     @given(
         N=st.integers(min_value=1, max_value=2),
@@ -567,7 +569,7 @@
             sampling_ratio=0,
             aligned=False,
         )
-        torch.testing.assert_allclose(roi_feature_ref, roi_feature.cpu())
+        assert_allclose(roi_feature_ref, roi_feature.cpu())
 
     def test_roi_align_cpu(self):
         self._test_roi_align(device="cpu")
@@ -624,7 +626,7 @@
             sampling_ratio=0,
             aligned=False,
         )
-        torch.testing.assert_allclose(roi_feature_ref, roi_feature.cpu())
+        assert_allclose(roi_feature_ref, roi_feature.cpu())
 
     def test_roi_align_rotated_cpu(self):
         self._test_roi_align_rotated(device="cpu")
@@ -674,9 +676,9 @@
         rois_idx_restore_int32 = fpn_outputs[-1]
 
         # [rois] + fpn_outputs should be equal to all_outputs
-        torch.testing.assert_allclose(rois, all_outputs[0])
+        assert_allclose(rois, all_outputs[0])
         for x, y in zip(fpn_outputs, all_outputs[1:]):
-            torch.testing.assert_allclose(x, y)
+            assert_allclose(x, y)
 
     @given(X=hu.tensor(), fast_gelu=st.booleans())
     def _test_gelu_op(self, X, fast_gelu, device):
@@ -688,7 +690,7 @@
 
         rtol = 1e-3 if fast_gelu else 1e-4
         atol = 1e-5
-        torch.testing.assert_allclose(
+        assert_allclose(
             expected_output, actual_output.cpu(), rtol=rtol, atol=atol
         )
 
@@ -719,7 +721,7 @@
             torch.tensor(data), torch.tensor(lengths, dtype=torch.int32)
         )
 
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def _test_lengths_sum_op(self, device):
         self._test_lengths_op("LengthsSum", torch.ops._caffe2.LengthsSum, device)
@@ -775,7 +777,7 @@
             height_scale=1.5,
         )
 
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def test_resize_nearest_op_cpu(self):
         return self._test_resize_nearest_op("cpu")
@@ -838,16 +840,16 @@
             binary_input,
         )
 
-        torch.testing.assert_allclose(torch.tensor(expected_output), actual_output)
+        assert_allclose(torch.tensor(expected_output), actual_output)
 
     def test_alias_with_name_is_in_place(self):
         device = "cuda" if workspace.has_cuda_support else "cpu"
         x = torch.tensor([3., 42.]).to(device=device)
         y = torch.ops._caffe2.AliasWithName(x, "new_name")
         x[1] = 6
-        torch.testing.assert_allclose(x, torch.tensor([3., 6.]).to(device=device))
+        assert_allclose(x, torch.tensor([3., 6.]).to(device=device))
         # y should also change because y is alias of x
-        torch.testing.assert_allclose(y, torch.tensor([3., 6.]).to(device=device))
+        assert_allclose(y, torch.tensor([3., 6.]).to(device=device))
 
     @unittest.skipIf(not workspace.has_cuda_support, "No cuda support")
     def test_copy_between_cpu_and_gpu(self):
@@ -855,9 +857,9 @@
         x_gpu_ref = x_cpu_ref.to("cuda")
 
         x_gpu = torch.ops._caffe2.CopyCPUToGPU(x_cpu_ref)
-        torch.testing.assert_allclose(x_gpu, x_gpu_ref)
+        assert_allclose(x_gpu, x_gpu_ref)
         x_cpu = torch.ops._caffe2.CopyGPUToCPU(x_gpu)
-        torch.testing.assert_allclose(x_cpu, x_cpu_ref)
+        assert_allclose(x_cpu, x_cpu_ref)
 
     def test_index_hash_op(self):
         data = np.random.randint(low=0, high=1000, size=(4, 4, 4))
@@ -873,7 +875,7 @@
             torch.tensor(data), seed=0, modulo=100
         )
 
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def test_bucketize_op(self):
         data = np.random.rand(8, 10).astype(np.float32) * 1000
@@ -889,7 +891,7 @@
 
         expected_output = _bucketize_ref(data)
         actual_output = torch.ops._caffe2.Bucketize(torch.tensor(data), boundaries)
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     @given(X=hu.tensor(), eps=st.floats(min_value=1e-4, max_value=1e-2))
     def test_logit(self, X, eps):
@@ -901,7 +903,7 @@
 
         expected_output = ref(X, eps)
         actual_output = torch.ops._caffe2.Logit(torch.tensor(X), eps)
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def test_percentile(self):
         original_values = np.array([[3.0, 5.0, 3], [5.0, 1.0, 6.0]]).astype(np.float32)
@@ -926,7 +928,7 @@
             torch.tensor(value_to_pct),
             torch.tensor(lengths),
         )
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def test_batch_bucket_one_hot_op(self):
         data = np.array([[2, 3], [4, 1], [2, 5]]).astype(np.float32)
@@ -947,7 +949,7 @@
         actual_output = torch.ops._caffe2.BatchBucketOneHot(
             torch.tensor(data), torch.tensor(lengths), torch.tensor(boundaries)
         )
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def test_gather_ranges_to_dense_op(self):
         data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
@@ -1033,8 +1035,8 @@
                 torch.tensor(values[1]),
             ]
         )
-        torch.testing.assert_allclose(expected_merged_lengths, output_merged_lengths)
-        torch.testing.assert_allclose(expected_merged_values, output_merged_values)
+        assert_allclose(expected_merged_lengths, output_merged_lengths)
+        assert_allclose(expected_merged_values, output_merged_values)
 
     def test_learning_rate(self):
         base_lr = 0.05
@@ -1097,7 +1099,7 @@
         packed_tensor, _ = torch.ops._caffe2.PackSegments(lengths, s)
         self.assertEqual(packed_tensor.numpy().shape, (2, 2, 3, 3))
         unpacked_tensor = torch.ops._caffe2.UnpackSegments(lengths, packed_tensor)
-        torch.testing.assert_allclose(s, unpacked_tensor)
+        assert_allclose(s, unpacked_tensor)
 
 
 if __name__ == "__main__":
diff --git a/docs/source/fx.rst b/docs/source/fx.rst
index 664fee1..29d73b3 100644
--- a/docs/source/fx.rst
+++ b/docs/source/fx.rst
@@ -1039,7 +1039,7 @@
         traced.eval()
 
         x = torch.randn(5, 3)
-        torch.testing.assert_allclose(traced(x), x)
+        torch.testing.assert_close(traced(x), x)
         """
         AssertionError: Tensor-likes are not close!
 
@@ -1071,7 +1071,7 @@
         traced.eval()
 
         x = torch.randn(5, 3)
-        torch.testing.assert_allclose(traced(x), x)
+        torch.testing.assert_close(traced(x), x)
 
   - Because of this difference, consider marking modules that interact with the ``training`` flag dynamically as leaf modules.
 
diff --git a/functorch/benchmarks/operator_authoring.py b/functorch/benchmarks/operator_authoring.py
index 88e558b..cbd816e 100644
--- a/functorch/benchmarks/operator_authoring.py
+++ b/functorch/benchmarks/operator_authoring.py
@@ -77,7 +77,7 @@
         assert result_nnc.dtype == result_aten.dtype
         assert result_nnc.size() == result_aten.size()
         assert result_nnc.stride() == result_aten.stride()
-        torch.testing.assert_allclose(result_aten, result_nnc)
+        torch.testing.assert_close(result_aten, result_nnc)
         return (lambda: nnc(*args), lambda: aten(*args))
 
     return benchmark_loop(setup)
@@ -90,7 +90,7 @@
         result_nnc = torch.clone(a)
         nnc(result_nnc, b, out=result_nnc)
         aten(result_aten, b, out=result_aten)
-        torch.testing.assert_allclose(result_aten, result_nnc)
+        torch.testing.assert_close(result_aten, result_nnc)
         return (lambda: nnc(a, b, out=a), lambda: aten(a, b, out=a))
 
     return benchmark_loop(inplace_setup)
@@ -103,7 +103,7 @@
         result_nnc = out(n)
         aten(*args, out=result_aten)
         nnc(*args, out=result_nnc)
-        torch.testing.assert_allclose(result_aten, result_nnc)
+        torch.testing.assert_close(result_aten, result_nnc)
         result = out(n)
         return (lambda: nnc(*args, out=result), lambda: aten(*args, out=result))
 
@@ -118,7 +118,7 @@
         correct = grad_var.grad.clone()
         grad_var.grad.zero_()
         nnc(*args).sum().backward()
-        torch.testing.assert_allclose(correct, grad_var.grad)
+        torch.testing.assert_close(correct, grad_var.grad)
         return (
             lambda: nnc(*args).sum().backward(),
             lambda: aten(*args).sum().backward(),
diff --git a/functorch/benchmarks/pointwise_scorecard.py b/functorch/benchmarks/pointwise_scorecard.py
index ac4cf5f..15863dc 100644
--- a/functorch/benchmarks/pointwise_scorecard.py
+++ b/functorch/benchmarks/pointwise_scorecard.py
@@ -195,13 +195,13 @@
         if shape == medium_transpose:
             raise RuntimeError("pointwise_operator hangs on medium_transpose")
         pw_op = pointwise_operator(operator)
-        torch.testing.assert_allclose(operator(*args), pw_op(*args))
+        torch.testing.assert_close(operator(*args), pw_op(*args))
     except Exception:
         print(f"pointwise_operator failed on {operator.__name__}, {shape.__name__}")
         nope.add((operator, shape))
 
     ts_op = torch.jit.script(operator)
-    torch.testing.assert_allclose(operator(*args), ts_op(*args))
+    torch.testing.assert_close(operator(*args), ts_op(*args))
 
 
 print("fuser,device,operator,shape,time")
diff --git a/functorch/examples/compilation/fuse_module.py b/functorch/examples/compilation/fuse_module.py
index dafbc80..ec091eb 100644
--- a/functorch/examples/compilation/fuse_module.py
+++ b/functorch/examples/compilation/fuse_module.py
@@ -36,7 +36,7 @@
 compiled_mod = compiled_module(mod, fw_compiler, bw_compiler)
 
 for a, b in zip(run(mod, input), run(compiled_mod, input)):
-    torch.testing.assert_allclose(a, b)
+    torch.testing.assert_close(a, b)
 
 out = mod(input)
 out.sum().backward()
@@ -45,7 +45,7 @@
 compiled_mod.orig_module.param.grad = None
 
 for a, b in zip(run(mod, input), run(compiled_mod, input)):
-    torch.testing.assert_allclose(a, b)
+    torch.testing.assert_close(a, b)
 
 for _ in range(5):
     i = 10000
diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
index ead934e..d3ea932 100644
--- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
+++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
@@ -124,7 +124,7 @@
         # Register hook case, get the hook grads.
         hook_grads = self._get_grads(process_group, DDPCommHookType.ALLREDUCE)
 
-        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0)
+        torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=0)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
@@ -141,7 +141,7 @@
         # Register hook case, get the hook grads.
         hook_grads = self._get_grads(process_group, DDPCommHookType.FP16_COMPRESS)
 
-        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
+        torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
@@ -158,7 +158,7 @@
         # Register hook case, get the hook grads.
         hook_grads = self._get_grads(process_group, DDPCommHookType.QUANTIZE_PER_TENSOR)
 
-        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
+        torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
@@ -177,7 +177,7 @@
             process_group, DDPCommHookType.QUANTIZE_PER_CHANNEL
         )
 
-        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
+        torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
 
 
     @requires_nccl()
@@ -198,7 +198,7 @@
         hook_grads.div_(self.world_size)
         dist.all_reduce(hook_grads, group=process_group)
 
-        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0)
+        torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=0)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index 234af4f..79ed6da 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -110,7 +110,7 @@
         def _check_equal(local, fsdp):
             with FSDP.summon_full_params(fsdp):
                 for p1, p2 in zip(fsdp.parameters(), local.parameters()):
-                    torch.testing.assert_allclose(p1, p2)
+                    torch.testing.assert_close(p1, p2)
 
         for sharding_strategy in [
             ShardingStrategy.FULL_SHARD,
diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py
index b6201d4..1270185 100644
--- a/test/distributions/test_distributions.py
+++ b/test/distributions/test_distributions.py
@@ -1421,7 +1421,7 @@
         # theoretical results.
         dist = Poisson(rate_zero)
         dist.log_prob(torch.ones_like(rate_zero)).backward()
-        torch.testing.assert_allclose(rate_zero.grad, torch.inf)
+        self.assertEqual(rate_zero.grad, torch.inf)
 
     @unittest.skipIf(not TEST_NUMPY, "Numpy not found")
     def test_poisson_sample(self):
diff --git a/test/functorch/functorch_additional_op_db.py b/test/functorch/functorch_additional_op_db.py
index b090121..6343e74 100644
--- a/test/functorch/functorch_additional_op_db.py
+++ b/test/functorch/functorch_additional_op_db.py
@@ -4,8 +4,7 @@
 
 import torch
 
-from torch.testing import \
-    (floating_types, floating_types_and, all_types_and_complex_and)
+from torch.testing._internal.common_dtype import floating_types, floating_types_and, all_types_and_complex_and
 from torch.testing._internal.common_utils import make_tensor
 from torch.testing._internal.common_methods_invocations import OpInfo, SampleInput, DecorateInfo
 
diff --git a/test/fx/test_fx_param_shape_control_flow.py b/test/fx/test_fx_param_shape_control_flow.py
index e9af35d..04db468 100644
--- a/test/fx/test_fx_param_shape_control_flow.py
+++ b/test/fx/test_fx_param_shape_control_flow.py
@@ -91,26 +91,26 @@
         performs both mm and relu ops in cascade
         """
         x = torch.randn(10, 5)
-        torch.testing.assert_allclose(mm_only_mod(x), torch.mm(x, mm_only_mod.get_mul_matrix()))
+        torch.testing.assert_close(mm_only_mod(x), torch.mm(x, mm_only_mod.get_mul_matrix()))
         tracer = torch.fx.Tracer(param_shapes_constant=True)
         traced_graph = tracer.trace(mm_only_mod)
 
         # verify the graph module calculates the same result
         graph_mod_mm = torch.fx.GraphModule(mm_only_mod, traced_graph)
-        torch.testing.assert_allclose(graph_mod_mm(x), torch.mm(x, mm_only_mod.get_mul_matrix()))
+        torch.testing.assert_close(graph_mod_mm(x), torch.mm(x, mm_only_mod.get_mul_matrix()))
 
 
         # Make a new module with different parameter shape to go down the different
         # code path
         x = torch.randn(10, 15)
-        torch.testing.assert_allclose(relu_mod(x), torch.relu(torch.mm(x, relu_mod.get_mul_matrix())))
+        torch.testing.assert_close(relu_mod(x), torch.relu(torch.mm(x, relu_mod.get_mul_matrix())))
 
         tracer2 = torch.fx.Tracer(param_shapes_constant=True)
         traced_graph2 = tracer2.trace(relu_mod)
 
         # verify the graph module calculates the same result
         graph_mod_relu = torch.fx.GraphModule(relu_mod, traced_graph2)
-        torch.testing.assert_allclose(graph_mod_relu(x), torch.relu(torch.mm(x, relu_mod.get_mul_matrix())))
+        torch.testing.assert_close(graph_mod_relu(x), torch.relu(torch.mm(x, relu_mod.get_mul_matrix())))
 
 
         graph1_node_targets = [n.target for n in traced_graph.nodes]
diff --git a/test/jit/test_misc.py b/test/jit/test_misc.py
index 98ec783..8a5d4ea 100644
--- a/test/jit/test_misc.py
+++ b/test/jit/test_misc.py
@@ -379,4 +379,4 @@
         expected = foo(x, 0)
         scripted = torch.jit.script(foo)
         actual = scripted(x, 0)
-        torch.testing.assert_allclose(expected, actual)
+        torch.testing.assert_close(expected, actual)
diff --git a/test/mobile/test_lite_script_type.py b/test/mobile/test_lite_script_type.py
index 9a778fb..8769a4b 100644
--- a/test/mobile/test_lite_script_type.py
+++ b/test/mobile/test_lite_script_type.py
@@ -28,7 +28,7 @@
         buffer.seek(0)
         mobile_module = _load_for_lite_interpreter(buffer)  # Error here
         mobile_module_result = mobile_module(sample_input).a
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_result,
             mobile_module_result
         )
@@ -91,7 +91,7 @@
         buffer_mobile.seek(0)
         mobile_module = _load_for_lite_interpreter(buffer_mobile)
         mobile_module_result = mobile_module(sample_input)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_result,
             mobile_module_result
         )
@@ -117,7 +117,7 @@
         buffer_mobile.seek(0)
         mobile_module = _load_for_lite_interpreter(buffer_mobile)
         mobile_module_result = mobile_module(sample_input)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_result,
             mobile_module_result
         )
@@ -136,7 +136,7 @@
         buffer_mobile.seek(0)
         mobile_module = _load_for_lite_interpreter(buffer_mobile)
         mobile_module_result = mobile_module(sample_input)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_result,
             mobile_module_result
         )
@@ -166,7 +166,7 @@
         buffer_mobile.seek(0)
         mobile_module = _load_for_lite_interpreter(buffer_mobile)
         mobile_module_result = mobile_module(sample_input)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_result.baz.di,
             mobile_module_result.baz.di
         )
diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index 35d3ba3..a204350 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -1461,7 +1461,7 @@
         X = torch.randn(5 , 10)
         quantized_X = X.to(torch.bfloat16)
         dedequantized_X = quantized_X.to(torch.float32)
-        torch.testing.assert_allclose(X, dedequantized_X, rtol=1e-4, atol=5e-3)
+        torch.testing.assert_close(X, dedequantized_X, rtol=1e-4, atol=5e-3)
 
     def test_decomposed_quantize(self):
         # register the ops
diff --git a/test/quantization/core/test_workflow_module.py b/test/quantization/core/test_workflow_module.py
index 7194872..6ac8bed 100644
--- a/test/quantization/core/test_workflow_module.py
+++ b/test/quantization/core/test_workflow_module.py
@@ -1011,11 +1011,11 @@
         )
 
         # Compare params with reference
-        torch.testing.assert_allclose(out, out_ref)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(out, out_ref)
+        torch.testing.assert_close(
             running_min_op, mod.activation_post_process.min_val
         )
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             running_max_op, mod.activation_post_process.max_val
         )
 
@@ -1066,11 +1066,11 @@
             )
 
             # Compare params with reference
-            torch.testing.assert_allclose(out, out_ref)
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(out, out_ref)
+            torch.testing.assert_close(
                 running_min_op, mod.activation_post_process.min_val
             )
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(
                 running_max_op, mod.activation_post_process.max_val
             )
 
@@ -1095,12 +1095,12 @@
             x = torch.randn(5, 5, device=device)
             out = mod(x)
             out_ref = mod_ref(x)
-            torch.testing.assert_allclose(out, out_ref)
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(out, out_ref)
+            torch.testing.assert_close(
                 mod_ref.activation_post_process.min_val,
                 mod.activation_post_process.min_val,
             )
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(
                 mod_ref.activation_post_process.max_val,
                 mod.activation_post_process.max_val,
             )
@@ -1151,20 +1151,20 @@
                     False,
                 )
                 # Compare params with reference
-                torch.testing.assert_allclose(out, out_ref)
+                torch.testing.assert_close(out, out_ref)
                 if mod.observer_enabled[0]:
-                    torch.testing.assert_allclose(
+                    torch.testing.assert_close(
                         running_min_op, mod.activation_post_process.min_val
                     )
-                    torch.testing.assert_allclose(
+                    torch.testing.assert_close(
                         running_max_op, mod.activation_post_process.max_val
                     )
                 if mod.fake_quant_enabled:
-                    torch.testing.assert_allclose(scale, mod.scale)
-                    torch.testing.assert_allclose(zero_point, mod.zero_point)
+                    torch.testing.assert_close(scale, mod.scale)
+                    torch.testing.assert_close(zero_point, mod.zero_point)
 
-            torch.testing.assert_allclose(mod.state_dict()['activation_post_process.min_val'], running_min_op)
-            torch.testing.assert_allclose(mod.state_dict()['activation_post_process.max_val'], running_max_op)
+            torch.testing.assert_close(mod.state_dict()['activation_post_process.min_val'], running_min_op)
+            torch.testing.assert_close(mod.state_dict()['activation_post_process.max_val'], running_max_op)
 
     def test_fused_mod_reduce_range(self):
         obs = FusedMovingAvgObsFakeQuantize(quant_min=0, quant_max=255, dtype=torch.quint8, reduce_range=True)
diff --git a/test/quantization/core/test_workflow_ops.py b/test/quantization/core/test_workflow_ops.py
index b459b58..a0687d8 100644
--- a/test/quantization/core/test_workflow_ops.py
+++ b/test/quantization/core/test_workflow_ops.py
@@ -1083,7 +1083,7 @@
 
             self.assertEqual(in_running_min_ref, in_running_min_op)
             self.assertEqual(in_running_max_ref, in_running_max_op)
-            torch.testing.assert_allclose(out, x_in)
+            torch.testing.assert_close(out, x_in)
 
         # Test empty input works
         x = torch.empty(0, 5, device=device)
@@ -1176,7 +1176,7 @@
                     x_in = x
                 self.assertEqual(in_running_min_ref, in_running_min_op)
                 self.assertEqual(in_running_max_ref, in_running_max_op)
-                torch.testing.assert_allclose(out, x_in)
+                torch.testing.assert_close(out, x_in)
 
     @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),)
     @settings(deadline=None)
@@ -1218,7 +1218,7 @@
             False,
         )
         # verify the output matches
-        torch.testing.assert_allclose(out, x_fake_quant)
+        torch.testing.assert_close(out, x_fake_quant)
 
         # verify the gradient matches expectation of fake_quant op
         dout = torch.rand_like(x, dtype=torch.float).to(device)
@@ -1264,7 +1264,7 @@
             False,
         )
         # verify the output matches
-        torch.testing.assert_allclose(out, x)
+        torch.testing.assert_close(out, x)
 
         # verify the gradient matches expectation of fake_quant op
         dout = torch.rand_like(x, dtype=torch.float).to(device)
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 04109ce..2bc1ed4 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -8157,7 +8157,7 @@
             inp = torch.randn(5, 5, device=device, requires_grad=True)
             out_ref = prepared_ref(inp)
             out = prepared(inp)
-            torch.testing.assert_allclose(out, out_ref)
+            torch.testing.assert_close(out, out_ref)
 
             # try backward pass
             labels = torch.randn(5, 5, device=device)
@@ -8165,7 +8165,7 @@
             grad = torch.autograd.grad(loss, [inp])
             loss_ref = (out_ref - labels).sum()
             grad_ref = torch.autograd.grad(loss_ref, [inp])
-            torch.testing.assert_allclose(grad[0], grad_ref[0])
+            torch.testing.assert_close(grad[0], grad_ref[0])
 
         if 'fbgemm' in torch.backends.quantized.supported_engines:
             # During the lowering step in convert, fold_weight calls quantized::linear_prepack
@@ -8178,7 +8178,7 @@
             out = converted(inp)
             out_ref = converted_ref(inp)
 
-            torch.testing.assert_allclose(out, out_ref)
+            torch.testing.assert_close(out, out_ref)
 if __name__ == '__main__':
     raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
                        "\tpython test/test_quantization.py TESTNAME\n\n"
diff --git a/test/test_fx.py b/test/test_fx.py
index eac58fb..9a46a50 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -234,7 +234,7 @@
         new_instance.__init__(gm3, gm3.graph)
 
         x = torch.randn(5, 3)
-        torch.testing.assert_allclose(new_instance(x), torch.relu(x))
+        torch.testing.assert_close(new_instance(x), torch.relu(x))
 
     def test_custom_import(self):
         graph = torch.fx.Graph()
@@ -809,7 +809,7 @@
         traced = torch.fx.symbolic_trace(ec)
 
         x = torch.randn(bs, d_hid)
-        torch.testing.assert_allclose(ec(x), traced(x))
+        torch.testing.assert_close(ec(x), traced(x))
 
 
     def test_node_tagging(self):
@@ -1126,7 +1126,7 @@
 
         traced = torch.fx.symbolic_trace(foo)
         x = (torch.randn(5, 3),)
-        torch.testing.assert_allclose(traced(x), x[0])
+        torch.testing.assert_close(traced(x), x[0])
 
         bio = io.BytesIO()
 
@@ -1136,7 +1136,7 @@
 
         loaded = torch.load(bio)
 
-        torch.testing.assert_allclose(loaded(x), x[0])
+        torch.testing.assert_close(loaded(x), x[0])
 
     def test_torch_fx_len(self):
         class FXLenTest(torch.nn.Module):
@@ -1806,7 +1806,7 @@
         interp = Interpreter(gm)
         x = torch.randn(5, 3)
         out = interp.run(x)
-        torch.testing.assert_allclose(out, x + 3.14159)
+        torch.testing.assert_close(out, x + 3.14159)
 
     def test_interpreter_not_enough_args(self):
         class Model(torch.nn.Module):
@@ -2315,8 +2315,8 @@
         traced1.recompile()
 
         x = torch.randn(15, 15)
-        torch.testing.assert_allclose(traced1(x), torch.relu(x))
-        torch.testing.assert_allclose(copied(x), torch.neg(x))
+        torch.testing.assert_close(traced1(x), torch.relu(x))
+        torch.testing.assert_close(copied(x), torch.neg(x))
 
     def test_direct_param_use(self):
         class TransposeTest(torch.nn.Module):
@@ -2699,7 +2699,7 @@
         replica = gm._replicate_for_data_parallel()
         out_replica = replica(x)
 
-        torch.testing.assert_allclose(out_replica, out)
+        torch.testing.assert_close(out_replica, out)
 
     def test_ast_rewriter_rewrites_assert(self):
         class M(torch.nn.Module):
@@ -3045,7 +3045,7 @@
         traced_graph = MyCustomTracer().trace(model)
         gm2 = torch.fx.GraphModule(model, traced_graph)
         gm2.delete_all_unused_submodules()
-        torch.testing.assert_allclose(gm2(inputs), model(inputs))
+        torch.testing.assert_close(gm2(inputs), model(inputs))
 
     def test_fx_stateless(self):
         class MockModule(torch.nn.Module):
diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py
index ae7a225..a8fc077 100644
--- a/test/test_fx_experimental.py
+++ b/test/test_fx_experimental.py
@@ -782,7 +782,7 @@
 
         x = torch.randn(5, 3)
         foo = torch.randn(5, 3)
-        torch.testing.assert_allclose(split(x, foo=foo), traced(x, foo=foo))
+        torch.testing.assert_close(split(x, foo=foo), traced(x, foo=foo))
 
     @skipIfNoTorchVision
     def test_subgraph_trivial_resnet(self):
@@ -814,7 +814,7 @@
         split = split_module(traced, mtt, lambda node: 0)
 
         x = torch.randn(50, 512)
-        torch.testing.assert_allclose(split(x), traced(x))
+        torch.testing.assert_close(split(x), traced(x))
 
     def test_normalize_binary_operators(self):
         ops_to_test = {
diff --git a/test/test_jit_autocast.py b/test/test_jit_autocast.py
index 93674bb..d311eb6 100644
--- a/test/test_jit_autocast.py
+++ b/test/test_jit_autocast.py
@@ -797,7 +797,7 @@
                 y = traced_model(x.clone())
             with torch.cpu.amp.autocast(), torch.no_grad():
                 y2 = model(x.clone())
-            torch.testing.assert_allclose(y.double(), y2.double(), rtol=1e-03, atol=1e-03)
+            torch.testing.assert_close(y.double(), y2.double(), rtol=1e-03, atol=1e-03)
         for i in range(self.models.__len__()):
             test_nchw_autocast_jit_trace_model(self.models[i], self.inputs[i])
 
@@ -812,7 +812,7 @@
                 y = traced_model(x.clone().to(memory_format=torch.channels_last))
             with torch.cpu.amp.autocast(), torch.no_grad():
                 y2 = model(x.clone().to(memory_format=torch.channels_last))
-            torch.testing.assert_allclose(y.double(), y2.double(), rtol=1e-03, atol=1e-03)
+            torch.testing.assert_close(y.double(), y2.double(), rtol=1e-03, atol=1e-03)
         for i in range(self.models.__len__()):
             if self.inputs[i].size().__len__() == 5:
                 # NHWC 3D case not support yet
diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index e1c820f..1958529 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -2202,7 +2202,9 @@
         def test(fn, args):
             trace = torch.jit.trace(fn, args)
             self.assertAllFused(trace.graph_for(*args))
-            torch.testing.assert_allclose(fn(*args), trace(*args))
+            # TODO: Are `NaN`s actually OK here, or did this pass silently before because `equal_nan=True` was the
+            #  default?
+            torch.testing.assert_close(fn(*args), trace(*args), equal_nan=True)
 
         def bn(i, x):
             return torch.batch_norm(i, x, x, x, x, False, 0.1, 1e-4, False).relu()
diff --git a/test/test_module_init.py b/test/test_module_init.py
index dc05a95..98dcb3e 100644
--- a/test/test_module_init.py
+++ b/test/test_module_init.py
@@ -4,7 +4,7 @@
 import torch
 from unittest import mock
 from unittest.mock import MagicMock, patch
-from torch.testing import floating_types
+from torch.testing._internal.common_dtype import floating_types
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes
 from torch.testing._internal.common_quantization import skipIfNoFBGEMM
 from torch.testing._internal.common_utils import TestCase, run_tests
diff --git a/test/test_mps.py b/test/test_mps.py
index c89a9d2..03410e2 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -65,7 +65,7 @@
         return np.maximum(np_features, np.zeros(np_features.shape)).astype(np_features.dtype)
 
     def testNpRelu(self):
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             np.array([[0., 0.7, 0.0, 0.3, 0.0], [0.1, 0.0, 0.5, 0.0, 0.9]]),
             self._npRelu(
                 np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
@@ -79,7 +79,7 @@
         py_relu = torch.nn.ReLU(inplace=False)(py_tensor)
         py_relu_cpu = py_relu.to("cpu")
 
-        torch.testing.assert_allclose(np_relu, py_relu_cpu)
+        self.assertEqual(np_relu, py_relu_cpu)
 
     def _testReluInPlace(self, np_features, device):
         np_relu = self._npRelu(np_features)
@@ -89,9 +89,9 @@
         py_relu = torch.nn.ReLU(inplace=True)(py_tensor)
         py_relu_cpu = py_relu.to("cpu")
 
-        torch.testing.assert_allclose(np_relu, py_relu_cpu)
+        self.assertEqual(np_relu, py_relu_cpu)
         # Inplace Relu modifies the initial input and it should match the output of Relu
-        torch.testing.assert_allclose(np_relu, py_tensor.to("cpu"))
+        self.assertEqual(np_relu, py_tensor.to("cpu"))
 
     def testNumbersCPU(self):
         for t in [np.int32]:
@@ -156,7 +156,7 @@
         return np.maximum(np_features, negative_slope * np_features).astype(np_features.dtype)
 
     def testNpLeakyRelu(self):
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             np.array([[-0.09, 0.7, -0.05, 0.3, -0.01],
                       [0.1, -0.03, 0.5, -0.07, 0.9]]),
             self._npLeakyRelu(
@@ -171,14 +171,14 @@
 
         cpu_leaky_relu = relu_op(cpu_x)
         mps_leaky_relu = relu_op(mps_x)
-        torch.testing.assert_allclose(cpu_leaky_relu, mps_leaky_relu.to('cpu'))
+        torch.testing.assert_close(cpu_leaky_relu, mps_leaky_relu.to('cpu'))
 
         # test backward pass
         cpu_grad = torch.ones_like(cpu_leaky_relu)
         mps_grad = cpu_grad.to('mps')
         cpu_leaky_relu.backward(gradient=cpu_grad)
         mps_leaky_relu.backward(gradient=mps_grad)
-        torch.testing.assert_allclose(cpu_x.grad, mps_x.grad.to('cpu'))
+        torch.testing.assert_close(cpu_x.grad, mps_x.grad.to('cpu'))
 
     def testNumbersCPU(self):
         for t in [np.float32]:
@@ -257,14 +257,14 @@
 
         cpu_leaky_relu = relu_op(cpu_x)
         mps_leaky_relu = relu_op(mps_x)
-        torch.testing.assert_allclose(cpu_leaky_relu, mps_leaky_relu.to('cpu'))
+        torch.testing.assert_close(cpu_leaky_relu, mps_leaky_relu.to('cpu'))
 
         # test backward pass
         cpu_grad = torch.ones_like(cpu_leaky_relu)
         mps_grad = cpu_grad.to('mps')
         cpu_leaky_relu.backward(gradient=cpu_grad)
         mps_leaky_relu.backward(gradient=mps_grad)
-        torch.testing.assert_allclose(cpu_x.grad, mps_x.grad.to('cpu'))
+        torch.testing.assert_close(cpu_x.grad, mps_x.grad.to('cpu'))
 
     def testNumbersGPU(self):
         for t in [np.float32]:
@@ -293,14 +293,14 @@
         B = torch.ones(5, 6).to("mps")
         C = torch.ones(6, 5).to("mps")
         D = torch.mm(B, C).cpu()
-        torch.testing.assert_allclose(D, torch.full((5, 5), 6.0))
+        torch.testing.assert_close(D, torch.full((5, 5), 6.0))
 
     def test_addmm(self):
         A = torch.ones(5, 5).to("mps")
         B = torch.ones(5, 6).to("mps")
         C = torch.ones(6, 5).to("mps")
         D = torch.addmm(A, B, C).to("cpu")
-        torch.testing.assert_allclose(D, torch.full((5, 5), 7.0))
+        torch.testing.assert_close(D, torch.full((5, 5), 7.0))
 
     def test_bmm(self):
         batch1_cpu = torch.randn(10, 3, 4)
@@ -355,7 +355,7 @@
     def test_local_scalar_dense_mps(self):
         x_cpu = torch.randn(1)
         y_mps = x_cpu.to("mps")
-        torch.testing.assert_allclose(x_cpu.item(), y_mps.item())
+        torch.testing.assert_close(x_cpu.item(), y_mps.item())
 
     def test_linear_1d_weight(self):
         device = 'cpu'
diff --git a/test/test_nn.py b/test/test_nn.py
index cab9db7..13036ef 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -4907,7 +4907,7 @@
                 n, k = k, n
             Id = torch.eye(k, dtype=X.dtype, device=X.device).expand(*(X.size()[:-2]), k, k)
             eps = 10 * n * torch.finfo(X.dtype).eps
-            torch.testing.assert_allclose(X.mH @ X, Id, atol=eps, rtol=0.)
+            torch.testing.assert_close(X.mH @ X, Id, atol=eps, rtol=0.)
 
 
         def assert_weight_allclose_Q(weight, W):
@@ -4920,7 +4920,7 @@
             Q *= R.diagonal(dim1=-2, dim2=-1).sgn().unsqueeze(-2)
             if wide_matrix:
                 Q = Q.mT
-            torch.testing.assert_allclose(Q, weight, atol=1e-5, rtol=0.)
+            torch.testing.assert_close(Q, weight, atol=1e-5, rtol=0.)
 
 
         for shape, dtype, use_linear in product(((4, 4), (5, 3), (3, 5)),  # square/ tall / wide
@@ -4979,7 +4979,7 @@
                     w_new = w_new.mT
                 if can_initialize:
                     m.weight = w_new
-                    torch.testing.assert_allclose(w_new, m.weight, atol=1e-5, rtol=0.)
+                    torch.testing.assert_close(w_new, m.weight, atol=1e-5, rtol=0.)
                 else:
                     msg = "assign to the matrix exponential or the Cayley parametrization"
                     with self.assertRaisesRegex(NotImplementedError, msg):
diff --git a/test/test_optim.py b/test/test_optim.py
index a55a74d..e611f75 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -1650,7 +1650,7 @@
         new_scheduler = CosineAnnealingLR(
             self.opt, T_max=T_max, eta_min=eta_min, last_epoch=0)
         new_lrs = new_scheduler._last_lr
-        torch.testing.assert_allclose(original_lrs, new_lrs, rtol=1e-4, atol=1e-5)
+        torch.testing.assert_close(original_lrs, new_lrs, rtol=1e-4, atol=1e-5)
 
     def test_reduce_lr_on_plateau1(self):
         epochs = 10
diff --git a/test/test_torch.py b/test/test_torch.py
index f84d8af..b507f68 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -37,7 +37,7 @@
     skipCUDAMemoryLeakCheckIf, BytesIOContext,
     skipIfRocm, skipIfNoSciPy, TemporaryFileName, TemporaryDirectoryName,
     wrapDeterministicFlagAPITest, DeterministicGuard, CudaSyncGuard,
-    skipIfNotRegistered, bytes_to_scalar, parametrize, skipIfMps)
+    skipIfNotRegistered, bytes_to_scalar, parametrize, skipIfMps, noncontiguous_like)
 from multiprocessing.reduction import ForkingPickler
 from torch.testing._internal.common_device_type import (
     expectedFailureMeta,
@@ -2959,10 +2959,9 @@
                     dest = make_tensor(size, device=device, dtype=dtype, noncontiguous=dest_noncontig)
                     src_size = size[:dim] + (num_src,) + size[dim + 1:]
                     src = make_tensor(src_size, device=device, dtype=dtype, noncontiguous=src_noncontig)
-                    idx = torch.randint(num_dest, (num_src,), dtype=idx_dtype, device=device)
-                    if index_noncontig:
-                        # noncontiguous_like fails with RuntimeError: XLA tensors do not have storage
-                        idx = torch.testing.make_non_contiguous(idx)
+                    idx = torch.testing.make_tensor(
+                        num_src, low=0, high=num_dest, dtype=idx_dtype, device=device, noncontiguous=index_noncontig
+                    )
                     expected = dest.clone()
                     dest.index_reduce_(dim, idx, src, reduce, include_self=include_self)
                     # fill rows in idx with reduction inits if include_self=False
@@ -5588,10 +5587,10 @@
                             dest = make_tensor(dest.shape, device=device, dtype=dest.dtype, noncontiguous=True)
                         src = torch.randn(num_copy, *other_sizes, device=device)
                         if not src_contig:
-                            src = torch.testing.make_non_contiguous(src)
+                            src = noncontiguous_like(src)
                         idx = torch.randperm(num_dest, dtype=dtype, device=device).narrow(0, 0, num_copy)
                         if not index_contig:
-                            idx = torch.testing.make_non_contiguous(idx)
+                            idx = noncontiguous_like(idx)
                         # index_add_ without alpha argument
                         dest2 = dest.clone()
                         dest.index_add_(0, idx, src)
diff --git a/torch/fx/OVERVIEW.md b/torch/fx/OVERVIEW.md
index f2995eb..9c07070 100644
--- a/torch/fx/OVERVIEW.md
+++ b/torch/fx/OVERVIEW.md
@@ -61,7 +61,7 @@
 symbolic_traced : torch.fx.GraphModule = symbolic_trace(module)
 
 input = torch.rand(3, 4)
-torch.testing.assert_allclose(symbolic_traced(input), module(input))
+torch.testing.assert_close(symbolic_traced(input), module(input))
 ```
 
 Here, we set up a simple Module that exercises different language features: fetching a parameter, applying an arithmetic operator, applying a submodule (linear), and applying a Tensor method. `symbolic_trace` returns an instance of GraphModule, which is in itself a subclass of `nn.Module`. We can see that the `symbolic_traced` instance runs and returns the same result as the original module instance module.
diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index 95218bf..6428d4c 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -57,7 +57,7 @@
             gm = torch.fx.symbolic_trace(fn)
             input = torch.randn(3, 4)
             result = NegSigmSwapInterpreter(gm).run(input)
-            torch.testing.assert_allclose(result, torch.neg(input).sigmoid())
+            torch.testing.assert_close(result, torch.neg(input).sigmoid())
 
     Args:
         module (GraphModule): The module to be executed
@@ -395,7 +395,7 @@
 
             transformed : torch.nn.Module = NegSigmSwapXformer(gm).transform()
             input = torch.randn(3, 4)
-            torch.testing.assert_allclose(transformed(input), torch.neg(input).sigmoid())
+            torch.testing.assert_close(transformed(input), torch.neg(input).sigmoid())
 
     Args:
         module (GraphModule): The ``Module`` to be transformed.
diff --git a/torch/jit/_freeze.py b/torch/jit/_freeze.py
index 47da766..af0a132 100644
--- a/torch/jit/_freeze.py
+++ b/torch/jit/_freeze.py
@@ -136,9 +136,9 @@
         mod (:class:`ScriptModule`): a frozen module to be optimized
 
         optimize_numerics (bool): If ``True``, a set of optimization passes will be run that does not strictly
-        preserve numerics. These optimizations preserve default rtol and atol of `torch.testing.assert_allclose`
+        preserve numerics. These optimizations preserve default rtol and atol of `torch.testing.assert_close`
         when applied on a single transformation, however in a module where many transformations are applied
-        the rtol or atol may no longer fall within the default `assert_allclose` tolerance. Conv -> Batchnorm folding,
+        the rtol or atol may no longer fall within the default `assert_close` tolerance. Conv -> Batchnorm folding,
         Conv-Add/Sub, and Conv -> Mul/Div folding all may alter numerics.
 
     Returns:
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 616c1c9..0dca22f 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -1033,7 +1033,9 @@
                 self.assertEqual(param.device, cpu_device)
             fsdp_loss = fsdp_loss.cuda()
         fsdp_unsharded_params = get_full_params(fsdp_model)
-        torch.testing.assert_allclose(ref_loss, fsdp_loss)
+        # TODO: Are mismatching dtypes actually OK here, or did this pass silently before because `check_dtype=False`
+        #  was the default?
+        torch.testing.assert_close(ref_loss, fsdp_loss, check_dtype=False)
         # Do not check for parameter parity if using mixed precision since (1)
         # the DDP parameters are in FP16 (from `half()`) while the FSDP
         # parameters are in FP32 (from `summon_full_params()`) and (2) DDP runs
diff --git a/torch/testing/_internal/opinfo/definitions/signal.py b/torch/testing/_internal/opinfo/definitions/signal.py
index 9404cc8..c915e93 100644
--- a/torch/testing/_internal/opinfo/definitions/signal.py
+++ b/torch/testing/_internal/opinfo/definitions/signal.py
@@ -7,6 +7,7 @@
 import numpy
 
 import torch
+from torch.testing._internal.common_dtype import floating_types_and
 from torch.testing._internal.common_utils import TEST_SCIPY
 from torch.testing._internal.opinfo.core import (
     DecorateInfo,
@@ -14,7 +15,6 @@
     OpInfo,
     SampleInput,
 )
-from torch.testing._legacy import floating_types_and
 
 if TEST_SCIPY:
     import scipy.signal