test/functorch/test_ops.py - platform/external/pytorch - Git at Google

 # Owner(s): ["module: functorch"]

 # Copyright (c) Facebook, Inc. and its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

 import functools
 import itertools
 import unittest

 from common_utils import (
     check_vmap_fallback,
     decorate,
     expectedFailureIf,
     generate_vmap_inputs,
     get_fallback_and_vmap_exhaustive,
     is_batch_norm_training,
     is_valid_inplace_sample_input,
     loop,
     loop2,
     opsToleranceOverride,
     skip,
     skipOps,
     tol1,
     tol2,
     xfail,
 )
 from functorch_additional_op_db import additional_op_db

 import torch
 import torch.autograd.forward_ad as fwAD
 from functorch import grad, jacfwd, jacrev, vjp, vmap
 from torch import Tensor
 from torch._functorch.eager_transforms import _as_tuple, jvp
 from torch.testing._internal.autograd_function_db import autograd_function_db
 from torch.testing._internal.common_cuda import with_tf32_off
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
     ops,
     tol,
     toleranceOverride,
 )
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal.common_utils import (
     is_iterable_of_tensors,
     IS_MACOS,
     IS_X86,
     noncontiguous_like,
     parametrize,
     run_tests,
     runOnRocm,
     skipIfRocm,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
     TestCase,
     unMarkDynamoStrictTest,
 )
 from torch.testing._internal.opinfo.core import SampleInput
 from torch.utils import _pytree as pytree
 from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten


 aten = torch.ops.aten


 # Version of autograd.grad with some differences:
 #   - pytree inputs is allowed (but leaves of the pytree have to all
 #     be tensors)
 #   - if an input is not used as part of derivatives, we will return a
 #     zero-filled tensor for the result
 def _autograd_grad(
     outputs, inputs, grad_outputs=None, retain_graph=False, create_graph=True
 ):
     inputs, inputs_spec = tree_flatten(inputs)
     diff_inputs = tuple(inp for inp in inputs if inp.requires_grad)
     if grad_outputs is None:
         diff_outputs = tuple(out for out in outputs if out.requires_grad)
     else:
         diff_grad_outputs = [
             (out, go) for out, go in zip(outputs, grad_outputs) if out.requires_grad
         ]
         if len(diff_grad_outputs) == 0:
             diff_outputs, grad_outputs = (), ()
         else:
             diff_outputs, grad_outputs = zip(*diff_grad_outputs)
     grad_inputs = torch.autograd.grad(
         diff_outputs,
         diff_inputs,
         grad_outputs,
         retain_graph=retain_graph,
         create_graph=create_graph,
         allow_unused=True,
     )
     result = []
     grad_inputs_iter = iter(grad_inputs)
     for inp in inputs:
         if inp.requires_grad:
             grad_input = next(grad_inputs_iter)
             if grad_input is None:
                 result.append(torch.zeros_like(inp))
             else:
                 result.append(grad_input)
         else:
             result.append(torch.zeros_like(inp))
     return tree_unflatten(result, inputs_spec)


 def diff_arg(arg, requires_grad=True):
     def is_differentiable_arg(arg):
         if requires_grad:
             return arg.requires_grad
         else:
             return arg.is_floating_point() or arg.is_complex()

     if is_iterable_of_tensors(arg):
         if all(is_differentiable_arg(a) for a in arg):
             return True
         if all(not is_differentiable_arg(a) for a in arg):
             return False
         raise RuntimeError("NYI: The test runner can't handle this")
     return isinstance(arg, Tensor) and is_differentiable_arg(arg)


 # Given f, returns an f' such that:
 # - f' takes only positional arguments
 # - All arguments to f' are floating-point Tensors
 # - All outputs of f' are floating-point Tensors
 def normalize_op_input_output2(
     f, args, kwargs, output_process_fn_grad=None, requires_grad=True
 ):
     flat_args, args_spec = tree_flatten(args)
     diff_argnums = tuple(
         i
         for i, arg in enumerate(flat_args)
         if diff_arg(arg, requires_grad=requires_grad)
     )
     assert len(diff_argnums) > 0
     primals = tuple(flat_args[i] for i in diff_argnums)

     @functools.wraps(f)
     def wrapped(*primals):
         _args = list(flat_args)
         for num, arg in zip(diff_argnums, primals):
             _args[num] = arg
         _args = tree_unflatten(_args, args_spec)
         result = f(*_args, **kwargs)
         if output_process_fn_grad is not None:
             result = output_process_fn_grad(result)
         if isinstance(result, tuple):
             result = tuple(r for r in result if torch.is_floating_point(r))
             assert len(result) > 0
         return result

     return wrapped, primals


 # TODO: consolidate with normalize_op_input_output2
 def normalize_op_input_output3(
     f, args, kwargs, sample_args, output_process_fn_grad=None
 ):
     flat_args, args_spec = tree_flatten(args)
     flat_sample_args = pytree.tree_leaves(sample_args)
     diff_argnums = tuple(
         i
         for i, (arg, sample) in enumerate(zip(flat_args, flat_sample_args))
         if diff_arg(sample, requires_grad=True)
     )
     assert len(diff_argnums) > 0
     primals = tuple(flat_args[i] for i in diff_argnums)

     @functools.wraps(f)
     def wrapped(*primals):
         _args = list(flat_args)
         for num, arg in zip(diff_argnums, primals):
             _args[num] = arg
         _args = tree_unflatten(_args, args_spec)
         result = f(*_args, **kwargs)
         if output_process_fn_grad is not None:
             result = output_process_fn_grad(result)
         if isinstance(result, tuple):
             result = tuple(r for r in result if torch.is_floating_point(r))
             assert len(result) > 0
         return result

     return wrapped, primals


 def normalize_op_input_output(f, sample, requires_grad=True):
     args = tuple([sample.input] + list(sample.args))
     return normalize_op_input_output2(
         f,
         args,
         sample.kwargs,
         sample.output_process_fn_grad,
         requires_grad=requires_grad,
     )


 def ref_vjp(f, *primals):
     result = f(*primals)

     def wrapped(cotangents):
         return _autograd_grad(_as_tuple(result), primals, _as_tuple(cotangents))

     return result, wrapped


 def simulate_jvp(f, primals, tangents):
     primals_out, tangents_out = torch.autograd.functional.jvp(f, primals, tangents)
     return primals_out, tangents_out


 def ref_jvp(f, primals, tangents):
     with fwAD.dual_level():
         duals = tuple(fwAD.make_dual(p, t) for p, t in zip(primals, tangents))
         result_duals = f(*duals)
         result_duals, spec = tree_flatten(result_duals)
         primals_out, tangents_out = zip(*(fwAD.unpack_dual(d) for d in result_duals))
         return tree_unflatten(primals_out, spec), tree_unflatten(tangents_out, spec)


 def get_sample_cotangents(f, sample):
     fn, primals = normalize_op_input_output(f, sample)
     output = fn(*primals)
     return tree_map(torch.randn_like, output)


 # returns a new function g(*args, *cotangents)
 # that computes vjps and (*args, cotangents)
 def get_vjp_fn_and_args_with_cotangents(f, sample, cotangents):
     args = tuple([sample.input] + list(sample.args))
     kwargs = sample.kwargs
     flat_args, args_spec = tree_flatten(args)
     flat_cotangents, cotangents_spec = tree_flatten(cotangents)

     @functools.wraps(f)
     def wrapped(*args):
         assert len(args) == len(flat_args) + len(flat_cotangents)
         actual_args = args[: len(flat_args)]
         cotangents = args[len(flat_args) :]
         actual_args = tree_unflatten(actual_args, args_spec)
         cotangents = tree_unflatten(cotangents, cotangents_spec)

         fn, primals = normalize_op_input_output3(
             f, actual_args, kwargs, flat_args, sample.output_process_fn_grad
         )
         _, vjp_fn = vjp(fn, *primals)
         return vjp_fn(cotangents)

     return wrapped, tuple(flat_args + flat_cotangents)


 # Returns a new function g(*args, *cotangents) that computes vjps and
 # sample (*args, *cotangents)
 def get_vjpfull_variant(f, sample):
     fn, primals = normalize_op_input_output(f, sample)
     return _get_vjpfull_variant(fn, primals)


 def get_vjpfull_variant2(f, args, kwargs):
     fn, primals = normalize_op_input_output2(f, args, kwargs)
     return _get_vjpfull_variant(fn, primals)


 def _get_vjpfull_variant(fn, primals):
     result = fn(*primals)
     cotangents = _as_tuple(
         tree_map(lambda x: torch.randn_like(x, requires_grad=True), result)
     )
     num_primals = len(primals)
     args = (*primals, *cotangents)

     @functools.wraps(fn)
     def wrapped(*args):
         primals = args[:num_primals]
         cotangents = args[num_primals:]
         result, vjp_fn = vjp(fn, *primals)
         if isinstance(result, torch.Tensor):
             assert len(cotangents) == 1
             cotangents = cotangents[0]
         return vjp_fn(cotangents)

     return wrapped, args


 def get_jvp_variant(f, sample):
     # We want this higher-order variant of jvp, so that it can
     # be used to wrap vmap
     fn, primals = normalize_op_input_output(f, sample, requires_grad=False)
     tangents = _as_tuple(tree_map(lambda x: torch.randn_like(x), primals))

     @functools.wraps(f)
     def wrapped(*args):
         tangents = args
         primals_out, tangents_out = jvp(fn, primals, tangents)

         if isinstance(primals_out, torch.Tensor):
             return (primals_out, tangents_out)
         else:
             flat_primals_out = pytree.tree_leaves(primals_out)
             flat_tangents_out = pytree.tree_leaves(tangents_out)
             return tuple(flat_primals_out + flat_tangents_out)

     return wrapped, tangents


 def get_jvp_variant_primals_tangents2(
     f, args, kwargs, output_process_fn_grad=None, requires_grad=False
 ):
     fn, primals = normalize_op_input_output2(
         f, args, kwargs, output_process_fn_grad, requires_grad
     )
     tangents = _as_tuple(tree_map(lambda x: torch.randn_like(x), primals))
     return _get_jvp_variant(fn, primals, tangents)


 def get_jvp_variant_primals_tangents(f, sample):
     # We want this higher-order variant of jvp, so that it can
     # be used to wrap vmap
     fn, primals = normalize_op_input_output(f, sample, requires_grad=False)
     tangents = _as_tuple(tree_map(lambda x: torch.randn_like(x), primals))
     return _get_jvp_variant(fn, primals, tangents)


 def _get_jvp_variant(fn, primals, tangents):
     @functools.wraps(fn)
     def wrapped(*args):
         primals_in = args[: len(primals)]
         tangents_in = args[len(primals) :]
         primals_out, tangents_out = jvp(fn, primals_in, tangents_in)

         if isinstance(primals_out, torch.Tensor):
             return (primals_out, tangents_out)
         else:
             flat_primals_out = pytree.tree_leaves(primals_out)
             flat_tangents_out = pytree.tree_leaves(tangents_out)
             return tuple(flat_primals_out + flat_tangents_out)

     return wrapped, primals + tangents


 def is_inplace(op, variant):
     if hasattr(variant, "__wrapped__"):
         return variant.__wrapped__ is op.get_inplace()
     return variant is op.get_inplace()


 vjp_fail = {
     xfail("tensor_split"),  # data_ptr composite compliance
     # Very minor accuracy issue on ROCm
     decorate("nn.functional.scaled_dot_product_attention", decorator=skipIfRocm),
 }

 aliasing_ops = {
     "T",
     "broadcast_to",
     "conj",
     "contiguous",
     "diagonal",  # linalg.diagonal is an alias
     "expand",
     "flatten",
     "imag",
     "mH",  # adjoint is an alias
     "mT",
     "movedim",  # moveaxis is an alias
     "narrow",
     "permute",
     "positive",
     # 'ravel', is composite implicit autograd and may call clone
     "real",
     "reshape",
     "resolve_conj",
     "resolve_neg",
     "select",
     "squeeze",
     "transpose",  # swapdims and swapaxes are aliases
     "unflatten",
     "unfold",
     "unsqueeze",
     "view",
     "view_as",
     "view_as_complex",
     "view_as_real",
 }

 aliasing_ops_list_return = {
     "chunks",
     "dsplit",
     "hsplit",
     "split",
     "unbind",
     "vsplit",
     # 'tensor_split' not composite compliant, see vjp_fail
 }

 skip_noncontig = {
     "_batch_norm_with_update",
     "as_strided_copy",
 }


 @unittest.skipIf(TEST_WITH_ASAN, "tests time out with asan, are probably redundant")
 @unMarkDynamoStrictTest
 class TestOperators(TestCase):
     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps(
         "TestOperators",
         "test_grad",
         vjp_fail.union(
             {
                 xfail(
                     "chalf", "", device_type="cpu"
                 ),  # RuntimeError: "sum_cpu" not implemented for 'ComplexHalf'
                 xfail(
                     "sparse.sampled_addmm", ""
                 ),  # RuntimeError: Sparse CSR tensors do not have strides
                 xfail(
                     "sparse.mm", "reduce"
                 ),  # RuntimeError: Sparse CSR tensors do not have strides
                 # Non-contiguous Bugs
                 #
                 # AssertionError: Tensor-likes are not close!
                 xfail("_softmax_backward_data", device_type="cpu"),
                 xfail("as_strided"),
                 xfail("as_strided", "partial_views"),
                 # RuntimeError: !self.requires_grad() || self.is_contiguous()
                 xfail("as_strided_scatter"),
                 # RuntimeError: Tensor must have a last dimension with stride 1
                 xfail("view_as_complex"),
                 # query: last dimension must be contiguous
                 # Fused attention kernels require last dim to be contiguous
                 decorate(
                     "nn.functional.scaled_dot_product_attention",
                     decorator=expectedFailureIf(not TEST_WITH_ROCM),
                 ),  # Works on ROCm
                 xfail("torch.ops.aten._flash_attention_forward"),
                 xfail("torch.ops.aten._efficient_attention_forward"),
                 # RuntimeError: Expected contiguous tensor, but got
                 # non-contiguous tensor for argument #2 'grad_output'
                 decorate(
                     "_batch_norm_with_update",
                     decorator=expectedFailureIf(TEST_WITH_ROCM),
                     device_type="cuda",
                 ),
             }
         ),
     )
     @opsToleranceOverride(
         "TestOperators",
         "test_grad",
         (
             tol1(
                 "nn.functional.binary_cross_entropy_with_logits",
                 {torch.float32: tol(atol=1e-04, rtol=1e-04)},
             ),
             tol1("masked.cumprod", {torch.float32: tol(atol=1e-05, rtol=1e-05)}),
             tol1("svd_lowrank", {torch.float32: tol(atol=3e-04, rtol=3e-04)}),
             tol1(
                 "linalg.multi_dot",
                 {torch.float32: tol(atol=1e-05, rtol=8e-04)},
                 device_type="cuda",
             ),
             tol1(
                 "linalg.tensorsolve",
                 {torch.float32: tol(atol=3e-04, rtol=3e-04)},
                 device_type="cuda",
             ),
             tol1(
                 "nn.functional.multi_head_attention_forward",
                 {torch.float32: tol(atol=8e-04, rtol=1e-03)},
             ),
             tol1(
                 "__rmatmul__",
                 {torch.float32: tol(atol=3e-04, rtol=3e-04)},
                 device_type="cuda",
             ),
             tol1(
                 "matmul",
                 {torch.float32: tol(atol=3e-04, rtol=3e-04)},
                 device_type="cuda",
             ),
             tol1(
                 "pca_lowrank",
                 {torch.float32: tol(atol=3e-05, rtol=4e-06)},
                 device_type="cpu",
             ),
         ),
     )
     def test_grad(self, device, dtype, op):
         if op.name in vjp_fail:
             self.skipTest("Skipped; Expected failures")
             return

         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=True)

         if is_inplace(op, op.get_op()):
             self.skipTest("Skipped for redundancy. test_vjp handles in-place testing.")
             return

         for sample in samples:
             args = [sample.input] + list(sample.args)
             kwargs = sample.kwargs

             if op.name not in skip_noncontig:
                 noncontig_sample = sample.noncontiguous()
                 noncontig_args = [noncontig_sample.input] + list(noncontig_sample.args)
                 noncontig_kwargs = noncontig_sample.kwargs

             diff_argnums = tuple(i for i, arg in enumerate(args) if diff_arg(arg))
             assert len(diff_argnums) > 0
             diff_args = tuple(args[i] for i in diff_argnums)

             def wrapped_fn(*args, **kwargs):
                 result = op(*args, **kwargs)
                 if sample.output_process_fn_grad is not None:
                     result = sample.output_process_fn_grad(result)

                 def abs_if_complex(t):
                     if t.dtype.is_complex:
                         return t.abs()
                     return t

                 # Reduce into single value for grad
                 if isinstance(result, torch.Tensor):
                     return abs_if_complex(result.sum())
                 result = sum(abs_if_complex(res.sum()) for res in result)
                 return result

             result = grad(wrapped_fn, diff_argnums)(*args, **kwargs)
             expected = _autograd_grad(_as_tuple(wrapped_fn(*args, **kwargs)), diff_args)
             self.assertEqual(result, expected)

             if op.name not in skip_noncontig:
                 result_noncontig = grad(wrapped_fn, diff_argnums)(
                     *noncontig_args, **noncontig_kwargs
                 )
                 self.assertEqual(result_noncontig, expected)

     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps(
         "TestOperators",
         "test_jvp",
         set(
             {
                 # Composite ops that do bad things. Need to be fixed in PyTorch core.
                 # RuntimeError: Cannot access data pointer of Tensor that doesn't have storage
                 xfail("tensor_split"),
                 # BUG: silent incorrectness: runs and produces numerical differences
                 skip("nn.functional.max_unpool1d"),  # fails everywhere except on mac
                 skip(
                     "nn.functional.max_unpool2d"
                 ),  # fails everywhere except on windows
                 skip("nn.functional.max_unpool3d"),  # fails everywhere except on mac
                 xfail(
                     "native_batch_norm"
                 ),  # TODO: fails comparing None to tensor of 0s for saved_mean/var tangents
                 xfail(
                     "_native_batch_norm_legit"
                 ),  # TODO: fails comparing None to tensor of 0s for saved_mean/var tangents
                 xfail(
                     "_batch_norm_with_update"
                 ),  # TODO: fails comparing None to tensor of 0s for saved_mean/var tangents
                 xfail("nn.functional.scaled_dot_product_attention"),
                 xfail("torch.ops.aten._flash_attention_forward"),
                 xfail("torch.ops.aten._efficient_attention_forward"),
                 xfail(
                     "nn.functional.rrelu"
                 ),  # in-place test errors out with no formula implemented
                 xfail(
                     "NumpyExpMarkDirtyAutogradFunction"
                 ),  # TODO: https://github.com/pytorch/pytorch/issues/91280
                 # --- Non-Contiguous Failures! ---
                 # This is expected to fail as the operator
                 # expects last dim to have stride=1
                 xfail("view_as_complex"),
                 # BUG
                 # AssertionError: Tensor-likes are not close!
                 xfail("as_strided"),
                 xfail("as_strided", "partial_views"),
                 xfail("as_strided_scatter"),
                 decorate(
                     "linalg.det",
                     "singular",
                     decorator=expectedFailureIf(IS_MACOS and IS_X86),
                 ),
             }
         ),
     )
     @opsToleranceOverride(
         "TestOperators",
         "test_jvp",
         (
             tol1(
                 "nn.functional.conv_transpose3d",
                 {torch.float32: tol(atol=1e-04, rtol=1.3e-06)},
                 device_type="cuda",
             ),
             tol1(
                 "linalg.tensorsolve",
                 {torch.float32: tol(atol=1e-04, rtol=1.3e-05)},
                 device_type="cuda",
             ),
             tol1(
                 "masked.prod",
                 {torch.float32: tol(atol=1e-05, rtol=1.3e-05)},
                 device_type="cuda",
             ),
             tol1(
                 "nn.functional.binary_cross_entropy_with_logits",
                 {torch.float32: tol(atol=4e-04, rtol=4e-04)},
             ),
             tol1(
                 "nn.functional.batch_norm", {torch.float32: tol(atol=4e-05, rtol=5e-05)}
             ),
             tol1("nn.functional.conv2d", {torch.float32: tol(atol=4e-05, rtol=5e-05)}),
             tol1("svd_lowrank", {torch.float32: tol(atol=5e-05, rtol=5e-05)}),
             tol1("pca_lowrank", {torch.float32: tol(atol=5e-05, rtol=5e-05)}),
             tol1(
                 "nn.functional.multi_head_attention_forward",
                 {torch.float32: tol(atol=6e-05, rtol=2e-05)},
             ),
             tol2(
                 "linalg.pinv", "hermitian", {torch.float32: tol(atol=5e-5, rtol=2e-5)}
             ),
         ),
     )
     def test_jvp(self, device, dtype, op):
         # TODO: get rid of vjp_decomp when we add decomposition support to
         # PyTorch's forward-mode ad. Currently the decomposition support only
         # works for functorch.jvp
         VJP_DECOMP = {
             "nn.functional.logsigmoid",
         }
         if op.name in VJP_DECOMP:
             fixme_ref_jvp_local = simulate_jvp
         else:
             fixme_ref_jvp_local = ref_jvp

         if not op.supports_forward_ad and op.name not in VJP_DECOMP:
             self.skipTest("Skipped! Forward AD not supported.")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=True)

         outplace_variant = op if not is_inplace(op, op.get_op()) else None
         inplace_variant = op.inplace_variant if op.supports_inplace_autograd else None

         for sample in samples:
             if outplace_variant:
                 self.jvp_opinfo_test(
                     outplace_variant,
                     sample,
                     sample.output_process_fn_grad,
                     clone_inputs=False,
                     fixme_ref_jvp_local=fixme_ref_jvp_local,
                     test_noncontig=op.name not in skip_noncontig,
                 )
             if is_valid_inplace_sample_input(sample, op, inplace_variant):
                 self.jvp_opinfo_test(
                     inplace_variant,
                     sample,
                     sample.output_process_fn_grad,
                     clone_inputs=True,
                     fixme_ref_jvp_local=fixme_ref_jvp_local,
                     test_noncontig=op.name not in skip_noncontig,
                 )

     def jvp_opinfo_test(
         self,
         fn,
         sample,
         output_process_fn,
         clone_inputs,
         fixme_ref_jvp_local,
         test_noncontig,
     ):
         # NB: we used requires_grad=True to determine where the primals are,
         # but don't need that information otherwise
         args = (sample.input,) + sample.args
         kwargs = sample.kwargs
         contig_fn, primals = normalize_op_input_output2(
             fn, args, kwargs, output_process_fn, requires_grad=True
         )
         orig_primals = tree_map(lambda x: x.detach(), primals)
         orig_tangents = tree_map(lambda x: torch.randn_like(x), primals)

         def maybe_clone_inputs():
             if clone_inputs:
                 primals = tree_map(torch.clone, orig_primals)
                 tangents = tree_map(torch.clone, orig_tangents)
                 return primals, tangents
             return orig_primals, orig_tangents

         primals, tangents = maybe_clone_inputs()
         expected_primal_outs, expected_tangent_outs = fixme_ref_jvp_local(
             contig_fn, primals, tangents
         )

         primals, tangents = maybe_clone_inputs()
         primal_outs, tangent_outs = jvp(contig_fn, primals, tangents)

         self.assertEqual(primal_outs, expected_primal_outs)
         self.assertEqual(tangent_outs, expected_tangent_outs)

         if test_noncontig:
             noncontig_sample = sample.noncontiguous()
             noncontig_args = (noncontig_sample.input,) + noncontig_sample.args
             noncontig_kwargs = sample.kwargs
             noncontig_fn, primals = normalize_op_input_output2(
                 fn,
                 noncontig_args,
                 noncontig_kwargs,
                 output_process_fn,
                 requires_grad=True,
             )
             noncontig_primals = tree_map(lambda x: x.detach(), primals)
             noncontig_tangents = tree_map(
                 lambda x: noncontiguous_like(x), orig_tangents
             )
             noncontig_primal_outs, noncontig_tangent_outs = jvp(
                 noncontig_fn, noncontig_primals, noncontig_tangents
             )

             self.assertEqual(noncontig_primal_outs, expected_primal_outs)
             self.assertEqual(noncontig_tangent_outs, expected_tangent_outs)

     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps(
         "TestOperators",
         "test_vjp",
         vjp_fail.union(
             {
                 xfail("sparse.sampled_addmm", ""),
                 xfail("sparse.mm", "reduce"),
                 # ---- Non-Contiguous Failures ----
                 # This is expected to fail as the operator
                 # expects last dim to have stride=1
                 xfail("view_as_complex"),
                 # RuntimeError: query: last dimension must be contiguous
                 # The fused attention kernels require the last dim to be contiguous
                 decorate(
                     "nn.functional.scaled_dot_product_attention",
                     decorator=expectedFailureIf(not TEST_WITH_ROCM),
                 ),  # Works on ROCm
                 xfail("torch.ops.aten._flash_attention_forward"),
                 xfail("torch.ops.aten._efficient_attention_forward"),
                 # BUG
                 # AssertionError: Tensor-likes are not close!
                 xfail("as_strided"),
                 xfail("as_strided_scatter"),
                 xfail("_softmax_backward_data", device_type="cpu"),
                 xfail("as_strided", "partial_views"),
             }
         ),
     )
     @opsToleranceOverride(
         "TestOperators",
         "test_vjp",
         (
             tol1(
                 "nn.functional.conv_transpose3d",
                 {torch.float32: tol(atol=5e-05, rtol=9e-05)},
                 device_type="cuda",
             ),
             tol1(
                 "nn.functional.binary_cross_entropy_with_logits",
                 {torch.float32: tol(atol=1e-04, rtol=1e-04)},
             ),
             tol1(
                 "nn.functional.multi_head_attention_forward",
                 {torch.float32: tol(atol=2e-03, rtol=2e-04)},
             ),
             tol1("__rmatmul__", {torch.float32: tol(atol=1e-05, rtol=1e-05)}),
             tol1("matmul", {torch.float32: tol(atol=1e-05, rtol=1e-05)}),
             tol2(
                 "linalg.pinv", "hermitian", {torch.float32: tol(atol=1e-05, rtol=1e-05)}
             ),
             tol1("linalg.tensorsolve", {torch.float32: tol(atol=9e-03, rtol=2e-04)}),
             tol1("linalg.multi_dot", {torch.float32: tol(atol=1e-04, rtol=1e-04)}),
             tol1("svd_lowrank", {torch.float32: tol(atol=1e-04, rtol=1e-04)}),
             tol1("pca_lowrank", {torch.float32: tol(atol=1e-04, rtol=1e-04)}),
         ),
     )
     def test_vjp(self, device, dtype, op):
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=True)

         def _test(_op, inplace=False):
             for sample in samples:
                 if inplace and not is_valid_inplace_sample_input(
                     sample, op, op.inplace_variant
                 ):
                     continue
                 fn, primals = normalize_op_input_output(_op, sample)
                 result = fn(*primals)
                 cotangents = tree_map(lambda x: torch.randn_like(x), result)

                 out, vjp_fn = vjp(fn, *primals)
                 self.assertEqual(out, result)
                 result_vjps = vjp_fn(cotangents)

                 _, vjp_fn = ref_vjp(fn, *primals)
                 expected_vjps = vjp_fn(cotangents)

                 self.assertEqual(result_vjps, expected_vjps)

                 if op.name not in skip_noncontig:
                     noncontig_fn, noncontig_primals = normalize_op_input_output(
                         _op, sample.noncontiguous()
                     )
                     noncontig_cotangents = tree_map(
                         lambda x: noncontiguous_like(x), cotangents
                     )
                     out_noncontig, vjp_fn = vjp(noncontig_fn, *noncontig_primals)
                     self.assertEqual(out_noncontig, result)
                     noncontig_result_vjps = vjp_fn(noncontig_cotangents)
                     self.assertEqual(noncontig_result_vjps, expected_vjps)

         _test(op)
         for a_op in op.aliases:
             _test(a_op)
         if op.inplace_variant:

             def f(inp, *args, **kwargs):
                 return op.inplace_variant(inp.clone(), *args, **kwargs)

             _test(f, inplace=True)

     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps(
         "TestOperators",
         "test_vjpvjp",
         vjp_fail.union(
             {
                 skip("nn.functional.max_unpool1d"),  # silent incorrectness; Flaky
                 skip("nn.functional.max_unpool2d"),  # silent incorrectness; Flaky
                 xfail("nn.functional.ctc_loss"),  # Not Implemented
                 xfail(
                     "native_layer_norm", ""
                 ),  # Expected a proper Tensor but got None for argument #1 'other'
                 xfail("sparse.sampled_addmm", ""),  # sparse tensors have no strides
                 xfail("sparse.mm", "reduce"),  # sparse tensors have no strides
                 skip("nn.functional.scaled_dot_product_attention"),
                 xfail("torch.ops.aten._flash_attention_forward"),
                 xfail("torch.ops.aten._efficient_attention_forward"),
                 # AssertionError: Tensor-likes are not close!
                 # Mismatched elements: 1 / 15 (6.7%)
                 # Greatest absolute difference: 24.0 at index (2, 4) (up to 1e-05 allowed)
                 # Greatest relative difference: 1.7933241714393998e-06 at index (2, 4) (up to 1.3e-06 allowed)
                 # The failure occurred for item [0]
                 xfail("masked.prod"),
             }
         ),
     )
     @opsToleranceOverride(
         "TestOperators",
         "test_vjpvjp",
         (
             tol1(
                 "nn.functional.conv_transpose3d",
                 {torch.float32: tol(atol=5e-05, rtol=9e-05)},
                 device_type="cuda",
             ),
             tol1("prod", {torch.float32: tol(atol=2e-05, rtol=1e-04)}),
             tol1("masked.cumprod", {torch.float32: tol(atol=5e-04, rtol=5e-04)}),
             tol1("cumprod", {torch.float32: tol(atol=5e-04, rtol=5e-04)}),
             tol1("linalg.vander", {torch.float32: tol(atol=5e-04, rtol=5e-04)}),
             tol2(
                 "linalg.det", "singular", {torch.float32: tol(atol=2e-05, rtol=2e-05)}
             ),
         ),
     )
     def test_vjpvjp(self, device, dtype, op):
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
             return
         if not op.supports_gradgrad:
             self.skipTest("Skipped! Operation does not support gradgrad")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=True)

         def test(_op, inplace=False):
             for sample in samples:
                 if inplace and not is_valid_inplace_sample_input(
                     sample, op, op.inplace_variant
                 ):
                     continue
                 fn, args = get_vjpfull_variant(_op, sample)
                 result = fn(*args)
                 cotangents = tree_map(lambda x: torch.randn_like(x), result)

                 # Compute vjp of vjp
                 _, vjp_fn = vjp(fn, *args)
                 result_vjps = vjp_fn(cotangents)

                 # Compute ref_vjp of vjp. We could have done ref_vjp of ref_vjp,
                 # but since we're confident that vjp works by itself, this is
                 # an equivalent way to test that.
                 _, vjp_fn = ref_vjp(fn, *args)
                 expected_vjps = vjp_fn(cotangents)

                 self.assertEqual(result_vjps, expected_vjps)

         test(op)
         if op.inplace_variant:

             def fn(inp, *args, **kwargs):
                 return op.inplace_variant(inp.clone(), *args, **kwargs)

             test(fn, inplace=True)

     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
     @skipOps(
         "TestOperators",
         "test_vmapvjpvjp",
         vjp_fail.union(
             {
                 skip("atleast_1d"),  # Takes too long
                 skip("atleast_2d"),  # Takes too long
                 skip("atleast_3d"),  # Takes too long
                 skip("ormqr"),  # Takes too long
                 xfail("as_strided"),  # incorrect output
                 xfail("as_strided", "partial_views"),  # incorrect output
                 xfail("as_strided_scatter"),  # incorrect output
                 skip("bernoulli"),  # calls random op
                 xfail("bfloat16"),  # rank 4 tensor for channels_last
                 xfail("cdouble"),  # rank 4 tensor for channels_last
                 xfail("cfloat"),  # rank 4 tensor for channels_last
                 xfail("chalf"),  # rank 4 tensor for channels_last
                 xfail("double"),  # rank 4 tensor for channels_last
                 xfail("float"),  # rank 4 tensor for channels_last
                 xfail("half"),  # rank 4 tensor for channels_last
                 xfail(
                     "NumpyCubeNotComposableAutogradFunction"
                 ),  # Not composable autograd.Function
                 # It looks like you're either (1) calling .item() on a Tensor or
                 # (2) attempting to use a Tensor in some data-dependent control flow or
                 # (3) encountering this error in PyTorch internals.
                 xfail("index_reduce", "prod"),
                 decorate(
                     "linalg.householder_product", decorator=runOnRocm
                 ),  # works on ROCm
                 xfail(
                     # nans
                     "masked.softmax",
                     device_type="cpu",
                 ),
                 xfail(
                     "nanquantile", device_type="cpu"
                 ),  # vmap not implemented for at::equal.
                 xfail("native_layer_norm"),  # vmap: inplace into a regular tensor
                 # got a batched tensor as input while the running_mean or running_var,
                 # which will be updated in place, were not batched.
                 xfail("nn.functional.batch_norm"),
                 xfail(
                     "nn.functional.binary_cross_entropy"
                 ),  # vmap: inplace into a regular tensor
                 xfail(
                     "nn.functional.ctc_loss"
                 ),  # derivate not implemented for _ctc_loss_backward
                 # flaky on ROCM needs investigation
                 decorate("nn.functional.conv_transpose2d", decorator=skipIfRocm),
                 skip("nn.functional.dropout"),  # calls random op
                 skip("nn.functional.dropout2d"),  # calls random op
                 skip("nn.functional.dropout3d"),  # calls random op
                 skip("nn.functional.alpha_dropout"),  # calls random op
                 skip(
                     "nn.functional.feature_alpha_dropout", "with_train"
                 ),  # calls random op
                 skip("nn.functional.fractional_max_pool2d"),  # calls random op
                 skip("nn.functional.fractional_max_pool3d"),  # calls random op
                 xfail("nn.functional.scaled_dot_product_attention"),  # randomness
                 xfail("torch.ops.aten._efficient_attention_forward"),  # outputs ints
                 xfail("nn.functional.multi_head_attention_forward"),  # randomness
                 # It looks like you're either (1) calling .item() on a Tensor or
                 # (2) attempting to use a Tensor in some data-dependent control flow or
                 # (3) encountering this error in PyTorch internals.
                 xfail("nn.functional.gaussian_nll_loss"),
                 # got a batched tensor as input while the running_mean or running_var,
                 # which will be updated in place, were not batched.
                 xfail("nn.functional.instance_norm"),
                 xfail(
                     "nn.functional.layer_norm"
                 ),  # vmap: inplace into a regular tensor
                 # RuntimeError: NYI: querying is_contiguous inside of vmap
                 # for memory_format other than torch.contiguous_formats
                 xfail("nn.functional.max_pool2d"),
                 # RuntimeError: NYI: Tensor.clone(memory_format) inside vmap is only
                 # supported with memory_format torch.preserve_format or
                 # torch.contiguous_format (got ChannelsLast)
                 xfail("nn.functional.max_unpool2d"),
                 # RuntimeError: NYI: Tensor.clone(memory_format) inside vmap is only
                 # supported with memory_format torch.preserve_format
                 # or torch.contiguous_format (got ChannelsLast)s
                 xfail("nn.functional.max_unpool2d", "grad"),
                 xfail(
                     "nn.functional.rrelu"
                 ),  # RuntimeError: vmap: we do not yet support aten::rrelu_with_noise.
                 xfail("normal"),  # calls random op
                 xfail("normal", "number_mean"),  # calls random op
                 xfail("pca_lowrank"),  # calls random op
                 xfail(
                     "quantile", device_type="cpu"
                 ),  # Batching rule not implemented for `at::equal`
                 xfail(
                     "scatter_reduce", "prod"
                 ),  # vmap (looks like you are calling item/data-dependent)
                 xfail(
                     "sparse.sampled_addmm"
                 ),  # RuntimeError: Sparse CSR tensors do not have strides
                 xfail(
                     "sparse.mm", "reduce"
                 ),  # RuntimeError: Sparse CSR tensors do not have strides
                 xfail("svd_lowrank"),  # calls random op
                 xfail("to"),  # rank 4 tensor for channels_last
                 xfail(
                     "view_as_complex"
                 ),  # RuntimeError: Tensor must have a last dimension with stride 1
                 # got a batched tensor as input while the running_mean or running_var,
                 # which will be updated in place, were not batched.
                 xfail("nn.functional.batch_norm", "without_cudnn"),
                 # view doesn't work on sparse
                 xfail("to_sparse"),
                 xfail("native_batch_norm"),
                 xfail("_native_batch_norm_legit"),
                 # TODO: implement batching rule
                 xfail("_batch_norm_with_update"),
             }
         ),
     )
     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @opsToleranceOverride(
         "TestOperators",
         "test_vmapvjpvjp",
         (
             tol1("linalg.svd", {torch.float32: tol(atol=1e-03, rtol=5e-04)}),
             tol1("linalg.lu", {torch.float32: tol(atol=5e-04, rtol=7e-04)}),
             tol1("linalg.lu_factor", {torch.float32: tol(atol=2e-03, rtol=2e-02)}),
             tol1("linalg.multi_dot", {torch.float32: tol(atol=2e-03, rtol=2e-04)}),
             tol1("svd", {torch.float32: tol(atol=1e-03, rtol=5e-04)}),
             tol1("matrix_exp", {torch.float32: tol(atol=1e-03, rtol=5e-04)}),
             tol1("masked.prod", {torch.float32: tol(atol=2e-03, rtol=2e-04)}),
         ),
     )
     @skipOps(
         "TestOperators",
         "test_vmapvjpvjp",
         {
             xfail("as_strided", "partial_views"),
             xfail("as_strided_copy"),
         },
     )
     def test_vmapvjpvjp(self, device, dtype, op):
         # Since, we test `vjpvjp` independently,
         # for this test, we just verify that vmap
         # of `vjpvjp` is correct.
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
             return
         if not op.supports_gradgrad:
             self.skipTest("Skipped! Operation does not support gradgrad")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=True)

         # TODO: test in-place
         if is_inplace(op, op.get_op()):
             self.skipTest("Skipped! NYI: inplace-testing not supported.")
             return

         for sample in samples:
             fn, args = get_vjpfull_variant(op, sample)
             result = fn(*args)
             cotangents = tree_map(lambda x: torch.randn_like(x), result)
             cotangents = pytree.tree_leaves(cotangents)
             num_args = len(args)

             args_and_cotangents = tuple(args) + tuple(cotangents)

             def vjp_of_vjp(*args_and_cotangents):
                 args = args_and_cotangents[:num_args]
                 cotangents = args_and_cotangents[num_args:]
                 result, vjp_fn = vjp(fn, *args)
                 result_vjps = vjp_fn(cotangents)
                 result = pytree.tree_leaves(result)
                 result_vjps = pytree.tree_leaves(result_vjps)
                 return (*result, *result_vjps)

             is_batch_norm_and_training = is_batch_norm_training(op.name, sample.kwargs)
             generator = get_fallback_and_vmap_exhaustive(
                 vjp_of_vjp,
                 args_and_cotangents,
                 {},
                 is_batch_norm_and_training=is_batch_norm_and_training,
             )
             for loop_out, batched_out in generator:
                 self.assertEqual(loop_out, batched_out)

     vmapvjp_fail = vjp_fail.union(
         {
             # -------------------- ALLOWED FAILURES --------------------------------
             # The following are not bugs and are expected behavior
             xfail("masked_select"),  # Not possible due to dynamic shapes
             skip("bernoulli"),  # randomness
             skip("normal", ""),  # randomness
             skip("normal", "number_mean"),  # randomness
             skip("nn.functional.rrelu"),  # randomness
             skip("nn.functional.feature_alpha_dropout", "with_train"),  # randomness
             skip("nn.functional.feature_alpha_dropout", "without_train"),  # randomness
             skip("nn.functional.dropout"),  # randomness
             skip("nn.functional.dropout2d"),  # randomness
             skip("nn.functional.dropout3d", ""),  # randomness
             skip("nn.functional.alpha_dropout"),  # randomness
             skip("nn.functional.scaled_dot_product_attention"),  # randomness
             xfail("torch.ops.aten._efficient_attention_forward"),  # outputs ints
             skip("nn.functional.multi_head_attention_forward"),  # randomness
             xfail(
                 "index_put", ""
             ),  # not possible due to dynamic shapes; we support a subset
             xfail("nn.functional.fractional_max_pool2d"),  # random
             xfail("nn.functional.fractional_max_pool3d"),  # random
             xfail("pca_lowrank", ""),  # randomness
             xfail("svd_lowrank", ""),  # randomness
             xfail("to_sparse", ""),  # non-dense output
             skip(
                 "to"
             ),  # RuntimeError: required rank 4 tensor to use channels_last format
             xfail("as_strided", "partial_views"),
             xfail(
                 "NumpyCubeNotComposableAutogradFunction"
             ),  # Not composable autograd.Function
             # ----------------------------------------------------------------------
             # ---------------------------- BUGS ------------------------------------
             # All of the following are bugs and need to be fixed
             skip(
                 "linalg.svdvals"
             ),  # # really annoying thing where it passes correctness check but not has_batch_rule
             skip("native_batch_norm"),
             skip("_native_batch_norm_legit"),
             # TODO: implement batching rule
             skip("_batch_norm_with_update"),
             xfail("__getitem__", ""),  # dynamic error
             xfail("nanquantile", device_type="cpu"),  # checks q via a .item() call
             xfail("nn.functional.gaussian_nll_loss"),  # checks var for if any value < 0
             xfail("narrow"),  # .item() call
             xfail("quantile", device_type="cpu"),  # checks q via a .item() call
             xfail("view_as_complex"),  # Tensor must have a last dimension with stride 1
             # required rank 4 tensor to use channels_last format
             xfail("bfloat16"),
             xfail("double"),
             xfail("float"),
             xfail("half"),
             xfail("cdouble", ""),
             xfail("cfloat", ""),
             xfail("chalf", ""),
             xfail("scatter_reduce", "prod"),  # item call
             # Batching rule not implemented for aten::_use_cudnn_ctc_loss.Tensor
             xfail("nn.functional.ctc_loss", device_type="cuda"),
             # NYI: querying is_contiguous inside of vmap for memory_format other than torch.contiguous_format
             xfail("nn.functional.max_unpool2d"),
             xfail("nn.functional.max_unpool2d", "grad"),
             xfail("sparse.sampled_addmm", ""),
             xfail("sparse.mm", "reduce"),
             xfail("as_strided_scatter", ""),  # calls as_strided
             xfail("index_reduce", "prod"),  # .item() call
             # ---------------------------------------------------------------------
         }
     )

     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @opsToleranceOverride(
         "TestOperators",
         "test_vmapvjp",
         (
             tol1(
                 "linalg.svd",
                 {torch.float32: tol(atol=5e-04, rtol=1e-04)},
                 device_type="cuda",
             ),
             tol1(
                 "svd", {torch.float32: tol(atol=5e-04, rtol=1e-04)}, device_type="cuda"
             ),
             tol1(
                 "linalg.householder_product",
                 {torch.float32: tol(atol=3e-04, rtol=9e-04)},
             ),
             tol1(
                 "matrix_exp",
                 {torch.float32: tol(atol=5e-04, rtol=1e-04)},
                 device_type="cuda",
             ),
             tol1(
                 "nn.functional.layer_norm",
                 {torch.float32: tol(atol=3e-4, rtol=1e-4)},
                 device_type="cpu",
             ),
             tol1(
                 "native_layer_norm",
                 {torch.float32: tol(atol=3e-4, rtol=1e-4)},
                 device_type="cpu",
             ),
         ),
     )
     @skipOps(
         "TestOperators",
         "test_vmapvjp",
         vmapvjp_fail.union(
             {
                 xfail("as_strided"),
                 xfail("as_strided_copy"),
                 xfail("as_strided", "partial_views"),
             }
         ),
     )
     def test_vmapvjp(self, device, dtype, op):
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=True)

         # TODO: test in-place
         if is_inplace(op, op.get_op()):
             self.skipTest("Skipped! NYI: inplace-testing not supported.")
             return
         for sample in samples:
             cotangents = get_sample_cotangents(op, sample)
             fn, args = get_vjp_fn_and_args_with_cotangents(op, sample, cotangents)
             is_batch_norm_and_training = is_batch_norm_training(op.name, sample.kwargs)
             generator = get_fallback_and_vmap_exhaustive(
                 fn, args, {}, is_batch_norm_and_training=is_batch_norm_and_training
             )
             for loop_out, batched_out in generator:
                 self.assertEqual(loop_out, batched_out)

     vmapjvpall_fail = {
         # -------------------- ALLOWED FAILURES --------------------------------
         # The following are expected (not a bug)
         skip("bernoulli", ""),  # randomness
         skip("nn.functional.dropout"),  # randomness
         skip("nn.functional.rrelu"),  # randomness
         skip("nn.functional.dropout2d", ""),
         skip("nn.functional.dropout3d", ""),
         skip("nn.functional.scaled_dot_product_attention"),  # randomness
         xfail("torch.ops.aten._efficient_attention_forward"),  # outputs ints
         skip("nn.functional.multi_head_attention_forward"),  # randomness
         skip("nn.functional.alpha_dropout"),  # randomness
         skip("nn.functional.feature_alpha_dropout", "without_train"),
         skip("nn.functional.feature_alpha_dropout", "with_train"),
         xfail(
             "nn.functional.fractional_max_pool2d"
         ),  # Cannot access data pointer of Tensor that doesn't have storage
         xfail(
             "nn.functional.fractional_max_pool3d"
         ),  # Cannot access data pointer of Tensor that doesn't have storage
         # Not actually a problem: embedding with max_norm mutates the weight
         # and causes different runs to produce different results.
         # skip because this is flaky depending on what the max_norm is!
         skip("nn.functional.embedding", ""),
         skip("to"),  # RuntimeError: required rank 4 tensor to use channels_last format
         xfail(
             "NumpyExpMarkDirtyAutogradFunction"
         ),  # vmap: inplace into a regular tensor
         # ----------------------------------------------------------------------
         # ---------------------------- BUGS ------------------------------------
         # The following are bugs that we should fix
         xfail("masked.mean"),  # silent incorrectness (nan difference)
         xfail("as_strided", "partial_views"),  # Tensor-likes are not close!
         xfail(
             "nn.functional.soft_margin_loss", ""
         ),  # soft_margin_loss_backward does not support forward-ad
         xfail("tensor_split"),  # data_ptr composite compliance
         xfail("quantile"),  # at::equal batching rule (cpu), also, in-place vmap (cuda)
         skip("as_strided"),  # Test runner cannot handle this
         # requires special handling, and does not yet have a batching rule. Feel free to file a github issue!
         xfail("as_strided_scatter"),
         xfail(
             "nn.functional.gaussian_nll_loss"
         ),  # .item or data-dependent control flow
         xfail("scatter"),  # forward-mode AD does not support at::scatter
         xfail(
             "nanquantile"
         ),  # at::equal batching rule (cpu), also, in-place vmap (cuda)
         xfail("view_as_complex"),  # Tensor must have a last dimension with stride 1
         skip("pca_lowrank", ""),  # randomness
         skip("svd_lowrank", ""),  # randomness
         xfail("double"),  # required rank 4 tensor to use channels_last format
         xfail("cdouble"),  # required rank 4 tensor to use channels_last format
         # potential silent incorrectness
         skip(
             "nn.functional.max_unpool1d"
         ),  # Flaky, seems to sometimes his max_unpool2d
         skip("nn.functional.max_unpool2d"),  # fails everywhere except on mac
         skip("nn.functional.max_unpool3d"),  # fails everywhere except on mac
         # erroring because running_mean and running_var aren't differentiable
         xfail("nn.functional.batch_norm"),
         xfail("nn.functional.batch_norm", "without_cudnn"),
         xfail("native_batch_norm"),
         xfail("_native_batch_norm_legit"),
         # TODO: implement batching rule
         xfail("_batch_norm_with_update"),
         # ----------------------------------------------------------------------
     }

     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @opsToleranceOverride(
         "TestOperators",
         "test_vmapjvpall",
         (
             tol1(
                 "nn.functional.conv_transpose3d",
                 {torch.float32: tol(atol=2e-04, rtol=9e-3)},
                 device_type="cuda",
             ),
             tol1(
                 "linalg.householder_product",
                 {torch.float32: tol(atol=2e-04, rtol=9e-3)},
             ),
         ),
     )
     @skipOps(
         "TestOperators",
         "test_vmapjvpall",
         vmapjvpall_fail.union(
             {
                 xfail("as_strided_copy"),
                 decorate(
                     "linalg.det",
                     "singular",
                     decorator=expectedFailureIf(IS_MACOS and IS_X86),
                 ),
             }
         ),
     )
     # This is technically a superset of test_vmapjvp. We should either delete test_vmapjvp
     # or figure out if we can split vmapjvpall. It's useful to keep test_vmapjvp intact
     # because that corresponds to "batched forward-mode AD" testing in PyTorch core
     def test_vmapjvpall(self, device, dtype, op):
         if is_inplace(op, op.get_op()):
             # TODO: test in-place
             self.skipTest("Skipped! NYI: inplace-testing not supported.")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=False)

         if not op.supports_forward_ad:
             self.skipTest("Skipped! Forward AD not supported.")
             return

         for sample in samples:
             arg_values = [sample.input] + list(sample.args)
             kwarg_values = sample.kwargs
             args = tuple(arg_values) + tuple(kwarg_values)
             fn, args = get_jvp_variant_primals_tangents(op, sample)
             is_batch_norm_and_training = is_batch_norm_training(op.name, kwarg_values)
             generator = get_fallback_and_vmap_exhaustive(
                 fn, args, {}, is_batch_norm_and_training=is_batch_norm_and_training
             )
             for loop_out, batched_out in generator:
                 self.assertEqual(loop_out, batched_out)

     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps(
         "TestOperators",
         "test_vmapjvpall_has_batch_rule",
         vmapjvpall_fail.union(
             {
                 skip(
                     "to"
                 ),  # RuntimeError: required rank 4 tensor to use channels_last format
                 xfail(
                     "cdouble"
                 ),  # RuntimeError: required rank 4 tensor to use channels_last format
                 xfail("cumprod"),
                 xfail("masked_fill"),
                 xfail("fill"),
                 skip("masked.mean"),  # ???
                 xfail("masked_scatter"),
                 xfail("put"),
                 xfail("take"),
                 xfail("nn.functional.feature_alpha_dropout", "without_train"),
                 xfail("nn.functional.dropout2d", ""),
                 xfail("pca_lowrank", ""),
                 xfail("svd_lowrank", ""),
                 xfail("nn.functional.feature_alpha_dropout", "with_train"),
                 xfail("special.log_ndtr", ""),
                 xfail("fft.ihfft2"),  # conj_physical fallback
                 xfail("fft.ihfftn"),  # conj_physical fallback
                 xfail("nn.functional.max_unpool3d", "grad"),
                 xfail("nn.functional.max_unpool2d", "grad"),
                 xfail("nn.functional.soft_margin_loss", ""),
                 xfail("nn.functional.max_unpool1d", "grad"),
                 xfail("nn.functional.embedding", ""),
                 xfail(
                     "scatter_reduce", "sum"
                 ),  # aten::scatter_reduce.two hit the vmap fallback
                 xfail(
                     "scatter_reduce", "mean"
                 ),  # aten::scatter_reduce.two hit the vmap fallback
                 xfail(
                     "scatter_reduce", "amin"
                 ),  # aten::scatter_reduce.two hit the vmap fallback
                 xfail(
                     "scatter_reduce", "amax"
                 ),  # aten::scatter_reduce.two hit the vmap fallback
                 xfail("nn.functional.glu"),
                 xfail("nn.functional.bilinear"),  # trilinear doesn't have batching rule
                 xfail("linalg.lu", ""),
                 xfail("nn.functional.dropout3d", ""),
                 xfail("as_strided_scatter", ""),
                 xfail("masked.cumprod", ""),
                 xfail("renorm"),  # hit vmap fallback, which is disabled
                 xfail("t_copy"),
                 xfail("unsqueeze_copy"),
             }
         ),
     )
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     def test_vmapjvpall_has_batch_rule(self, device, dtype, op):
         if is_inplace(op, op.get_op()):
             # TODO: test in-place
             self.skipTest("Skipped! NYI: inplace-testing not supported.")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=False)

         if not op.supports_forward_ad:
             self.skipTest("Skipped! Forward AD not supported.")
             return

         def test():
             for sample in samples:
                 arg_values = [sample.input] + list(sample.args)
                 kwarg_values = sample.kwargs
                 args = tuple(arg_values) + tuple(kwarg_values)
                 fn, args = get_jvp_variant_primals_tangents(op, sample)
                 is_batch_norm_and_training = is_batch_norm_training(
                     op.name, kwarg_values
                 )
                 for loop_out, batched_out in get_fallback_and_vmap_exhaustive(
                     fn,
                     args,
                     {},
                     is_batch_norm_and_training=is_batch_norm_and_training,
                     compute_loop_out=False,
                 ):
                     pass

         check_vmap_fallback(self, test, op, dry_run=False)

     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @skipOps(
         "TestOperators",
         "test_vmapvjp_has_batch_rule",
         vmapvjp_fail.union(
             {
                 skip(
                     "to"
                 ),  # RuntimeError: required rank 4 tensor to use channels_last format
                 xfail("view_as_complex"),
                 xfail("cummax"),
                 xfail("cummin"),
                 xfail("fill"),
                 xfail(
                     "narrow"
                 ),  # Batching rule not implemented for `narrow.Tensor` (and view op)
                 xfail("special.log_ndtr"),
                 xfail("linalg.householder_product"),
                 xfail("masked_fill"),
                 xfail("masked_scatter"),
                 xfail("masked_select"),
                 xfail("nanquantile"),
                 xfail("ormqr"),
                 xfail("put"),
                 xfail(
                     "scatter_reduce", "sum"
                 ),  # aten::scatter_reduce.two hit the vmap fallback
                 xfail(
                     "scatter_reduce", "mean"
                 ),  # aten::scatter_reduce.two hit the vmap fallback
                 xfail(
                     "scatter_reduce", "amin"
                 ),  # aten::scatter_reduce.two hit the vmap fallback
                 xfail(
                     "scatter_reduce", "amax"
                 ),  # aten::scatter_reduce.two hit the vmap fallback
                 xfail("quantile"),
                 xfail("renorm"),
                 xfail("take"),
                 xfail("tensor_split"),
                 xfail("to_sparse"),
                 xfail("unfold"),
                 xfail("unfold_copy"),
                 xfail("nn.functional.dropout"),
                 xfail("fft.ihfft2"),
                 xfail("fft.ihfftn"),
                 xfail("nn.functional.gaussian_nll_loss"),
                 xfail("nn.functional.bilinear"),
                 xfail("nn.functional.fractional_max_pool3d"),
                 xfail("nn.functional.ctc_loss"),
                 xfail("nn.functional.rrelu"),
                 xfail("nn.functional.embedding_bag"),
                 xfail("nn.functional.fractional_max_pool2d"),
                 xfail("nn.functional.feature_alpha_dropout", "with_train"),
                 xfail("pca_lowrank", ""),
                 xfail("nn.functional.dropout2d", ""),
                 xfail("nn.functional.feature_alpha_dropout", "without_train"),
                 xfail("svd_lowrank", ""),
                 xfail("nn.functional.max_unpool2d", ""),
                 xfail("nn.functional.multi_margin_loss", ""),
                 xfail("nn.functional.multilabel_margin_loss", ""),
                 xfail("nn.functional.pdist", ""),
                 xfail("scatter_reduce", "prod"),
                 xfail("nn.functional.max_unpool1d", ""),
                 xfail("nn.functional.max_unpool3d", ""),
                 xfail("nn.functional.max_unpool3d", "grad"),
                 xfail("nn.functional.soft_margin_loss", ""),
                 xfail("nn.functional.max_unpool1d", "grad"),
                 xfail("nn.functional.max_unpool2d", "grad"),
                 xfail("linalg.lu", ""),
                 xfail("cdouble", ""),
                 xfail("cfloat", ""),
                 xfail("chalf", ""),
                 xfail(
                     "index_reduce", "prod"
                 ),  # aten::index_reduce hit the vmap fallback which is currently disabled
                 xfail(
                     "index_reduce", "mean"
                 ),  # aten::index_reduce hit the vmap fallback which is currently disabled
                 xfail(
                     "index_reduce", "amax"
                 ),  # aten::index_reduce hit the vmap fallback which is currently disabled
                 xfail(
                     "index_reduce", "amin"
                 ),  # aten::index_reduce hit the vmap fallback which is currently disabled
                 xfail("nn.functional.dropout3d", ""),
                 xfail("as_strided_scatter", ""),
                 xfail("_segment_reduce", "offsets"),
                 xfail("_segment_reduce", "lengths"),
                 xfail("sparse.sampled_addmm", ""),
                 xfail("sparse.mm", "reduce"),
                 xfail("native_batch_norm"),
                 xfail("_native_batch_norm_legit"),
                 # TODO: implement batching rule
                 xfail("_batch_norm_with_update"),
                 xfail("native_dropout_backward"),
                 xfail(
                     "index_fill"
                 ),  # aten::_unique hit the vmap fallback which is currently disabled
                 xfail("t_copy"),
                 xfail("unsqueeze_copy"),
             }
         ),
     )
     def test_vmapvjp_has_batch_rule(self, device, dtype, op):
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=True)

         # TODO: test in-place
         if is_inplace(op, op.get_op()):
             self.skipTest("Skipped! NYI: inplace-testing not supported.")
             return

         def test():
             for sample in samples:
                 cotangents = get_sample_cotangents(op, sample)
                 fn, args = get_vjp_fn_and_args_with_cotangents(op, sample, cotangents)
                 is_batch_norm_and_training = is_batch_norm_training(
                     op.name, sample.kwargs
                 )
                 for loop_out, batched_out in get_fallback_and_vmap_exhaustive(
                     fn,
                     args,
                     {},
                     is_batch_norm_and_training=is_batch_norm_and_training,
                     compute_loop_out=False,
                 ):
                     pass
                 for a_op in op.aliases:
                     fn, args = get_vjp_fn_and_args_with_cotangents(
                         a_op, sample, cotangents
                     )
                     for loop_out, batched_out in get_fallback_and_vmap_exhaustive(
                         fn,
                         args,
                         {},
                         is_batch_norm_and_training=is_batch_norm_and_training,
                         compute_loop_out=False,
                     ):
                         pass

         check_vmap_fallback(self, test, op, dry_run=False)

     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps(
         "TestOperators",
         "test_vjpvmap",
         vjp_fail.union(
             {
                 skip("bernoulli", ""),  # vjpvmap testing can't handle randomness
                 skip("normal", ""),  # vjpvmap testing can't handle randomness
                 skip(
                     "normal", "number_mean"
                 ),  # vjpvmap testing can't handle randomness
                 skip("nn.functional.rrelu"),  # randomness
                 skip("nn.functional.feature_alpha_dropout", "with_train"),  # randomness
                 skip(
                     "nn.functional.feature_alpha_dropout", "without_train"
                 ),  # randomness
                 skip("nn.functional.scaled_dot_product_attention"),
                 xfail("torch.ops.aten._efficient_attention_forward"),  # outputs ints
                 skip("nn.functional.multi_head_attention_forward"),  # randomness
                 skip("nn.functional.alpha_dropout"),  # randomness
                 skip(
                     "to"
                 ),  # RuntimeError: required rank 4 tensor to use channels_last format
                 skip("to_sparse", ""),  # non-dense output
                 skip("ormqr", ""),  # takes too long
                 xfail(
                     "NumpyCubeNotComposableAutogradFunction"
                 ),  # Not composable autograd.Function
                 # fallback path doesn't work
                 # All of the following are bugs and need to be fixed
                 xfail("__getitem__", ""),
                 xfail("index_put", ""),
                 xfail("view_as_complex"),
                 xfail("nn.functional.gaussian_nll_loss"),
                 xfail("masked_select"),
                 xfail(
                     "narrow"
                 ),  # Batching rule not implemented for `narrow.Tensor` (and view op)
                 skip(
                     "nn.functional.fractional_max_pool3d"
                 ),  # generator works on cpu, fails on cuda
                 skip(
                     "nn.functional.fractional_max_pool2d"
                 ),  # generator works on cpu, fails on cuda
                 xfail("column_stack", ""),
                 xfail("nn.functional.dropout2d", ""),
                 xfail("svd_lowrank", ""),
                 xfail("pca_lowrank", ""),
                 xfail("clamp"),
                 # something weird happening with channels_last
                 xfail("bfloat16"),
                 xfail("double"),
                 xfail("float"),
                 xfail("half"),
                 xfail("cdouble"),
                 xfail("cfloat"),
                 xfail("nn.functional.dropout3d", ""),
                 xfail("as_strided_scatter", ""),
                 xfail("sparse.sampled_addmm", ""),
                 xfail("sparse.mm", "reduce"),
                 xfail("native_batch_norm"),
                 xfail("_native_batch_norm_legit"),
                 # TODO: implement batching rule
                 xfail("_batch_norm_with_update"),
                 xfail("as_strided", "partial_views"),
             }
         ),
     )
     def test_vjpvmap(self, device, dtype, op):
         # NB: there is no vjpvmap_has_batch_rule test because that is almost
         # certainly redundant with the vmap_has_batch_rule test in test_vmap.py

         # one-off skip
         if op.name == "nn.functional.dropout":
             self.skipTest("Skipped!")

         if not op.supports_autograd:
             # If the op doesn't support autograd, vmap(op) won't either
             self.skipTest("Skipped! Autograd not supported.")
             return

         # TODO: test in-place
         if is_inplace(op, op.get_op()):
             self.skipTest("Skipped! NYI: inplace-testing not supported.")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=True)
         batch_norm_fns = (
             "nn.functional.batch_norm",
             "nn.functional.instance_norm",
         )  # instance norm calls batch norm
         is_batch_norm = op.name in batch_norm_fns

         for sample in samples:
             args = [sample.input] + list(sample.args)
             kwargs = sample.kwargs

             is_batch_norm_and_training = is_batch_norm and is_batch_norm_training(
                 op.name, kwargs
             )
             generator = generate_vmap_inputs(
                 args, kwargs, is_batch_norm_and_training=is_batch_norm_and_training
             )

             for batched_args, in_dims, kwargs in generator:
                 vmapped_op = vmap(op, in_dims)
                 fn, primals = normalize_op_input_output2(
                     vmapped_op, batched_args, kwargs, sample.output_process_fn_grad
                 )
                 result = fn(*primals)
                 cotangents = tree_map(lambda x: torch.randn_like(x), result)

                 _, vjp_fn = vjp(fn, *primals)
                 result_vjps = vjp_fn(cotangents)

                 _, vjp_fn = ref_vjp(fn, *primals)
                 expected_vjps = vjp_fn(cotangents)

                 self.assertEqual(result_vjps, expected_vjps)

     def _compare_jacobians_of_vjp(
         self, fn, cotangents_and_primals, argnums=None, atol_rtol=None
     ):
         if argnums is None:
             argnums = tuple(range(len(cotangents_and_primals)))

         def get_vjp(cotangents, *primals):
             _, vjp_fn = vjp(fn, *primals)
             return vjp_fn(cotangents)

         jacobian_jvp = jacfwd(get_vjp, argnums)(*cotangents_and_primals)
         jacobian_vjp = jacrev(get_vjp, argnums)(*cotangents_and_primals)

         # For dtype changing operations, the jacobians have different dtype.
         jacobian_jvp = tree_map(lambda x: x.to(torch.float), jacobian_jvp)
         jacobian_vjp = tree_map(lambda x: x.to(torch.float), jacobian_vjp)

         if atol_rtol is not None:
             (atol, rtol) = atol_rtol
             self.assertEqual(jacobian_jvp, jacobian_vjp, atol=atol, rtol=rtol)
         else:
             self.assertEqual(jacobian_jvp, jacobian_vjp)

     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps(
         "TestOperators",
         "test_jvpvjp",
         vjp_fail.union(
             {
                 xfail("to_sparse", ""),  # NYI
                 # RuntimeError: Trying to set a forward gradient that has a different size than that of the original Tensor,
                 # this is not supported. Tensor is of size [5, 2, 3] while the given forward gradient is of size [1, 2, 3].
                 xfail("normal", ""),
                 xfail("cdist", ""),  # NYI: forward-AD for _cdist_forward
                 xfail("cholesky", ""),  # NYI: forward-AD for cholesky
                 xfail(
                     "nn.functional.embedding_bag", ""
                 ),  # NYI: forward-AD for _embedding_bag
                 xfail(
                     "nn.functional.grid_sample", ""
                 ),  # NYI: forward AD for grid_sampler_2d
                 xfail("grid_sampler_2d", ""),  # NYI: forward AD for grid_sampler_2d
                 xfail(
                     "nn.functional.hardsigmoid", ""
                 ),  # NYI: forward AD for hardsigmoid_backward
                 xfail(
                     "nn.functional.huber_loss", ""
                 ),  # NYI: forward AD for huber_loss_backward
                 xfail("NumpyCubeNotComposableAutogradFunction"),  # not composable
                 xfail("ormqr", ""),  # NYI: forward AD for ormqr
                 xfail(
                     "nn.functional.multilabel_margin_loss", ""
                 ),  # NYI: multilabel_margin_loss_forward
                 xfail(
                     "nn.functional.soft_margin_loss", ""
                 ),  # NYI: forward-AD for soft_margin_loss_backward
                 xfail("nn.functional.ctc_loss", ""),  # NYI: forward-AD for _ctc_loss
                 xfail("nn.functional.pdist", ""),  # NYI: forward-AD with _pdist_forward
                 skip("nn.functional.scaled_dot_product_attention"),
                 xfail("torch.ops.aten._efficient_attention_forward"),  # outputs ints
                 xfail(
                     "nn.functional.multi_margin_loss", ""
                 ),  # NYI: forward AD with multi_margin_loss
                 skip(
                     "linalg.householder_product", "", device_type="cuda"
                 ),  # flaky, I'm not sure why
                 xfail("sparse.sampled_addmm", ""),  # Sparse tensors have no strides
                 xfail(
                     "_segment_reduce", "offsets"
                 ),  # NYI: forward-AD for _segment_reduce
                 xfail("sparse.mm", "reduce"),  # Sparse tensors have no strides
                 xfail("index_reduce", "prod"),  # NYI: forward-AD for index_reduce
                 xfail("index_reduce", "mean"),  # NYI: forward-AD for index_reduce
                 xfail("index_reduce", "amax"),  # NYI: forward-AD for index_reduce
                 xfail("index_reduce", "amin"),  # NYI: forward-AD for index_reduce
                 xfail(
                     "_segment_reduce", "lengths"
                 ),  # NYI: forward-AD for _segment_reduce
                 xfail("native_dropout_backward"),  # NYI
             }
         ),
     )
     @opsToleranceOverride(
         "TestOperators",
         "test_jvpvjp",
         (
             tol1("masked.prod", {torch.float32: tol(atol=1e-04, rtol=1.3e-05)}),
             tol1("masked.cumprod", {torch.float32: tol(atol=1e-04, rtol=5e-04)}),
             tol1(
                 "cumprod",
                 {torch.float32: tol(atol=1e-03, rtol=5e-04)},
                 device_type="cuda",
             ),
             tol1(
                 "linalg.det",
                 {torch.float32: tol(atol=3e-05, rtol=5e-06)},
                 device_type="cuda",
             ),
             tol1(
                 "linalg.vander",
                 {torch.float32: tol(atol=1e-04, rtol=1.3e-05)},
                 device_type="cuda",
             ),
             tol1(
                 "nn.functional.group_norm", {torch.float32: tol(atol=1e-03, rtol=1e-03)}
             ),
             tol2(
                 "linalg.pinv", "hermitian", {torch.float32: tol(atol=5e-03, rtol=5e-03)}
             ),
         ),
     )
     def test_jvpvjp(self, device, dtype, op):
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=True)

         # TODO: test in-place
         if is_inplace(op, op.get_op()):
             self.skipTest("Skipped! NYI: inplace-testing not supported.")
             return

         for sample in samples:
             fn, primals = normalize_op_input_output(op, sample)
             result = fn(*primals)
             cotangents = tree_map(lambda x: torch.randn_like(x), result)

             primals_tangents = tree_map(lambda x: torch.randn_like(x), primals)
             cotangents_tangents = tree_map(lambda x: torch.randn_like(x), cotangents)

             def push_vjp(primals, cotangents):
                 _, vjp_fn = vjp(fn, *primals)
                 return vjp_fn(cotangents)

             result = jvp(
                 push_vjp, (primals, cotangents), (primals_tangents, cotangents_tangents)
             )
             self.assertEqual(len(result), 2)

             def tree_map2(fn, first, second):
                 flat_first, spec_first = tree_flatten(first)
                 flat_second, spec_second = tree_flatten(second)
                 assert spec_first == spec_second
                 flat_result = [fn(f, s) for f, s in zip(flat_first, flat_second)]
                 return tree_unflatten(flat_result, spec_first)

             def reference(primals, cotangents, primals_tangents, cotangents_tangents):
                 with fwAD.dual_level():
                     primal_duals = tree_map2(fwAD.make_dual, primals, primals_tangents)
                     _, vjp_fn = ref_vjp(fn, *primal_duals)

                     cotangent_duals = tree_map2(
                         fwAD.make_dual, cotangents, cotangents_tangents
                     )
                     result = vjp_fn(cotangent_duals)

                     flat_result, spec = tree_flatten(result)
                     primals_out, tangents_out = zip(
                         *[fwAD.unpack_dual(r) for r in flat_result]
                     )
                     tangents_out = [
                         t if t is not None else torch.zeros_like(p)
                         for p, t in zip(primals_out, tangents_out)
                     ]
                     expected = (
                         tree_unflatten(primals_out, spec),
                         tree_unflatten(tangents_out, spec),
                     )
                 return expected

             expected = reference(
                 primals, cotangents, primals_tangents, cotangents_tangents
             )
             self.assertEqual(result, expected)

     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
     @skipOps(
         "TestOperators",
         "test_vmapjvpvjp",
         vjp_fail.union(
             {
                 # Following operators take too long, hence skipped
                 skip("atleast_1d"),
                 skip("atleast_2d"),
                 skip("atleast_3d"),
                 skip("meshgrid", "list_of_tensors"),
                 skip("meshgrid", "variadic_tensors"),
                 skip("broadcast_tensors"),
                 skip("linalg.lstsq"),
                 skip("nn.functional.bilinear"),
                 skip("native_layer_norm"),
                 skip("ormqr"),
                 # Not actually a problem
                 xfail("NumpyCubeNotComposableAutogradFunction"),  # not composable
                 xfail(
                     "NumpyExpMarkDirtyAutogradFunction"
                 ),  # vmap: inplace into a regular tensor
                 # Potential bugs/errors
                 xfail("as_strided"),  # AssertionError: Tensor-likes are not close!
                 xfail(
                     "as_strided", "partial_views"
                 ),  # AssertionError: Tensor-likes are not close!
                 xfail("as_strided_copy"),  # AssertionError: Tensor-likes are not close!
                 xfail(
                     "as_strided_scatter"
                 ),  # AssertionError: Tensor-likes are not close!
                 xfail("bernoulli"),  # calls random op
                 xfail("bfloat16"),  # required rank 4 tensor to use channels_last format
                 xfail("cdist"),  # Forward AD not implemented and no decomposition
                 xfail("cdouble"),  # required rank 4 tensor to use channels_last format
                 xfail("cfloat"),  # required rank 4 tensor to use channels_last format
                 xfail("chalf"),  # required rank 4 tensor to use channels_last format
                 xfail("cholesky"),  # Forward AD not implemented and no decomposition
                 xfail("ormqr"),  # Forward AD not implemented and no decomposition
                 xfail("double"),  # required rank 4 tensor to use channels_last format
                 xfail("float"),  # required rank 4 tensor to use channels_last format
                 xfail("half"),  # required rank 4 tensor to use channels_last format
                 xfail("index_reduce", "prod"),  # NYI: forward AD for index_reduce
                 xfail("index_reduce", "mean"),  # NYI: forward AD for index_reduce
                 xfail("index_reduce", "amax"),  # NYI: forward AD for index_reduce
                 xfail("index_reduce", "amin"),  # NYI: forward AD for index_reduce
                 xfail(
                     "mvlgamma", "mvlgamma_p_1"
                 ),  # vmap: inplace into a regular tensor
                 xfail(
                     "mvlgamma", "mvlgamma_p_3"
                 ),  # vmap: inplace into a regular tensor
                 xfail(
                     "mvlgamma", "mvlgamma_p_5"
                 ),  # vmap: inplace into a regular tensor
                 xfail("nanquantile"),  # Batching rule not implemented for aten::equal
                 # RuntimeError: Batch norm got a batched tensor as input while the
                 # running_mean or running_var, which will be updated in place,
                 # were not batched.
                 xfail("nn.functional.batch_norm"),
                 xfail("nn.functional.batch_norm", "without_cudnn"),
                 xfail(
                     "nn.functional.ctc_loss"
                 ),  # ForwardAD not implemented and no decomposition
                 xfail("nn.functional.dropout2d"),  # calls random op
                 xfail("nn.functional.dropout3d"),  # calls random op
                 xfail("nn.functional.dropout"),  # calls random op
                 xfail("nn.functional.scaled_dot_product_attention"),  # randomness
                 xfail("torch.ops.aten._efficient_attention_forward"),  # outputs ints
                 xfail("nn.functional.multi_head_attention_forward"),  # randomness
                 xfail(
                     "nn.functional.embedding_bag"
                 ),  # Forward AD not implemented and no decomposition
                 xfail("nn.functional.alpha_dropout"),  # calls randomn op
                 xfail(
                     "nn.functional.feature_alpha_dropout", "with_train"
                 ),  # calls random op
                 xfail("nn.functional.fractional_max_pool2d"),  # calls random op
                 xfail("nn.functional.fractional_max_pool3d"),  # calls random op
                 xfail("nn.functional.gaussian_nll_loss"),  # data depenedant flow
                 xfail(
                     "nn.functional.grid_sample"
                 ),  # Forward AD not implemented and no decomposition
                 xfail(
                     "grid_sampler_2d"
                 ),  # Forward AD not implemented and no decomposition
                 xfail(
                     "nn.functional.hardsigmoid"
                 ),  # Forward AD not implemented and no decomposition
                 xfail(
                     "nn.functional.hinge_embedding_loss"
                 ),  # vmap: inplace into a regular tensor
                 xfail(
                     "nn.functional.huber_loss"
                 ),  # Forward AD not implemented and no decomposition
                 # RuntimeError: Batch norm got a batched tensor as input while the
                 # running_mean or running_var, which will be updated in place,
                 # were not batched.
                 xfail("nn.functional.instance_norm"),
                 # NYI: Tensor.clone(memory_format) inside vmap is only supported with
                 # memory_format torch.preserve_format or torch.contiguous_format (got ChannelsLast)
                 xfail("nn.functional.max_unpool2d"),
                 xfail("nn.functional.max_unpool2d", "grad"),
                 xfail(
                     "nn.functional.multi_margin_loss"
                 ),  # Forward AD not implemented and no decomposition
                 xfail(
                     "nn.functional.multilabel_margin_loss"
                 ),  # Forward AD not implemented and no decomposition
                 xfail(
                     "nn.functional.pdist"
                 ),  # Forward AD not implemented and no decomposition
                 xfail(
                     "nn.functional.rrelu"
                 ),  # vmap: we do not yet support aten::rrelu_with_noise.
                 xfail(
                     "nn.functional.soft_margin_loss"
                 ),  # Forward AD not implemented and no decomposition
                 xfail("normal"),  # calls random op
                 xfail("normal", "number_mean"),  # calls random op
                 xfail("pca_lowrank"),  # calls random op
                 xfail("quantile"),  # Batching rule not implemented for aten::equal
                 xfail(
                     "scatter_reduce", "prod"
                 ),  # Forward AD not implemented and no decomposition
                 xfail(
                     "_segment_reduce", "lengths"
                 ),  # Forward AD not implemented and no decomposition
                 xfail(
                     "_segment_reduce", "offsets"
                 ),  # Forward AD not implemented and no decomposition
                 xfail(
                     "sparse.sampled_addmm"
                 ),  # RuntimeError: Sparse CSR tensors do not have strides
                 xfail(
                     "sparse.mm", "reduce"
                 ),  # RuntimeError: Sparse CSR tensors do not have strides
                 xfail("svd_lowrank"),  # calls random op
                 xfail(
                     "to"
                 ),  # RuntimeError: required rank 4 tensor to use channels_last format
                 xfail("to_sparse"),  # Forward AD not implemented and no decomposition
                 xfail(
                     "view_as_complex"
                 ),  # RuntimeError: Tensor must have a last dimension with stride 1
                 # RuntimeError: Batch norm got a batched tensor as
                 # input while the running_mean or running_var, which will be updated in
                 # place, were not batched.
                 xfail("native_batch_norm"),
                 xfail("_native_batch_norm_legit"),
                 # TODO: implement batching rule
                 xfail("_batch_norm_with_update"),
                 xfail("native_dropout_backward"),
             }
         ),
     )
     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @opsToleranceOverride(
         "TestOperators",
         "test_vmapjvpvjp",
         (
             tol1("linalg.svd", {torch.float32: tol(atol=5e-04, rtol=5e-04)}),
             tol1(
                 "linalg.householder_product",
                 {torch.float32: tol(atol=5e-03, rtol=5e-03)},
             ),
             tol1("linalg.multi_dot", {torch.float32: tol(atol=5e-04, rtol=5e-04)}),
             tol2(
                 "linalg.pinv", "hermitian", {torch.float32: tol(atol=5e-04, rtol=5e-04)}
             ),
             tol1(
                 "nn.functional.conv_transpose2d",
                 {torch.float32: tol(atol=5e-04, rtol=5e-04)},
             ),
             tol1("svd", {torch.float32: tol(atol=5e-04, rtol=5e-04)}),
             tol1("matrix_exp", {torch.float32: tol(atol=5e-04, rtol=5e-04)}),
         ),
     )
     def test_vmapjvpvjp(self, device, dtype, op):
         # Since we test `jvpvjp` separately,
         # in this we just check that vmap of `jvpvjp`
         # is correct.
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
             return

         samples = op.sample_inputs(device, dtype, requires_grad=True)

         # TODO: test in-place
         if is_inplace(op, op.get_op()):
             self.skipTest("Skipped! NYI: inplace-testing not supported.")
             return

         for sample in samples:
             fn, primals = normalize_op_input_output(op, sample)
             result = fn(*primals)
             cotangents = tree_map(lambda x: torch.randn_like(x), result)

             primals_tangents = tree_map(lambda x: torch.randn_like(x), primals)
             cotangents_tangents = tree_map(lambda x: torch.randn_like(x), cotangents)

             def push_vjp(primals, cotangents):
                 _, vjp_fn = vjp(fn, *primals)
                 return vjp_fn(cotangents)

             args, spec = tree_flatten(
                 ((primals, cotangents), (primals_tangents, cotangents_tangents))
             )

             def jvp_of_vjp(*args):
                 (primals, tangents) = tree_unflatten(args, spec)
                 primals_out, tangents_out = jvp(push_vjp, primals, tangents)

                 flat_primals_out = pytree.tree_leaves(primals_out)
                 flat_tangents_out = pytree.tree_leaves(tangents_out)
                 return tuple(flat_primals_out + flat_tangents_out)

             is_batch_norm_and_training = is_batch_norm_training(op, sample.kwargs)
             generator = get_fallback_and_vmap_exhaustive(
                 jvp_of_vjp,
                 args,
                 {},
                 is_batch_norm_and_training=is_batch_norm_and_training,
             )
             for loop_out, batched_out in generator:
                 self.assertEqual(loop_out, batched_out)

     def _make_extremal_inputs(self, shape, device):
         if shape is None:
             return (None,)
         return (
             torch.full(shape, -1000.0, device=device),
             torch.zeros(shape, device=device),
             torch.full(shape, 1000.0, device=device),
         )

     def _arg_and_kwarg_options(self, args_options, kwargs_options):
         return itertools.product(*args_options, kwargs_options)

     def test_extremal_numerics_nll_loss(self, device):
         N, C = 3, 4
         d1, d2, d3 = 5, 6, 7
         shapes = (
             ((N, C), (N,), (C,)),
             ((N, C), (N,), None),
             ((N, C, d1, d2, d3), (N, d1, d2, d3), (C,)),
             ((N, C, d1, d2, d3), (N, d1, d2, d3), None),
         )
         kwargs_options = (
             {"ignore_index": 0, "reduction": "mean"},
             {"reduction": "sum"},
             {"reduction": "none"},
             {},
         )
         for input_shape, target_shape, weight_shape in shapes:
             input_options = self._make_extremal_inputs(input_shape, device)
             for input, kwargs in self._arg_and_kwarg_options(
                 (input_options,), kwargs_options
             ):
                 if weight_shape is None:
                     weight = None
                 else:
                     weight = torch.randn(weight_shape, device=device)
                 target = torch.randint(0, C, target_shape, device=device)
                 target[
                     0
                 ] = 1  # since we're ignoring index 0, at least one element must be non-zero

                 fn = functools.partial(
                     torch.nn.functional.nll_loss, target=target, weight=weight, **kwargs
                 )
                 result = fn(input)
                 cotangents = torch.randn_like(result, device=device)
                 self._compare_jacobians_of_vjp(fn, (cotangents, input))

     def test_extremal_numerics_l1_loss(self, device):
         N, C, H, W = 3, 4, 5, 6
         shapes = ((N, C), (N, C, H), (N, C, H, W))
         kwargs_options = ({"reduction": "sum"}, {"reduction": "none"}, {})
         for shape in shapes:
             input_options = self._make_extremal_inputs(shape, device)
             target_options = self._make_extremal_inputs(shape, device)
             for input, target, kwargs in self._arg_and_kwarg_options(
                 (input_options, target_options), kwargs_options
             ):
                 result = torch.nn.functional.l1_loss(input, target)
                 cotangents = torch.randn_like(result, device=device)
                 self._compare_jacobians_of_vjp(
                     torch.nn.functional.l1_loss, (cotangents, input, target)
                 )

     def test_extremal_numerics_mse_loss(self, device):
         N, C, H, W = 3, 4, 5, 6
         shapes = ((N, C), (N, C, H), (N, C, H, W))
         kwargs_options = ({"reduction": "sum"}, {"reduction": "none"}, {})
         for shape in shapes:
             input_options = self._make_extremal_inputs(shape, device)
             target_options = self._make_extremal_inputs(shape, device)
             for input, target, kwargs in self._arg_and_kwarg_options(
                 (input_options, target_options), kwargs_options
             ):
                 result = torch.nn.functional.mse_loss(input, target)
                 cotangents = torch.randn_like(result, device=device)
                 self._compare_jacobians_of_vjp(
                     torch.nn.functional.mse_loss, (cotangents, input, target)
                 )

     def test_extremal_numerics_softmax(self, device):
         N, C, H, W = 3, 4, 5, 6
         shapes = ((N, C), (N, C, H), (N, C, H, W))
         kwargs_options = ({"dim": 1}, {})
         for shape in shapes:
             input_options = self._make_extremal_inputs(shape, device)
             for input, kwargs in self._arg_and_kwarg_options(
                 (input_options,), kwargs_options
             ):
                 result = torch.nn.functional.softmax(input)
                 cotangents = torch.randn_like(result, device=device)
                 self._compare_jacobians_of_vjp(
                     torch.nn.functional.softmax, (cotangents, input)
                 )

     def test_extremal_numerics_log_softmax(self, device):
         N, C, H, W = 3, 4, 5, 6
         shapes = ((N, C), (N, C, H), (N, C, H, W))
         kwargs_options = ({"dim": 1}, {})
         for shape in shapes:
             input_options = self._make_extremal_inputs(shape, device)
             for input, kwargs in self._arg_and_kwarg_options(
                 (input_options,), kwargs_options
             ):
                 result = torch.nn.functional.log_softmax(input)
                 cotangents = torch.randn_like(result, device=device)
                 self._compare_jacobians_of_vjp(
                     torch.nn.functional.log_softmax, (cotangents, input)
                 )

     def test_extremal_numerics_cross_entropy(self, device):
         N, C = 3, 4
         d1, d2, d3 = 5, 6, 7
         shapes = (
             ((N, C), (N,), (C,)),
             ((N, C), (N,), None),
             ((N, C), (N, C), (C,)),
             ((N, C), (N, C), None),
             ((C,), (), (C,)),
             ((C,), (), None),
             ((C,), (C,), (C,)),
             ((C,), (C,), None),
             ((N, C, d1, d2, d3), (N, d1, d2, d3), (C,)),
             ((N, C, d1, d2, d3), (N, d1, d2, d3), None),
             ((N, C, d1, d2, d3), (N, C, d1, d2, d3), (C,)),
             ((N, C, d1, d2, d3), (N, C, d1, d2, d3), None),
         )
         for input_shape, target_shape, weight_shape in shapes:
             input_options = self._make_extremal_inputs(input_shape, device)
             kwargs_options = [{"reduction": "sum"}, {"reduction": "none"}, {}]
             if input_shape != target_shape:
                 kwargs_options.append({"ignore_index": 0, "reduction": "mean"})

             for input, kwargs in self._arg_and_kwarg_options(
                 (input_options,), kwargs_options
             ):
                 if weight_shape is None:
                     weight = None
                 else:
                     weight = torch.randn(weight_shape, device=device)

                 if input_shape == target_shape:
                     target = torch.rand(target_shape, device=device)
                 elif len(target_shape) == 0:
                     target = torch.tensor(
                         1, device=device
                     )  # must be non-zero since ignore_index may be 0
                 else:
                     target = torch.randint(0, C, target_shape, device=device)

                 fn = functools.partial(
                     torch.nn.functional.cross_entropy,
                     target=target,
                     weight=weight,
                     **kwargs,
                 )
                 result = fn(input)
                 cotangents = torch.randn_like(result, device=device)
                 self._compare_jacobians_of_vjp(
                     fn, (cotangents, input), atol_rtol=(1e-4, 1e-5)
                 )

     def test_extremal_numerics_binary_cross_entropy(self, device):
         N, C, H, W = 3, 4, 5, 6
         shapes = ((N, C), (N, C, H), (N, C, H, W))
         for shape in shapes:
             weight_options = self._make_extremal_inputs(shape, device)
             kwargs_options = [{"reduction": "sum"}, {"reduction": "none"}, {}]

             for weight, kwargs in self._arg_and_kwarg_options(
                 (weight_options,), kwargs_options
             ):
                 input = torch.rand(shape, device=device)
                 target = torch.rand(shape, device=device)
                 fn = functools.partial(
                     torch.nn.functional.binary_cross_entropy,
                     target=target,
                     weight=weight,
                     **kwargs,
                 )
                 result = fn(input)
                 cotangents = torch.randn_like(result, device=device)
                 self._compare_jacobians_of_vjp(
                     fn, (cotangents, input), atol_rtol=(1e-4, 2e-5)
                 )

     def test_extremal_numerics_layer_norm(self, device):
         N, C, H, W = 3, 4, 5, 6
         shapes = ((N, C), (N, C, H), (N, C, H, W))
         for shape in shapes:
             input_options = self._make_extremal_inputs(shape, device)
             normalized_shape = shape[1:]
             weight_options = self._make_extremal_inputs(normalized_shape, device)
             bias_options = self._make_extremal_inputs(normalized_shape, device)

             for input, bias, weight in self._arg_and_kwarg_options(
                 (input_options, bias_options, weight_options), ()
             ):

                 def fn(input, weight, bias):
                     return torch.nn.functional.layer_norm(
                         input, normalized_shape, weight=weight, bias=bias
                     )

                 result = fn(input, weight, bias)
                 cotangents = torch.randn_like(result, device=device)
                 self._compare_jacobians_of_vjp(fn, (cotangents, input, weight, bias))

     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
     @ops(
         op_db + additional_op_db + autograd_function_db,
         allowed_dtypes=(torch.float32, torch.double),
     )
     @skipOps(
         "TestOperators",
         "test_vmap_autograd_grad",
         {
             # The size of tensor a (4) must match the size of tensor b (10) at non-singleton dimension 0
             xfail("masked_select"),
             xfail("nn.functional.max_unpool2d", "grad"),  # contiguous call
             xfail("nn.functional.max_unpool2d"),  # contiguous call
             xfail("to_sparse"),  # dispatch key issue
             xfail("torch.ops.aten._efficient_attention_forward"),  # outputs ints
             # https://github.com/pytorch/pytorch/issues/96560#issuecomment-2151063723
             # ** minor accuracy issue for float32 on ROCm
             decorate("xlogy", decorator=skipIfRocm),
             # numerical inconsistencies, look like bugs
             skip(
                 "matrix_exp", dtypes=(torch.float32,), device_type="cuda"
             ),  # fails on linux, passes on windows
             skip(
                 "ldexp", dtypes=(torch.float32,), device_type="cpu"
             ),  # fails on all but mac
             skip("__rmatmul__"),  # flaky needs investigation
             skip("matmul"),  # flaky needs investigation
             skip("nn.functional.conv_transpose3d"),  # flaky needs investigation
             skip("nn.functional.conv_transpose2d"),  # flaky needs investigation
             skip("nn.functional.conv_transpose1d"),  # flaky needs investigation
             skip(
                 "nn.functional.layer_norm", dtypes=(torch.float32,), device_type="cpu"
             ),  # fails on windows
             skip(
                 "linalg.lu_factor", dtypes=(torch.float32,), device_type="cuda"
             ),  # fails on all but windows
             skip(
                 "linalg.lu_factor_ex", dtypes=(torch.float32,), device_type="cuda"
             ),  # fails on all but windows
             skip("linalg.multi_dot", "", device_type="cpu"),
             skip("sparse.sampled_addmm", ""),
             skip("sparse.mm", "reduce"),
             skip("native_layer_norm", "", device_type="cpu"),
             # RuntimeError: Expected contiguous tensor, but got
             # non-contiguous tensor for argument #2 'grad_output'
             decorate(
                 "_batch_norm_with_update",
                 decorator=expectedFailureIf(TEST_WITH_ROCM),
                 device_type="cuda",
             ),
         },
     )
     @opsToleranceOverride(
         "TestOperators",
         "test_vmap_autograd_grad",
         (
             tol1(
                 "ldexp",
                 {torch.float32: tol(atol=3e-04, rtol=1.6e-06)},
                 device_type="cuda",
             ),
             tol1(
                 "linalg.householder_product",
                 {torch.float32: tol(atol=5e-04, rtol=9e-03)},
                 device_type="cuda",
             ),
             tol1(
                 "linalg.householder_product",
                 {torch.float32: tol(atol=6e-03, rtol=1e-03)},
                 device_type="cpu",
             ),
             tol1(
                 "linalg.multi_dot",
                 {torch.float32: tol(atol=2e-04, rtol=1e-04)},
                 device_type="cuda",
             ),
             tol2(
                 "linalg.pinv", "hermitian", {torch.float32: tol(atol=5e-06, rtol=5e-06)}
             ),
             tol1("nn.functional.conv3d", {torch.float32: tol(atol=5e-04, rtol=9e-03)}),
             tol1(
                 "nn.functional.conv2d",
                 {torch.float32: tol(atol=3e-05, rtol=5e-06)},
                 device_type="cuda",
             ),
             tol1("svd_lowrank", {torch.float32: tol(atol=5e-05, rtol=5e-05)}),
             tol1("pca_lowrank", {torch.float32: tol(atol=5e-05, rtol=5e-05)}),
         ),
     )
     def test_vmap_autograd_grad(self, device, dtype, op):
         def is_differentiable(inp):
             return isinstance(inp, Tensor) and (
                 inp.grad_fn is not None or inp.requires_grad
             )

         def get_flat_differentiable(tree):
             flattened = pytree.tree_leaves(tree)
             return tuple(i for i in flattened if is_differentiable(i))

         def get_differentiable_linked(list1, list2):
             paired_list = zip(list1, list2)
             paired_list = tuple(
                 (first, second)
                 for (first, second) in paired_list
                 if is_differentiable(first)
             )
             return zip(*paired_list)

         def filter_none(out):
             flattened = pytree.tree_leaves(out)
             return tuple(o for o in flattened if o is not None)

         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
             return

         sample_inputs = op.sample_inputs(device, dtype, requires_grad=True)

         for sample_input in sample_inputs:
             fn, primals = normalize_op_input_output(op, sample_input)
             out = fn(*primals)
             cotangents = tree_map(torch.randn_like, out)

             def compute_grad(cotangents):
                 out_flattened = out
                 cotangents_flattened = cotangents
                 if not isinstance(out_flattened, torch.Tensor):
                     out_flattened = pytree.tree_leaves(out)
                     cotangents_flattened = pytree.tree_leaves(cotangents)
                     out_flattened, cotangents_flattened = get_differentiable_linked(
                         out_flattened, cotangents_flattened
                     )

                 return filter_none(
                     torch.autograd.grad(
                         out_flattened,
                         get_flat_differentiable(primals),
                         cotangents_flattened,
                         retain_graph=True,
                         allow_unused=True,
                     )
                 )

             is_batch_norm_and_training = is_batch_norm_training(op, sample_input.kwargs)
             generator = get_fallback_and_vmap_exhaustive(
                 compute_grad,
                 (cotangents,),
                 {},
                 is_batch_norm_and_training=is_batch_norm_and_training,
             )
             for loop_out, batched_out in generator:
                 self.assertEqual(loop_out, batched_out)

     def test_vmapvmapjvp_linalg_solve(self):
         ops = [op for op in op_db if op.name == "linalg.solve"]
         assert len(ops) > 0

         # this specializes a lot of code from the get_fallback_and_vmap_exhaustive test. If we need this more
         # generally, this could go for a refactor

         B0 = 2
         B1 = 3

         # we want to check the case where A will be seen as contiguous by jvp but during the vmap calls will become
         # non-contiguous because vmap will expand. This will happen during both levels of vmap
         A = torch.randn(4, 4)
         k = torch.randn(4, 5, B1, B0)
         fn, args = get_jvp_variant_primals_tangents(
             torch.linalg.solve, SampleInput(A, args=(k,))
         )

         in_dims_all = (None, -1, None, -1)
         batched_out = vmap(vmap(fn, in_dims=in_dims_all), in_dims=in_dims_all)(*args)
         loop_out = loop2(fn, in_dims_all, in_dims_all, 0, 0, B0, B1, *args)
         self.assertEqual(loop_out, batched_out)

     @ops(
         filter(lambda op: op.name in aliasing_ops, op_db + additional_op_db),
         allowed_dtypes=(torch.float,),
     )
     @parametrize("grad_op", ["jvp", "vjp"])
     def test_view_then_inplace(self, device, dtype, op, grad_op):
         for sample_input in op.sample_inputs(device, dtype):

             def f(x):
                 op(sample_input.input, *sample_input.args, **sample_input.kwargs).copy_(
                     x
                 )
                 return x

             without_grad = op(
                 sample_input.input, *sample_input.args, **sample_input.kwargs
             )
             if grad_op == "jvp":
                 with self.assertRaisesRegex(
                     RuntimeError,
                     "During a grad .* attempted to call in-place operation",
                 ):
                     jvp(
                         f,
                         (torch.randn_like(without_grad),),
                         (torch.randn_like(without_grad),),
                     )
             else:
                 assert grad_op == "vjp"
                 with self.assertRaisesRegex(
                     RuntimeError,
                     "During a grad .* attempted to call in-place operation",
                 ):
                     vjp(f, torch.randn_like(without_grad))

     @ops(
         filter(
             lambda op: op.name in aliasing_ops_list_return, op_db + additional_op_db
         ),
         allowed_dtypes=(torch.float,),
     )
     @parametrize("grad_op", ["jvp", "vjp"])
     def test_view_then_inplace_list_return(self, device, dtype, op, grad_op):
         for sample_input in op.sample_inputs(device, dtype):

             def f(x):
                 op(sample_input.input, *sample_input.args, **sample_input.kwargs)[
                     0
                 ].copy_(x)
                 return x

             without_grad = op(
                 sample_input.input, *sample_input.args, **sample_input.kwargs
             )[0]
             with self.assertRaisesRegex(
                 RuntimeError, "During a grad .* attempted to call in-place operation"
             ):
                 if grad_op == "jvp":
                     jvp(
                         f,
                         (torch.randn_like(without_grad),),
                         (torch.randn_like(without_grad),),
                     )
                 else:
                     assert grad_op == "vjp"
                     vjp(f, torch.randn_like(without_grad))

     @parametrize("grad_op", ["jvp", "vjp"])
     def test_view_then_inplace_special(self, grad_op):
         # some things in __getitem__ use at::index, which doesn't alias, so this tests a subset of them that do alias
         ops = [
             lambda x: x[0],
             lambda x: x[0, 0, 0],
             lambda x: x[:1],
             lambda x: x[:, :1],
             lambda x: x[:, :1, :],
         ]

         for op in ops:

             def f(x):
                 op(captured).copy_(x)
                 return x

             captured = torch.randn(4, 3, 3)
             without_grad = op(captured)
             if grad_op == "jvp":
                 with self.assertRaisesRegex(
                     RuntimeError,
                     "During a grad .* attempted to call in-place operation",
                 ):
                     jvp(
                         f,
                         (torch.randn_like(without_grad),),
                         (torch.randn_like(without_grad),),
                     )
             else:
                 assert grad_op == "vjp"
                 with self.assertRaisesRegex(
                     RuntimeError,
                     "During a grad .* attempted to call in-place operation",
                 ):
                     vjp(f, torch.randn_like(without_grad))

     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
     # NOTE: [three-transform testing]
     # We only test the autograd_function_db tests here.
     #
     # Usually testing the composition of two transforms is sufficient to convince
     # ourselves that an operator is correctly implemented. For the following cases,
     # we want to be extra sure, so we send those through some three-transform tests:
     # - autograd.Function. The mechanism is via PyDispatcher/HigherOrderOperator, not the
     #   regular PyTorch dispatcher, so it's good to exercise more caution.
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps(
         "TestOperators",
         "test_vmapvjpvmap",
         {
             xfail("NumpyCubeNotComposableAutogradFunction"),  # Not composable
         },
     )
     def test_vmapvjpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
         B = 2
         for sample in samples:
             args = [sample.input] + list(sample.args)
             kwargs = sample.kwargs
             generator = generate_vmap_inputs(args, kwargs, batch_size=B)
             for batched_args, in_dims, kwargs in generator:
                 inner_vmapped_op = vmap(op, in_dims)
                 inner_mapped_op = functools.partial(loop, op, in_dims, 0, B)

                 inner_vmapped_fn, primals = normalize_op_input_output2(
                     inner_vmapped_op,
                     batched_args,
                     kwargs,
                     sample.output_process_fn_grad,
                 )
                 inner_mapped_fn, _ = normalize_op_input_output2(
                     inner_mapped_op, batched_args, kwargs, sample.output_process_fn_grad
                 )
                 result = inner_mapped_fn(*primals)
                 cotangents = tree_map(lambda x: torch.rand_like(x), result)

                 def apply_vjp(fn):
                     def inner(primals, cotangents):
                         _, vjp_fn = vjp(fn, *primals)
                         return vjp_fn(cotangents)

                     return inner

                 vjpvmap_fn = apply_vjp(inner_vmapped_fn)
                 vjpmap_fn = apply_vjp(inner_mapped_fn)
                 batched_args = (primals, cotangents)
                 generator = generate_vmap_inputs(batched_args, {})

                 for batched_args, in_dims, _ in generator:
                     # strategy: compare vmap(vjp(vmap(op)) vs map(vjp(map(op))
                     vmapvjpvmap_fn = vmap(vjpvmap_fn, in_dims)
                     mapvjpmap_fn = functools.partial(loop, vjpmap_fn, in_dims, 0, B)

                     result = vmapvjpvmap_fn(*batched_args)
                     expected = mapvjpmap_fn(*batched_args)
                     self.assertEqual(result, expected)

     # See NOTE: [three-transform testing]
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps(
         "TestOperators",
         "test_vjpvmapvmap",
         {
             xfail("NumpyCubeNotComposableAutogradFunction"),  # Not composable
         },
     )
     def test_vjpvmapvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
         B = 2
         for sample in samples:
             args = [sample.input] + list(sample.args)
             kwargs = sample.kwargs
             generator = generate_vmap_inputs(args, kwargs, batch_size=B)
             for batched_args, inner_in_dims, kwargs in generator:
                 inner_vmapped_op = vmap(op, inner_in_dims)
                 inner_mapped_op = functools.partial(loop, op, inner_in_dims, 0, B)
                 generator = generate_vmap_inputs(batched_args, kwargs)
                 for batched_args, in_dims, kwargs in generator:
                     # strategy: compare vjp(vmap(vmap(op)) vs vjp(map(map(op))
                     vmapped_op = vmap(inner_vmapped_op, in_dims)
                     mapped_op = functools.partial(loop, inner_mapped_op, in_dims, 0, B)

                     vmapped_fn, primals = normalize_op_input_output2(
                         vmapped_op, batched_args, kwargs, sample.output_process_fn_grad
                     )
                     mapped_fn, _ = normalize_op_input_output2(
                         mapped_op, batched_args, kwargs, sample.output_process_fn_grad
                     )

                     result = mapped_fn(*primals)
                     cotangents = tree_map(lambda x: torch.rand_like(x), result)

                     _, vjp_fn = vjp(mapped_fn, *primals)
                     expected_vjps = vjp_fn(cotangents)

                     _, vjp_fn = vjp(vmapped_fn, *primals)
                     result_vjps = vjp_fn(cotangents)

                     self.assertEqual(result_vjps, expected_vjps)

     # See NOTE: [three-transform testing]
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps(
         "TestOperators",
         "test_vjpvjpvmap",
         {
             xfail("NumpyCubeNotComposableAutogradFunction"),  # Not composable
         },
     )
     def test_vjpvjpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
         B = 2
         for sample in samples:
             args = [sample.input] + list(sample.args)
             kwargs = sample.kwargs
             generator = generate_vmap_inputs(args, kwargs, batch_size=B)
             for batched_args, in_dims, kwargs in generator:
                 inner_vmapped_op = vmap(op, in_dims)
                 inner_mapped_op = functools.partial(loop, op, in_dims, 0, B)

                 vjpmap_fn, args = get_vjpfull_variant2(
                     inner_mapped_op, batched_args, kwargs
                 )
                 vjpvmap_fn, _ = get_vjpfull_variant2(
                     inner_vmapped_op, batched_args, kwargs
                 )

                 vjpvjpvmap_fn, new_args = get_vjpfull_variant2(vjpvmap_fn, args, {})
                 vjpvjpmap_fn, _ = get_vjpfull_variant2(vjpmap_fn, args, {})

                 expected = vjpvjpmap_fn(*new_args)
                 result = vjpvjpvmap_fn(*new_args)
                 self.assertEqual(result, expected)

     # We're generally convinced that jvp x vmap works (vmap turns an operator
     # into another operator and we test jvp support for operators). So
     # we only test it on the things we're not sure about:
     # - the autograd.Function <> functorch interaction
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps(
         "TestOperators",
         "test_jvpvmap",
         {
             xfail("NumpyCubeNotComposableAutogradFunction"),  # Not composable
         },
     )
     def test_jvpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
         B = 2
         for sample in samples:
             args = [sample.input] + list(sample.args)
             kwargs = sample.kwargs
             generator = generate_vmap_inputs(args, kwargs, batch_size=B)
             for batched_args, in_dims, kwargs in generator:
                 inner_vmapped_op = vmap(op, in_dims)
                 inner_mapped_op = functools.partial(loop, op, in_dims, 0, B)

                 jvpvmap_op, primals = get_jvp_variant_primals_tangents2(
                     inner_vmapped_op,
                     batched_args,
                     kwargs,
                     sample.output_process_fn_grad,
                 )
                 jvpmap_op, _ = get_jvp_variant_primals_tangents2(
                     inner_mapped_op, batched_args, kwargs, sample.output_process_fn_grad
                 )

                 expected = jvpmap_op(*primals)
                 result = jvpvmap_op(*primals)
                 self.assertEqual(result, expected)

     # See NOTE: [three-transform testing]
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps(
         "TestOperators",
         "test_jvpvmapvmap",
         {
             xfail("NumpyCubeNotComposableAutogradFunction"),  # Not composable
         },
     )
     def test_jvpvmapvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
         B = 2
         for sample in samples:
             args = [sample.input] + list(sample.args)
             kwargs = sample.kwargs
             generator = generate_vmap_inputs(args, kwargs, batch_size=B)
             for batched_args, inner_in_dims, kwargs in generator:
                 inner_vmapped_op = vmap(op, inner_in_dims)
                 inner_mapped_op = functools.partial(loop, op, inner_in_dims, 0, B)
                 generator = generate_vmap_inputs(batched_args, kwargs)
                 for batched_args, in_dims, kwargs in generator:
                     # strategy: compare jvp(vmap(vmap(op)) vs jvp(map(map(op))
                     vmapped_op = vmap(inner_vmapped_op, in_dims)
                     mapped_op = functools.partial(loop, inner_mapped_op, in_dims, 0, B)

                     jvpvmapvmap_fn, primals = get_jvp_variant_primals_tangents2(
                         vmapped_op, batched_args, kwargs, sample.output_process_fn_grad
                     )
                     jvpmapmap_fn, _ = get_jvp_variant_primals_tangents2(
                         mapped_op, batched_args, kwargs, sample.output_process_fn_grad
                     )

                     expected = jvpmapmap_fn(*primals)
                     result = jvpvmapvmap_fn(*primals)
                     self.assertEqual(result, expected)

     # See NOTE: [three-transform testing]
     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps(
         "TestOperators",
         "test_vmapjvpvmap",
         {
             xfail("NumpyCubeNotComposableAutogradFunction"),  # Not composable
         },
     )
     def test_vmapjvpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
         B = 2
         for sample in samples:
             args = [sample.input] + list(sample.args)
             kwargs = sample.kwargs
             generator = generate_vmap_inputs(args, kwargs, batch_size=B)
             for batched_args, in_dims, kwargs in generator:
                 inner_vmapped_op = vmap(op, in_dims)
                 inner_mapped_op = functools.partial(loop, op, in_dims, 0, B)

                 jvpvmap_fn, primals = get_jvp_variant_primals_tangents2(
                     inner_vmapped_op,
                     batched_args,
                     kwargs,
                     sample.output_process_fn_grad,
                 )
                 jvpmap_fn, _ = get_jvp_variant_primals_tangents2(
                     inner_mapped_op, batched_args, kwargs, sample.output_process_fn_grad
                 )

                 generator = generate_vmap_inputs(primals, {})

                 for batched_args, in_dims, _ in generator:
                     # strategy: compare vmap(jvp(vmap(op)) vs map(jvp(map(op))
                     vmapjvpvmap_fn = vmap(jvpvmap_fn, in_dims)
                     mapjvpmap_fn = functools.partial(loop, jvpmap_fn, in_dims, 0, B)

                     result = vmapjvpvmap_fn(*batched_args)
                     expected = mapjvpmap_fn(*batched_args)
                     self.assertEqual(result, expected)

     # See NOTE: [three-transform testing]
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps(
         "TestOperators",
         "test_jvpjvpvmap",
         {
             xfail("NumpyCubeNotComposableAutogradFunction"),  # Not composable
         },
     )
     def test_jvpjvpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
         B = 2
         for sample in samples:
             args = [sample.input] + list(sample.args)
             kwargs = sample.kwargs
             generator = generate_vmap_inputs(args, kwargs, batch_size=B)
             for batched_args, in_dims, kwargs in generator:
                 inner_vmapped_op = vmap(op, in_dims)
                 inner_mapped_op = functools.partial(loop, op, in_dims, 0, B)

                 jvpmap_fn, args = get_jvp_variant_primals_tangents2(
                     inner_mapped_op, batched_args, kwargs, sample.output_process_fn_grad
                 )
                 jvpvmap_fn, _ = get_jvp_variant_primals_tangents2(
                     inner_vmapped_op,
                     batched_args,
                     kwargs,
                     sample.output_process_fn_grad,
                 )

                 jvpjvpvmap_fn, new_args = get_jvp_variant_primals_tangents2(
                     jvpvmap_fn, args, {}
                 )
                 jvpjvpmap_fn, _ = get_jvp_variant_primals_tangents2(jvpmap_fn, args, {})

                 expected = jvpjvpmap_fn(*new_args)
                 result = jvpjvpvmap_fn(*new_args)
                 self.assertEqual(result, expected)

     # See NOTE: [three-transform testing]
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps(
         "TestOperators",
         "test_jvpvjpvmap",
         {
             xfail("NumpyCubeNotComposableAutogradFunction"),  # Not composable
         },
     )
     def test_jvpvjpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
         B = 2
         for sample in samples:
             args = [sample.input] + list(sample.args)
             kwargs = sample.kwargs
             generator = generate_vmap_inputs(args, kwargs, batch_size=B)
             for batched_args, in_dims, kwargs in generator:
                 inner_vmapped_op = vmap(op, in_dims)
                 inner_mapped_op = functools.partial(loop, op, in_dims, 0, B)

                 vjpmap_fn, args = get_vjpfull_variant2(
                     inner_mapped_op, batched_args, kwargs
                 )
                 vjpvmap_fn, _ = get_vjpfull_variant2(
                     inner_vmapped_op, batched_args, kwargs
                 )

                 jvpvjpvmap_fn, new_args = get_jvp_variant_primals_tangents2(
                     vjpvmap_fn, args, {}
                 )
                 jvpvjpmap_fn, _ = get_jvp_variant_primals_tangents2(vjpmap_fn, args, {})

                 expected = jvpvjpmap_fn(*new_args)
                 result = jvpvjpvmap_fn(*new_args)
                 self.assertEqual(result, expected)

     def test_data_write_errors_under_transform(self, device):
         t = torch.randn(3, 3, device=device)

         def fn(t):
             t.data = torch.randn(3, 3)
             return t.sum()

         msg = "mutating directly with `.data` inside functorch transform"
         with self.assertRaisesRegex(RuntimeError, msg):
             grad(fn)(t)

         with self.assertRaisesRegex(RuntimeError, msg):
             vjp(fn, t)

         with self.assertRaisesRegex(RuntimeError, msg):
             jvp(fn, (t,), (torch.randn_like(t),))

     def test_tensor_with_scalar_list(self, device):
         x = torch.randn((), device=device)

         def func_list_of_scalar(x):
             return torch.tensor([x], device=device)

         def func(x):
             return torch.tensor(x, device=device).view(1)

         actual_o, actual_fn = vjp(func_list_of_scalar, x)
         expected_o, expected_fn = vjp(func, x)

         self.assertEqual(actual_o, expected_o)
         self.assertEqual(
             expected_fn(torch.ones_like(expected_o)),
             actual_fn(torch.ones_like(actual_o)),
         )


 only_for = ("cpu", "cuda")
 instantiate_device_type_tests(TestOperators, globals(), only_for=only_for)

 if __name__ == "__main__":
     run_tests()