#include <torch/csrc/autograd/FunctionsManual.h>
#include <torch/csrc/autograd/variable.h>
#include <ATen/ATen.h>
#include <ATen/Utils.h>
#include <c10/core/TensorOptions.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/WrapDimUtilsMulti.h>
#include <ATen/SparseTensorUtils.h>
#include <ATen/ExpandUtils.h>
#include <ATen/core/Reduction.h>
#include <ATen/BatchedTensorImpl.h>
#include <ATen/Dispatch.h>
#include <ATen/ScalarOps.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/IndexingUtils.h>
#include <ciso646>
#include <algorithm>
#include <numeric>
#include <functional>
// Helper functions for autogenerated code
// These used to be inlined into the codegened Functions.cpp
namespace torch {
namespace autograd {
namespace generated {
namespace details {
using at::Tensor;
using at::Scalar;
using at::IntArrayRef;
using at::TensorList;
bool isDefined(const c10::optional<Tensor>& t) {
return t.has_value() && t->defined();
}
bool isFwGradDefined(const c10::optional<Tensor>& t) {
return t.has_value() && t->defined() && t->fw_grad(/*level */ 0).defined();
}
Tensor toLegacyTensor(const c10::optional<Tensor>& t) {
return t.has_value() ? *t : Tensor();
}
Tensor toLegacyFwGrad(const c10::optional<Tensor>& t) {
return (t.has_value() && t->defined()) ? t->fw_grad(/*level */ 0) : Tensor();
}
Tensor toLegacyPrimal(const c10::optional<Tensor>& t) {
return (t.has_value() && t->defined()) ? t->_fw_primal(/*level */ 0) : Tensor();
}
void copy_range(variable_list& out, IndexRange range, const Tensor & t) {
AT_ASSERT(range.second <= out.size());
AT_ASSERTM(range.second - range.first == 1, "inconsistent range for Tensor output");
out[range.first] = t;
}
void copy_range(variable_list& out, IndexRange range, at::ArrayRef<Tensor> t) {
AT_ASSERT(range.second <= out.size());
AT_ASSERTM(range.second - range.first == t.size(), "inconsistent range for TensorList output");
std::copy(t.begin(), t.end(), out.begin() + range.first);
}
Tensor copysign_tensor_self_backward(const Tensor & grad, const Tensor & self, const Tensor & result) {
auto ratio = result / self;
ratio.masked_fill_(self == 0, 0);
return grad * ratio;
}
Tensor not_implemented(const char* name) {
throw std::runtime_error(
std::string("the derivative for '") + name + "' is not implemented");
}
Tensor maybe_multiply(const Tensor & t, const Scalar & s) {
bool is_one = false;
if (s.isFloatingPoint()) {
is_one = s.toDouble() == 1;
} else if(s.isIntegral(true)) {
is_one = s.toLong() == 1;
}
if (is_one) {
return t;
} else {
return t * s;
}
}
int64_t _safe_size(IntArrayRef sizes, IntArrayRef dim) {
int64_t size = 1;
if (sizes.size() == 0) {
return 1;
}
for (auto d : dim) {
d = at::maybe_wrap_dim(d, sizes.size());
size *= sizes[d];
}
return size;
}
static Tensor wrapped_scalar_tensor(Scalar scalar) {
auto tensor = scalar_to_tensor(scalar);
tensor.unsafeGetTensorImpl()->set_wrapped_number(true);
return tensor;
}
Tensor handle_r_to_c(ScalarType self_st, Tensor gradient_result) {
if (!at::isComplexType(self_st) && gradient_result.is_complex()) {
// R -> C
return at::real(gradient_result);
}
return gradient_result;
}
Tensor handle_r_to_c(Tensor self, Tensor gradient_result) {
if (!self.is_complex() && gradient_result.is_complex()) {
// R -> C
return at::real(gradient_result);
}
return gradient_result;
}
Tensor restore_reduced_dims(const Tensor &output, IntArrayRef dims, bool keepdim) {
if (keepdim) {
return output;
}
int64_t total_dims = output.dim() + dims.size();
std::vector<int64_t> target_shape(total_dims, 0);
for (int64_t i : dims) {
if (i < 0) {
i = total_dims + i;
}
target_shape[i] = 1;
}
int64_t j = 0;
for (int64_t i : output.sizes()) {
while (target_shape[j] > 0) j++;
target_shape[j++] = i;
}
return output.reshape(target_shape);
}
Tensor scale_grad_by_count(const Tensor &grad, const Tensor &mask, IntArrayRef dims) {
return (grad / mask.sum(dims, true)) * mask;
}
std::tuple<Tensor, Tensor> _euclidean_dist_backward(const Tensor & grad, const Tensor & x1, const Tensor & x2, const Tensor & res) {
if (!grad.defined()) {
return std::tuple<Tensor, Tensor>(Tensor(), Tensor());
}
// handle case at 0 where we return a subgradient containing 0
Tensor ratio = grad / res;
ratio.masked_fill_(res == 0, 0);
return std::tuple<Tensor, Tensor>{
x1 * ratio.sum(-1, true) - ratio.matmul(x2),
x2 * ratio.sum(-2, false).unsqueeze(-1) - ratio.transpose(-2, -1).matmul(x1)};
}
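// Gradient of the p-norm reduction. Roughly, for p outside of {0, 1, inf} the
// elementwise formula implemented below is
//   d||x||_p / dx_i = sgn(x_i) * |x_i|^(p - 1) / ||x||_p^(p - 1),
// with a subgradient of 0 substituted wherever the norm is 0.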
Tensor norm_backward(const Tensor& grad, const Tensor& self, const optional<Scalar> & p_, const Tensor& norm) {
return norm_backward(grad, self, p_, norm, {}, true);
}
Tensor norm_backward(Tensor grad, const Tensor& self, const optional<Scalar> & p_, Tensor norm, IntArrayRef dim, bool keepdim) {
size_t ndim = self.sizes().size();
double p = p_.value_or(2.0).toDouble();
Tensor self_scaled;
Tensor scale_v;
if (!keepdim && self.dim() != 0) {
grad = unsqueeze_multiple(grad, dim, ndim);
norm = unsqueeze_multiple(norm, dim, ndim);
}
if (p == 0.0) {
return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
} else if (p == 1.0) {
return self.sgn() * grad;
} else if (p == 2.0) {
self_scaled = self;
scale_v = grad / norm;
} else if (std::isinf(p)) {
Tensor is_eq_max = (self.abs() == norm).logical_or_(self.isnan().logical_and_(norm.isnan())).type_as(self);
self_scaled = self.sign() * is_eq_max;
Tensor nb_max = is_eq_max.count_nonzero(dim);
if (self.dim() != 0) {
nb_max = unsqueeze_multiple(nb_max, dim, ndim);
}
scale_v = grad / nb_max;
} else if (p < 2.0) {
self_scaled = self.sgn() * self.abs().pow(p - 1);
scale_v = grad / norm.pow(p - 1);
} else {
self_scaled = self * self.abs().pow(p - 2);
scale_v = grad / norm.pow(p - 1);
}
// handle case at 0 where we return a subgradient containing 0
scale_v.masked_fill_(norm == 0, 0);
return self_scaled * scale_v;
}
Tensor pow_backward(Tensor grad, const Tensor & self, const Scalar & exponent) {
if (exponent.equal(0.0)) {
return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
} else {
auto grad_lambda = [&](auto exp) { return grad * (exp * self.pow(exp - 1)).conj(); };
Tensor out = (exponent.isComplex()) ? grad_lambda(exponent.toComplexDouble()) : grad_lambda(exponent.toDouble());
return handle_r_to_c(self, out);
}
}
Tensor pow_backward_self(Tensor grad, const Tensor & self, const Tensor & exponent) {
auto out = at::where(exponent == 0.0, at::zeros({}, grad.options()), grad * (exponent * self.pow(exponent - 1)).conj());
return handle_r_to_c(self, out);
}
// Caveats:
// We define d(a^b)/db at a = 0 and b < 0 to be -inf. This is due to
// d(a^b)/db -> -inf for a fixed b as a -> +0
// Currently, tensorflow defines d(a^b)/db = nan for a = 0 and b < 0.
//
// We define d(a^b)/db = 0 for a = 0 and b = 0 by continuity as
// d(a^b)/db = 0 for a > 0 and b -> +0.
// Currently, tensorflow agrees with us.
Tensor pow_backward_exponent(Tensor grad, const Tensor& self, const Tensor& exponent, Tensor result) {
Tensor cond;
if (exponent.is_complex()) {
auto is_real_exp = at::logical_and(at::imag(exponent) == 0, at::real(exponent) >= 0);
cond = at::logical_and(self == 0, is_real_exp);
} else {
cond = at::logical_and(self == 0, exponent >= 0);
}
auto out = grad * at::where(cond,
at::zeros({}, grad.options()),
(result * self.log()).conj());
return handle_r_to_c(exponent, out);
}
Tensor pow_backward_exponent(Tensor grad, const Scalar & base, const Tensor& exponent, Tensor result) {
auto grad_lambda = [](Tensor a, Scalar b) { return (a * b.log()).conj(); };
if (base.equal(0.0)) {
auto cond = [](auto exp) {
if (exp.is_complex()) {
return at::logical_and(at::imag(exp) == 0, at::real(exp) >= 0);
} else {
return exp >=0;
}
};
auto out = grad * at::where(cond(exponent),
at::zeros({}, grad.options()),
grad_lambda(result, base));
return handle_r_to_c(exponent, out);
} else {
auto out = grad * grad_lambda(result, base);
return handle_r_to_c(exponent, out);
}
}
Tensor angle_backward(Tensor grad, const Tensor& self) {
if (self.is_complex()) {
return at::where(self == 0.0, at::zeros({}, self.options()),
grad * self / self.abs().pow(2) * Scalar(c10::complex<double>{0.0, 1.0}));
} else {
return at::zeros_like(self, at::MemoryFormat::Preserve);
}
}
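// A brief formula note: d/dx mvlgamma_p(x) = sum_{j=1..p} digamma(x + (1 - j)/2).
// The arange below enumerates the offsets (1 - j)/2, i.e. (1 - p)/2, ..., -1/2, 0.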
Tensor mvlgamma_backward(Tensor grad, const Tensor & self, int64_t p) {
Tensor args = at::arange(-p / 2. + 0.5, 0.5, 0.5, self.options());
args = args.add(self.unsqueeze(-1));
return grad * args.digamma_().sum(-1);
}
Tensor sgn_backward(Tensor result, Tensor grad, Tensor self) {
if (self.is_complex()) {
auto abs = at::abs(self);
// C -> C
// https://arxiv.org/pdf/1701.00392.pdf Section 4.20
return at::where(abs == 0.0, at::zeros({}, grad.options()), (grad/abs - (at::real(grad/self) * result)));
} else {
return at::zeros_like(self, at::MemoryFormat::Preserve);
}
}
Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) {
auto out = grad * other.conj();
return handle_r_to_c(self_st, out);
}
Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st) {
auto result = grad / other.conj();
return handle_r_to_c(self_st, result);
}
Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other) {
auto result = -grad * ((self / other) / other).conj();
return handle_r_to_c(other, result);
}
Tensor permute_backwards(const Tensor & grad, IntArrayRef fwd_dims) {
// invert the permutation
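// (illustration: fwd_dims = [2, 0, 1] yields dims = [1, 2, 0])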
auto ndims = fwd_dims.size();
std::vector<int64_t> dims(ndims);
for (size_t i = 0; i < ndims; i++) {
dims[at::maybe_wrap_dim(fwd_dims[i], ndims)] = i;
}
return grad.permute(dims);
}
Tensor rad2deg_backward(const Tensor& grad) {
constexpr double M_180_PI = 57.295779513082320876798154814105170332405472466564;
return at::mul(grad, wrapped_scalar_tensor(Scalar(M_180_PI)));
}
Tensor deg2rad_backward(const Tensor& grad) {
constexpr double M_PI_180 = 0.017453292519943295769236907684886127134428718885417;
return at::mul(grad, wrapped_scalar_tensor(Scalar(M_PI_180)));
}
Tensor unsqueeze_multiple(const Tensor & t, IntArrayRef dim, size_t n_dims) {
auto dims_to_unsqueeze = at::dim_list_to_bitset(dim, n_dims);
Tensor res = t;
for (size_t i = 0; i < n_dims; i++){
if (dims_to_unsqueeze[i]) {
res = res.unsqueeze(i);
}
}
return res;
}
Tensor sum_backward(const Tensor & grad, IntArrayRef sizes, IntArrayRef dims, bool keepdim) {
if (!keepdim && sizes.size() > 0) {
if (dims.size()==1) {
return grad.unsqueeze(dims[0]).expand(sizes);
} else {
Tensor res = unsqueeze_multiple(grad, dims, sizes.size());
return res.expand(sizes);
}
} else {
return grad.expand(sizes);
}
}
Tensor nansum_backward(const Tensor & grad, const Tensor & self, IntArrayRef dims, bool keepdim) {
auto sizes = self.sizes();
if (!keepdim && sizes.size() > 0) {
if (dims.size()==1) {
return grad.unsqueeze(dims[0]).expand(sizes) * self.isnan().logical_not();
} else {
Tensor res = unsqueeze_multiple(grad, dims, sizes.size());
return res.expand(sizes) * self.isnan().logical_not();
}
} else {
return grad.expand(sizes) * self.isnan().logical_not();
}
}
std::vector<int64_t> reverse_list(const IntArrayRef list) {
auto result = std::vector<int64_t>();
result.reserve(list.size());
for (auto iter = list.rbegin(); iter != list.rend(); iter++) {
result.push_back(*iter);
}
return result;
}
Tensor reverse_dim(const Tensor& t, int64_t dim) {
Tensor index = at::arange(t.size(dim) - 1, -1, -1, t.options().dtype(at::kLong));
return t.index_select(dim, index);
}
Tensor prod_safe_zeros_backward(const Tensor &grad, const Tensor& inp, int64_t dim) {
if (inp.size(dim) == 1) {
return grad;
}
auto ones_size = inp.sizes().vec();
ones_size[dim] = 1;
Tensor ones = at::ones(ones_size, grad.options());
Tensor exclusive_normal_nocp = at::cat({ones, inp.narrow(dim, 0, inp.size(dim) - 1)}, dim);
Tensor exclusive_normal = exclusive_normal_nocp.cumprod(dim);
Tensor narrow_reverse = reverse_dim(inp.narrow(dim, 1, inp.size(dim) - 1), dim);
Tensor exclusive_reverse_nocp = at::cat({ones, narrow_reverse}, dim);
Tensor exclusive_reverse = reverse_dim(exclusive_reverse_nocp.cumprod(dim), dim);
return grad * (exclusive_normal * exclusive_reverse);
}
// note that the gradient for prod is equivalent to:
// cumprod(exclusive, normal) * cumprod(exclusive, reverse), e.g.:
// input: [ a, b, c]
// cumprod(exclusive, normal): [1 , a, a * b]
// cumprod(exclusive, reverse): [b * c, c, 1]
// product: [b * c, a * c, a * b]
// and this is safe under input with 0s.
Tensor prod_backward(const Tensor& grad, const Tensor& input, const Tensor& result) {
if (input.dim() == 0) {
return grad;
}
Tensor zero_idx = (input == 0).nonzero();
if (zero_idx.numel() == 0) {
return (grad * result) / input;
} else if (zero_idx.size(0) > 1) {
return at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
} else {
return prod_safe_zeros_backward(grad, input.contiguous().view(-1), 0).view_as(input);
}
}
Tensor prod_backward(Tensor grad, const Tensor& input, Tensor result, int64_t dim, bool keepdim) {
if (input.dim() == 0) {
return grad;
}
dim = at::maybe_wrap_dim(dim, input.sizes().size());
if (!keepdim && input.dim() != 1) {
grad = grad.unsqueeze(dim);
result = result.unsqueeze(dim);
}
Tensor zero_mask = (input == 0);
Tensor slice_zero_count = zero_mask.sum(dim, true);
int64_t total_zeros = slice_zero_count.sum().item<int64_t>();
if (total_zeros == 0) {
return (grad * result) / input;
} else {
return prod_safe_zeros_backward(grad, input, dim);
}
}
Tensor solve_backward_self(const Tensor & grad, const Tensor & self, const Tensor & A) {
return at::linalg_solve(A.conj().transpose(-2, -1), grad);
}
Tensor solve_backward_A(const Tensor & grad, const Tensor & self, const Tensor & A, const Tensor & solution) {
Tensor grad_self = solve_backward_self(grad, self, A);
if (self.ndimension() == 2 && A.ndimension() == 2) {
return -at::mm(grad_self, solution.conj().transpose(-2, -1));
}
// if self was unsqueezed from (..., M) to (..., M, 1)
auto batched_rhs_shape = IntArrayRef(A.sizes().data(), A.dim()-1); // A.shape[:-1]
bool is_rhs_broadcasted = self.dim() == 1 || (A.dim()-1 == self.dim() && self.sizes().equals(batched_rhs_shape));
if (is_rhs_broadcasted) {
return -at::matmul(grad_self.unsqueeze(-1), solution.unsqueeze(-1).conj().transpose(-2, -1));
}
return -at::matmul(grad_self, solution.conj().transpose(-2, -1));
}
Tensor cumsum_backward(const Tensor & x, int64_t dim) {
// Need to check numel to see if there are no values (such as shape [0,2]), and dim to see if x is a scalar.
if (x.dim() == 0 || x.numel() == 0) {
return x;
}
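// The backward of cumsum is the reversed cumulative sum of the incoming
// gradient `x`. It is computed below without flip via the identity (a brief
// note): sum_{j >= i} x[j] = x[i] + (sum_j x[j]) - cumsum(x)[i].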
auto ret = at::cumsum(-x, dim);
auto ret_sum = ret.narrow(dim, ret.size(dim) - 1, 1).clone(at::MemoryFormat::Preserve);
ret -= ret_sum.expand(ret.sizes());
ret += x;
return ret;
}
Tensor logsumexp_backward(Tensor grad, const Tensor & self, Tensor result, IntArrayRef dim, bool keepdim) {
if (!keepdim && self.dim() != 0) {
grad = unsqueeze_multiple(grad, dim, self.sizes().size());
result = unsqueeze_multiple(result, dim, self.sizes().size());
}
return grad * (self - result).exp();
}
Tensor logcumsumexp_backward(Tensor grad, const Tensor & self, Tensor result, int64_t dim) {
if (grad.dim() == 0 || grad.numel() == 0) {
return grad;
}
// Reference: https://github.com/tensorflow/tensorflow/blob/
// 2a5910906a0e0f3dbc186ff9db6386d81a63448c/tensorflow/python/ops/math_grad.py#L1832-L1863
return AT_DISPATCH_FLOATING_TYPES(
at::typeMetaToScalarType(grad.dtype()),
"logcumsumexp_backward",
[grad, self, result, dim]() {
auto grad_min = at::empty_like(grad);
grad_min.fill_(std::numeric_limits<scalar_t>::lowest());
auto log_grad_positive = at::where(grad > 0, grad.log(), grad_min);
auto log_grad_negative = at::where(grad < 0, (-grad).log(), grad_min);
auto reverse_logcumsumexp = [dim](auto x) {
return at::flip(at::logcumsumexp(at::flip(x, {dim}), dim), {dim});
};
auto output_pos =
(reverse_logcumsumexp(log_grad_positive - result) + self).exp();
auto output_neg =
(reverse_logcumsumexp(log_grad_negative - result) + self).exp();
return output_pos - output_neg;
});
}
Tensor unbind_backward(const variable_list& grads, int64_t dim) {
IntArrayRef sizes;
at::TensorOptions o;
for (auto v : grads) {
if (v.defined()) {
sizes = v.sizes();
o = static_cast<Tensor>(v).options();
break;
}
}
auto grads_tensors = fmap(grads, [&](const Variable& v) {
return (
v.defined() ? static_cast<Tensor>(v) : at::zeros({}, o).expand(sizes));
});
return at::stack(grads_tensors, dim);
}
Tensor unsqueeze_to(const Tensor & self, IntArrayRef sizes) {
auto result = self;
int64_t nDims = sizes.size();
for (int64_t dim = 0; dim < nDims; dim++) {
if (sizes[dim] == 1) {
result = result.unsqueeze(dim);
}
}
return result;
}
Tensor unsqueeze_to(const Tensor & self, int64_t dim, IntArrayRef sizes) {
dim = at::maybe_wrap_dim(dim, sizes.size());
// in NumPy it's not an error to unsqueeze a scalar, but we still need to avoid
// unsqueezing it in the backward.
if (sizes.size() > 0 && sizes[dim] == 1) {
return self.unsqueeze(dim);
}
return self;
}
std::vector<Tensor> cat_tensors_backward(const Tensor & grad, const std::vector<std::vector<int64_t>> &sizes, int64_t dim) {
std::vector<Tensor> grad_inputs(sizes.size());
if (!grad.defined()) {
return grad_inputs;
}
dim = at::legacy_cat_wrap_dim(dim, sizes);
int64_t accumulate = 0;
for (size_t i = 0; i < sizes.size(); ++i) {
auto& shape = sizes[i];
// If input was empty tensor, gradInput should be empty tensor.
if (shape == std::vector<int64_t>({0})) {
grad_inputs[i] = at::zeros({0}, grad.options());
continue;
}
auto size = shape[dim];
accumulate += size;
grad_inputs[i] = grad.narrow(dim, accumulate - size, size);
}
return grad_inputs;
}
Tensor clamp_backward(const Tensor & grad, const Tensor &self, const optional<Scalar> & min, const optional<Scalar> & max) {
// clamp: gradients not defined on min and max, so we return the subgradient 1 for these cases.
if (max && min) {
return grad * ((self >= *min) * (self <= *max)).type_as(grad);
} else if (min) {
return grad * (self >= *min).type_as(grad);
} else if (max) {
return grad * (self <= *max).type_as(grad);
} else {
return grad;
}
}
// This function is used by load_derivatives.py to replace tensor.strides()
// calls that appear in derivative formulas. If the tensor has requires_grad
// set, this function returns its strides or throws an error if the tensor
// is sparse. If requires_grad is not set, an empty array is returned since
// there will be no backward pass.
//
// This function only supports the case where `input` is the tensor whose
// single derivative is being calculated.
//
// This function does not support `self` derivatives for inplace functions.
//
// Args:
// input Tensor to call .strides() on
// input_name Name of `input` tensor, from derivative formula
at::IntArrayRef strides_or_error(const Tensor & input, c10::string_view const & input_name) {
// TODO: Ideally, this function would never be called if requires_grad is
// not set. Once codegen is updated to avoid the call, we can remove this
// check.
if (input.requires_grad()) {
TORCH_CHECK(
!input.is_sparse(),
"The backward pass for this operation requires the '", input_name,
"' tensor to be strided, but a sparse tensor was given instead. ",
"Please either use a strided tensor or set requires_grad=False for '",
input_name, "'");
return input.strides();
} else {
return IntArrayRef({});
}
}
Tensor mm_mat1_backward(const Tensor & grad, const Tensor & mat2, at::IntArrayRef mat1_sizes, at::IntArrayRef mat1_strides, const Scalar & alpha) {
// if input was column-major, return grad as column-order for efficiency
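// Both branches compute grad @ mat2^H (scaled by alpha); the first simply
// produces the result in column-major order to match mat1's layout.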
if (mat1_strides[0] == 1 && mat1_strides[1] == mat1_sizes[0]) {
return maybe_multiply(mat2.conj().mm(grad.t()).t(), alpha);
} else {
return maybe_multiply(grad.mm(mat2.t().conj()), alpha);
}
}
Tensor mm_mat2_backward(const Tensor & grad, const Tensor & mat1, IntArrayRef sizes, IntArrayRef strides, const Scalar & alpha) {
// if input was column-major, return grad as column-order for efficiency
if (strides[0] == 1 && strides[1] == sizes[0]) {
if (mat1.is_sparse()) {
// Since mm(dense, sparse) doesn't exist,
// pass a transposed output matrix to the underlying "addmm"
// function directly.
int64_t out_rows = mat1.size(1);
int64_t out_cols = grad.size(1);
Tensor t = at::zeros({}, grad.options()).expand({out_rows, out_cols}, true);
Tensor r = at::empty({out_cols, out_rows}, grad.options()).t();
at::addmm_out(r, t, mat1.t(), grad, alpha, 1);
return r;
}
return maybe_multiply(grad.t().mm(mat1.conj()).t(), alpha);
} else {
return maybe_multiply(mat1.t().conj().mm(grad), alpha);
}
}
Tensor _sparse_addmm_sparse_backward(const Tensor& grad, const Tensor& sparse_, const Tensor& dense, const Scalar& alpha) {
AT_ASSERT(sparse_.is_sparse());
auto sparse = sparse_.coalesce();
Tensor grad_sparse = maybe_multiply(grad.mm(dense.t()), alpha);
return grad_sparse.sparse_mask(sparse);
}
// This function returns a new SparseTensor whose values are taken from Tensor `input` and filtered by the indices
// of `mask` (the values of `mask` are ignored). `input` and `mask` are sparse matrices, i.e. sparse tensors with
// sparse_dim=2 and dense_dim=2, and they must have the same shape.
// Note that the `output` must have the same `indices` as the `mask`, so we simply clone them.
// However, to get the `values` we have to use a specific helper function for CPU/CUDA that uses the `mask` data
// to filter the `values`. That's why we created the `_sparse_matrix_mask_helper` function.
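// A made-up illustration of the intended behavior (2 x 3 matrices, values shown
// as scalars for simplicity):
//   input:  indices [[0, 1], [0, 2]], values [a, b]
//   mask:   indices [[0, 1], [2, 2]], values (ignored)
//   output: indices [[0, 1], [2, 2]], values [0, b]
// i.e. the output reuses mask's indices and takes the matching values from
// input (0 where input has no stored element at that index).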
Tensor _sparse_matrix_mask(const Tensor& input, const Tensor& mask){
Tensor output = at::empty_like(mask);
Tensor mask_indices = mask._indices().clone();
Tensor r_values;
if (mask._nnz() == 0) {
r_values = at::zeros_like(mask._values());
} else {
r_values = _sparse_matrix_mask_helper(input, mask_indices.contiguous());
}
at::sparse::get_sparse_impl(output)->set_indices_and_values_unsafe(mask_indices, r_values);
return output;
}
Tensor sparse_sparse_matmul_backward(
const Tensor& grad,
const Tensor& a,
const Tensor& b,
int64_t grad_order) {
/*
To implement the backward algorithm for sparse matrix-matrix matmul (SPMM) we can start from the following definition
for dense tensors:
c = a @ b
then
a_grad = c_grad @ b^T
b_grad = a^T @ c_grad
So for sparse matrices we can use the following definition:
if grad_order == 0:
a_grad = sparse_matrix_mask(c_grad @ b^T, mask=a)
else:
b_grad = sparse_matrix_mask(a^T @ c_grad, mask=b)
*/
TORCH_CHECK(
grad_order == 0 || grad_order == 1,
": grad_order not in [0, 1] at sparse_sparse_matmul_backward function");
if (grad_order == 0) {
auto a_grad = _sparse_sparse_matmul(grad, b.t());
return _sparse_matrix_mask(a_grad.coalesce(), a.coalesce());
}
auto b_grad = _sparse_sparse_matmul(a.t(), grad);
return _sparse_matrix_mask(b_grad.coalesce(), b.coalesce());
}
Tensor renorm_backward(const Tensor & grad, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) {
auto transposed_sizes = self.transpose(dim, 0).sizes().vec();
auto flatten = [&](const Tensor & t) {
return t.transpose(dim, 0).contiguous().view({t.size(dim), -1});
};
auto unflatten = [&](const Tensor & t) {
return t.contiguous().view(transposed_sizes).transpose(dim, 0);
};
// renorm computes the norm over all dimensions except `dim`, which is why
// we need the flatten and unflatten business. TODO: simplify this when we
// add support for norm over multiple dimensions.
auto self_flat = flatten(self);
auto grad_flat = flatten(grad);
auto norm_flat = self_flat.norm(p, 1, true);
auto grad_output = (self_flat * grad_flat).sum(1, true);
auto nb = norm_backward(grad_output, self_flat, p, norm_flat, 1, true);
auto invnorm = (norm_flat + 1e-7).reciprocal();
auto grad_norm = unflatten(maxnorm * invnorm * (grad_flat - invnorm * nb));
auto norm = unflatten(norm_flat.expand_as(self_flat));
// TODO: remove the detach once comparison ops no longer require grad
auto mask = Variable(norm < maxnorm).detach();
return at::where(mask, grad, grad_norm);
}
Tensor repeat_backward(Tensor grad, IntArrayRef repeats, IntArrayRef input_shape) {
auto find_iter = std::find(repeats.cbegin(), repeats.cend(), 0);
if (find_iter != repeats.cend()) {
return at::zeros(input_shape, grad.options());
}
const auto input_dims = input_shape.size();
int64_t num_unsqueezed = grad.dim() - input_dims;
for (int64_t i = 0; i < num_unsqueezed; ++i) {
grad = grad.sum(0, false);
}
at::DimVector grad_size, sum_dims;
for (size_t dim = 0; dim < input_dims; ++dim) {
int64_t repeat = repeats[dim + num_unsqueezed];
// Reshape gradient (repeat > 1)
// Index: [..., dim , ...] [..., dim , dim+1 , ...]
// Shape: From [..., dimsize, ...] to [..., repeat, dimsize/repeat, ...]
// The gradient tensor at 'dim' is reshaped to 'repeat' times of input tensor.
// Then, sum up gradients over repeated tensors along 'dim', and reduce shape
// from 'repeat * dimsize/repeat' to 'dimsize/repeat' ('input_dimsize').
// Example:
// Size(3, 2) Size(6, 2)
// [[v1_0, v1_1],
// [v1_2, v1_3],
// [[v0, v1], repeat(2, 1) [v1_4, v1_5],
// [v2, v3], -------------> [v2_0, v2_1],
// [v4, v5]] [v2_2, v2_3],
// [v2_4, v2_5]]
//
// input grad (3, 2) reshape (2, 3, 2) output grad (6, 2)
// [[[g1_0, g1_1], [[g1_0, g1_1],
// [g1_2, g1_3], [g1_2, g1_3],
// [[g1_0+g2_0, g1_1+g2_1], [g1_4, g1_5]], [g1_4, g1_5],
// [g1_0+g2_0, g1_1+g2_1], [g2_0, g2_1],
// [g1_0+g2_0, g1_1+g2_1]] [[g2_0, g2_1], [g2_2, g2_3],
// [g2_2, g2_3], [g2_4, g2_5]]
// [g2_4, g2_5]]]
// If the gradient tensor were instead reshaped to [..., dimsize/repeat, repeat, ...] and then
// summed over 'dim+1', the gradient for the input would not be correctly aligned with the input.
// Example:
// input grad (3, 2) reshape (3, 2, 2) output grad (6, 2)
// [[[g1_0, g1_1],
// [g1_2, g1_3]], [[g1_0, g1_1],
// [g1_2, g1_3],
// [[g1_0+g1_2, g1_1+g1_3], [[g1_4, g1_5], [g1_4, g1_5],
// [g1_4+g2_0, g1_5+g2_1], [g2_0, g2_1]], [g2_0, g2_1],
// [g2_2+g2_4, g2_3+g2_5]] [g2_2, g2_3],
// [[g2_2, g2_3], [g2_4, g2_5]]
// [g2_4, g2_5]]]
if (repeat != 1) {
grad_size.push_back(repeat);
sum_dims.push_back(grad_size.size() - 1);
}
// Don't need to reshape gradient into (repeat, input_shape[dim]) (repeat == 1)
grad_size.push_back(input_shape[dim]);
}
// One-time Reshape & Sum
// Reshape gradient to grad_size:
// 1. If repeat equals to 1, append input size at that dimension,
// 2. If repeat is larger than 1, append both repeat and input size at that dimension.
// Sum over all "repeat" dimensions from sum_dims:
// Example:
// Input Size (2, 3, 4, 5)
// repeat [4, 1, 9, 3]
// output/grad Size (8, 3, 36, 15)
// grad_size [4, 2, 3, 9, 4, 3, 5]
// sum_dims [0, 3, 5]
// When every dimension is repeated only once, sum_dims is empty; summing over it would reduce
// the whole grad tensor to a scalar rather than keep the original dimensions, hence the check below.
if (!sum_dims.empty()) {
grad = grad.reshape(grad_size);
grad = grad.sum(sum_dims);
}
return grad;
}
// p1m == 1 - p
Tensor _fused_dropout_backward(Tensor grad, Tensor mask, double p1m) {
if (grad.requires_grad()) {
// Use autograd-friendly backward if double backward is required
return grad * (mask.type_as(grad) * (1. / p1m));
} else {
return at::_masked_scale(grad, mask, 1. / p1m);
}
}
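// Distributes `grad` evenly among all entries of `input` that are equal to
// `value` (e.g. the result of max/min). A made-up illustration:
//   input = [1., 3., 3.], value = 3., grad = g  ->  grad_input = [0., g/2, g/2]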
Tensor evenly_distribute_backward(Tensor grad, const Tensor & input, const Tensor & value) {
if (input.is_cuda()) {
auto mask = (input == value).logical_or_(input.isnan().logical_and_(value.isnan()));
return mask * (grad / mask.sum());
} else {
auto mask = value.isnan().item<bool>() ? input.isnan() : input == value;
return grad.new_zeros(input.sizes(), input.options()).masked_fill_(mask, grad / mask.sum());
}
}
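// A brief formula note: with correction = 1 if `unbiased` else 0,
//   d Var(x) / dx_i = 2 * (x_i - mean(x)) / (N - correction),
// since the contributions through the mean cancel out.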
Tensor var_backward(const Tensor & grad, const Tensor & self, bool unbiased) {
return (2.0 / (self.numel() - unbiased)) * grad * (self - self.mean());
}
Tensor var_backward(Tensor grad, const Tensor & self, IntArrayRef dim, bool unbiased, bool keepdim) {
if (self.dim() == 0) {
return var_backward(grad, self, unbiased);
}
if (!keepdim && self.dim() > 1) {
grad = unsqueeze_multiple(grad, dim, self.sizes().size());
}
return (2.0 / (_safe_size(self.sizes(), dim) - unbiased)) * grad * (self - self.mean(dim, true));
}
Tensor std_backward(const Tensor & result, const Tensor & grad, const Tensor & self, bool unbiased) {
return var_backward((grad / (result * 2)).masked_fill_(result == 0, 0), self, unbiased);
}
Tensor std_backward(const Tensor & result, Tensor grad, const Tensor & self, IntArrayRef dim, bool unbiased, bool keepdim) {
return var_backward((grad / (result * 2)).masked_fill_(result == 0, 0), self, dim, unbiased, keepdim);
}
Tensor mean_backward(Tensor grad, const IntArrayRef sizes, IntArrayRef dim, bool keepdim) {
return sum_backward(grad, sizes, dim, keepdim) / _safe_size(sizes, dim);
}
Tensor mean_backward(Tensor grad, const IntArrayRef sizes, int numel) {
return grad.expand(sizes) / numel;
}
Tensor var_std_mean_backward(const variable_list& grads, const Tensor & self, const Tensor & r1, const Tensor & r2, IntArrayRef dim, bool unbiased, bool keepdim, bool is_std) {
Tensor grad;
if (grads[0].defined()) {
grad = is_std ? std_backward(r1, grads[0], self, dim, unbiased, keepdim) : var_backward(grads[0], self, dim, unbiased, keepdim);
}
if (grads[1].defined()) {
Tensor mean_grad = mean_backward(grads[1], self.sizes(), dim, keepdim);
grad = grads[0].defined() ? grad + mean_grad : mean_grad;
}
return grad;
}
Tensor var_std_mean_backward(const variable_list& grads, const Tensor & self, const Tensor & r1, const Tensor & r2, bool unbiased, bool is_std) {
Tensor grad;
if (grads[0].defined()) {
grad = is_std ? std_backward(r1, grads[0], self, unbiased) : var_backward(grads[0], self, unbiased);
}
if (grads[1].defined()) {
Tensor mean_grad = mean_backward(grads[1], self.sizes(), self.numel());
grad = grads[0].defined() ? grad + mean_grad : mean_grad;
}
return grad;
}
Tensor masked_scatter_backward(const Tensor & grad, const Tensor & mask, IntArrayRef sizes) {
int64_t numel = 1;
for (auto size : sizes) {
numel *= size;
}
auto mask_selected = grad.masked_select(mask);
auto diff_nelem = numel - mask_selected.numel();
if (diff_nelem > 0) {
// because masked_select returns a 1-d tensor with one element per true entry of the mask,
// we need to fill out the rest with zeros and then reshape back to `sizes`.
auto zeros_fillin = at::zeros({diff_nelem}, grad.options());
mask_selected = at::cat({mask_selected, zeros_fillin}, 0);
}
return mask_selected.view(sizes);
}
Tensor cholesky_backward(Tensor grad, bool upper, Tensor L) {
// cf. Iain Murray (2016); arXiv 1602.07527
// This gradient is symmetric, and not triangular.
// Cholesky additionally assumes that the input is symmetric, which is a subspace of
// R^{n x n}, and hence the derivative is not well-defined for off-diagonal
// elements. We resolve this by taking the gradient of the functionally independent
// elements of the matrix (i.e., the lower triangular portion of the input) and then
// reflect it on the upper triangular portion, thereby symmetrizing the gradient of
// the cholesky operation. The motivation behind this choice is that symmetric gradient
// leads to stable gradient updates, and retains symmetry of the updated matrix if it
// were updated by a gradient based algorithm.
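// In formulas: with Phi(A) = tril(A) with its diagonal halved, the code below
// effectively computes
//   grad_input = 0.5 * (C + C^H),  where C = L^{-H} Phi(L^H grad) L^{-1}.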
if (upper) {
L = L.transpose(-1, -2).conj();
grad = grad.transpose(-1, -2).conj();
}
auto L_inverse = std::get<0>(at::triangular_solve(at::eye(L.size(-1), L.options()), L, /*upper=*/false));
auto phi = at::matmul(L.transpose(-1, -2).conj(), grad);
phi.tril_().diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).mul_(0.5);
auto grad_input = at::matmul(at::matmul(L_inverse.transpose(-1, -2).conj(), phi), L_inverse);
return grad_input.add(grad_input.transpose(-1, -2).conj()).mul_(0.5); // Symmetrizing the gradient
}
Tensor cholesky_inverse_backward(Tensor grad, Tensor L, bool upper, Tensor inverse) {
Tensor grad_L;
if (grad.defined()) {
Tensor common_term = grad + grad.transpose(-2, -1);
common_term = at::matmul(inverse, at::matmul(common_term, inverse));
if (upper) {
grad_L = -at::matmul(L, common_term);
} else {
grad_L = -at::matmul(common_term, L);
}
} else {
grad_L = at::zeros({1}, L.options()).expand_as(L);
}
return grad_L;
}
Tensor split_with_sizes_backward(const std::vector<torch::autograd::Variable> &grads,
IntArrayRef split_sizes, int64_t dim, IntArrayRef sizes, const at::TensorOptions &options) {
dim = at::maybe_wrap_dim(dim, sizes.size());
// It's possible that some of the grads are not defined (they represent tensors of all 0s).
// Since at::cat can't handle those, materialize them as zeros.
std::vector<Tensor> grads_all_defined(grads.size());
for (size_t j = 0; j < grads.size(); ++j) {
if (grads[j].defined()) {
grads_all_defined[j] = grads[j];
} else {
auto length = split_sizes[j];
auto grad_size = sizes.vec();
grad_size[dim] = length;
grads_all_defined[j] = at::zeros(grad_size, options);
}
}
auto ret = at::cat(grads_all_defined, dim);
return ret;
}
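// The last chunk of a split may be smaller than split_size. A made-up
// illustration: dim_size = 10, split_size = 3 -> 4 chunks of sizes [3, 3, 3, 1],
// where 1 = 3 - (3 * 4 - 10) as computed below.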
Tensor split_backward(const std::vector<torch::autograd::Variable> &grads,
int64_t split_size, int64_t dim, IntArrayRef sizes, const at::TensorOptions &options) {
dim = at::maybe_wrap_dim(dim, sizes.size());
int64_t dim_size = sizes[dim];
int64_t num_splits = grads.size();
std::vector<int64_t> split_sizes(num_splits, split_size);
split_sizes[num_splits - 1] = split_size - (split_size * num_splits - dim_size);
return split_with_sizes_backward(grads, split_sizes, dim, sizes, options);
}
Tensor max_pool_double_backward(const Tensor & grad, const Tensor & indices, int dim) {
AT_ASSERT(indices.dim() >= dim);
auto size = indices.sizes().slice(0, indices.dim() - dim).vec();
size.push_back(-1);
auto indices_view = indices.view(size);
const auto memory_format = indices.suggest_memory_format();
return grad.contiguous(memory_format).view(size).gather(-1, indices_view).view(indices.sizes());
}
Tensor glu_double_backward(const Tensor & grad, const Tensor & grad_output, const Tensor & input, int64_t dim) {
auto& gO = grad_output;
auto input_size = input.size(dim) / 2;
auto first_half = input.narrow(dim, 0, input_size);
auto second_half = input.narrow(dim, input_size, input_size);
auto sig_second_half = second_half.sigmoid();
auto one_sub_sig_second_half = 1 - sig_second_half;
auto sig_one_sub_sig = sig_second_half * one_sub_sig_second_half;
auto ggI_first_half = grad.narrow(dim, 0, input_size);
auto ggI_second_half = grad.narrow(dim, input_size, input_size);
auto ggI_second_half_times_first_half = ggI_second_half * first_half;
auto gI_first_half = ggI_second_half * gO * sig_one_sub_sig;
auto second_order_sh = sig_one_sub_sig * one_sub_sig_second_half - sig_second_half * sig_one_sub_sig;
auto gI_second_half = ggI_second_half_times_first_half * gO * second_order_sh + ggI_first_half * gO * sig_one_sub_sig;
return at::cat({gI_first_half, gI_second_half}, dim);
}
Tensor glu_double_backward_grad_output(const Tensor & grad, const Tensor & input, int64_t dim) {
if (dim < 0) dim += input.dim();
auto sizes = input.sizes().vec();
sizes[dim] /= 2;
auto tmp = grad * glu_backward(at::ones(sizes, input.options()), input, dim);
return tmp.narrow(dim, 0, sizes[dim]) + tmp.narrow(dim, sizes[dim], sizes[dim]);
}
Tensor infinitely_differentiable_silu_backward(
const Tensor& grad_output,
const Tensor& input) {
const Tensor sigmoid = input.sigmoid();
return grad_output * sigmoid * (1.0 + input * (1.0 - sigmoid));
}
Tensor infinitely_differentiable_logit_backward(
const Tensor& grad,
const Tensor& self,
c10::optional<double> eps) {
if (eps) {
const double lo = eps.value();
const double hi = 1.0 - lo;
return at::where(
at::logical_and(self >= lo, self <= hi),
grad / (self * (1.0 - self)),
at::zeros({}, self.options()));
} else {
return at::where(
at::logical_and(self >= 0.0, self <= 1.0),
grad / (self * (1.0 - self)),
at::empty({}, self.options())
.fill_(std::numeric_limits<double>::quiet_NaN()));
}
}
Tensor kl_div_double_backward_grad_output(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction, bool log_target) {
auto result = kl_div_backward(grad, input, target, at::Reduction::None, log_target);
if (reduction == at::Reduction::Mean) {
return result.mean();
} else if (reduction == at::Reduction::Sum) {
return result.sum();
}
return result;
}
// Compute derivatives for targets.
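// A brief derivation note: with log_target == false the elementwise loss is
// target * (log(target) - self), so d/dtarget = log(target) + 1 - self; with
// log_target == true it is exp(target) * (target - self), so
// d/dtarget = exp(target) * (target - self + 1).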
Tensor kl_div_target_backward(Tensor grad_output, Tensor self, Tensor target, int64_t reduction, bool log_target) {
Tensor grad_target;
if (!log_target) {
grad_target = grad_output.mul(target.log().add_(1).sub_(self)).masked_fill_(target == 0, 0.);
}
else {
grad_target = grad_output.mul(target.add(1).sub_(self).mul_(target.exp()));
}
if (reduction == at::Reduction::Mean) {
grad_target.div_(target.numel());
}
return grad_target;
}
Tensor binary_cross_entropy_with_logits_target_backward(const Tensor& grad_output, const Tensor& self, const Tensor& target, const c10::optional<Tensor>& weight, const c10::optional<Tensor>& pos_weight, int64_t reduction) {
Tensor grad_target;
if (isDefined(pos_weight)) {
grad_target = (1. - self.sigmoid()).log_().sub_(pos_weight->mul(self.sigmoid().log_())).mul_(grad_output);
} else {
grad_target = self.mul(-grad_output);
}
if (isDefined(weight)) {
grad_target.mul_(*weight);
}
if (reduction == at::Reduction::Mean) {
grad_target.div_(target.numel());
}
return grad_target;
}
Tensor log_sigmoid_double_backward(const Tensor & grad, const Tensor & input) {
auto z = input.sigmoid();
return grad * (z - 1) * z;
}
Tensor softmax_double_backward(const Tensor & grad, const Tensor & grad_output, int dim, const Tensor & output) {
auto gO = grad_output;
auto ggI = grad;
auto ggI_output = ggI * output;
auto ggI_out_sum = ggI_output.sum(dim, true);
auto ggI_out_sum_output = ggI_out_sum * output;
auto gO_out_sum = (gO * output).sum(dim, true);
// gI calculation
auto gI_t0 = ggI_output * (gO - gO_out_sum);
auto gI_t1 = output * ((ggI_output * gO).sum(dim, true).sub_(gO_out_sum * ggI_out_sum));
auto gI_t2 = ggI_out_sum_output * gO;
auto gI_t3 = ggI_out_sum_output * gO_out_sum;
return gI_t0 - gI_t1 - gI_t2 + gI_t3;
}
Tensor log_softmax_double_backward(const Tensor & grad, const Tensor & grad_output, int dim, const Tensor & output) {
auto z = output.exp();
return z * grad_output.sum(dim, true) * ((grad * z).sum(dim, true) - grad);
}
// NOTE: [How to write vmap-compatible backward formulas]
//
// See NOTE: [vmap-incompatible in-place operations] for what it means for an
// in-place operation to be incompatible with vmap.
//
// If an in-place operation used in a backward formula is vmap-incompatible,
// then as developers we have the following options:
//
// - If the in-place operation directly followed the creation of a tensor with
// a factory function like at::zeros(...), we should replace the factory with a
// corresponding grad.new_zeros(...) call. The grad.new_zeros(...) call
// propagates the batch dims to the resulting tensor.
// For example:
// Before: at::zeros(input.sizes(), grad.options()).copy_(grad)
// After: grad.new_zeros(input.sizes()).copy_(grad)
//
// - If the in-place operation followed some sequence of operations, and
// we want to be able to vmap over the backward formula as-is (this is
// usually the case for simple (<15loc) backward formulas), then use
// inplaceIsVmapCompatible to guard the operation. For example:
// c = a * b
// Before: c.mul_(grad)
// After: c = at::inplaceIsVmapCompatible(c, grad) ? c.mul_(grad) : c * grad
//
// - If we don't want to vmap directly over the backward formula (e.g., if the
// backward formula is too complicated or has a lot of vmap-incompatible
// operations), then register the backward formula as an operator and eventually
// write a batching rule for it.
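// A brief derivation note for the function below: with
//   L = -(t * log(x) + (1 - t) * log(1 - x))   (eps omitted),
// dL/dx = (x - t) / (x * (1 - x)) and
// d2L/dx2 = (x^2 - 2*x*t + t) / (x^2 * (1 - x)^2), which is the `gI` term.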
Tensor binary_cross_entropy_double_backward(const Tensor & grad_output, const Tensor & grad, const Tensor & input, const Tensor & target, const c10::optional<Tensor>& weight, int64_t reduction) {
auto eps = 1e-12;
auto inp_pl_eps = input + eps;
auto one_m_inp_pl_eps = 1 - input + eps;
// gradient wrt input
auto gI = (input * input - 2 * input * target + target) / (inp_pl_eps.pow(2) * one_m_inp_pl_eps.pow(2));
if (at::inplaceIsVmapCompatible(gI, grad)) {
gI *= (grad * grad_output);
} else {
gI = gI * (grad * grad_output);
}
if (isDefined(weight)) {
gI *= *weight;
}
if (reduction == at::Reduction::Mean) {
return gI / input.numel();
} else if (reduction == at::Reduction::Sum) {
return gI.sum();
}
return gI;
}
Tensor binary_cross_entropy_double_backward_grad_output(const Tensor & grad, const Tensor & input, const Tensor & target, const c10::optional<Tensor>& weight, int64_t reduction) {
auto eps = 1e-12;
// gradient wrt grad_output
auto ggO = (input - target) / ((input + eps) * (1 - input + eps));
if (at::inplaceIsVmapCompatible(ggO, grad)) {
ggO *= grad;
} else {
ggO = ggO * grad;
}
if (isDefined(weight)) {
ggO *= *weight;
}
if (reduction == at::Reduction::Mean) {
return ggO / input.numel();
} else if (reduction == at::Reduction::Sum) {
return ggO.sum();
}
return ggO;
}
Tensor l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction) {
auto output = l1_loss_backward(grad, input, target, at::Reduction::None);
if (reduction == at::Reduction::Mean) {
return output.mean();
} else if (reduction == at::Reduction::Sum) {
return output.sum();
}
return output;
}
Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction, double beta) {
// special case to protect against a divide-by-zero.
if (beta == 0) {
return at::zeros(grad.sizes(), grad.options());
}
auto d = (input - target).abs();
auto grad_input = grad * (d < beta).type_as(grad) / beta;
if (reduction == at::Reduction::Mean) {
grad_input /= input.numel();
}
return grad_input;
}
Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction, double beta) {
if (reduction == at::Reduction::None) {
return smooth_l1_loss_backward(grad, input, target, reduction, beta);
}
auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction, beta);
return (r * grad).sum();
}
Tensor mse_loss_double_backward(const Tensor & grad, const Tensor & input, int64_t reduction) {
auto grad_input = 2 * grad;
if (reduction == at::Reduction::Mean) {
grad_input /= input.numel();
}
return grad_input;
}
Tensor mse_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction) {
if (reduction == at::Reduction::None) {
return mse_loss_backward(grad, input, target, reduction);
}
auto r = mse_loss_backward(ones_like(grad_output), input, target, reduction);
return (r * grad).sum();
}
Tensor soft_margin_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction) {
auto z = (input * -target).exp();
auto zplus1 = z + 1;
auto grad_input = grad * (target * target) * z / (zplus1 * zplus1);
if (reduction == at::Reduction::Mean) {
grad_input /= input.numel();
}
return grad_input;
}
Tensor soft_margin_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction) {
if (reduction == at::Reduction::None) {
return soft_margin_loss_backward(grad, input, target, reduction);
}
auto r = soft_margin_loss_backward(ones_like(grad_output), input, target, reduction);
return (r * grad).sum();
}
Tensor softplus_double_backward(const Tensor & grad, const Tensor & input, Scalar beta, Scalar threshold) {
auto x = (input * beta);
return sigmoid_backward(grad, x.sigmoid()) * (x < threshold).type_as(grad) * beta;
}
// NOTE [ as_strided Backward and layout-aware/agnostic autograd ]
//
// `storage_offset` is ignored for simplicity in this note. If you just want the
// full algorithm without explanation, scroll down to bottom of this note.
//
// Implementing the backward of as_strided is tricky because you have to deal
// with mappings that map one memory location to multiple indices, i.e., the
// output tensor has multiple indices pointing to **overlapping** memory
// addresses. This can happen in all sorts of weird cases. For example,
//
// x = torch.randn(15)
// x.as_strided([3, 3], [1, 0]) # "expand" case
// x.as_strided([3, 3], [2, 1]) # "size too large" case
// x.as_strided([3, 2], [3, 6]) # res[2, 0] points to 2*3 + 0*6 = 6
// # res[0, 1] points to 0*3 + 1*6 = 6
//
// Here is the general strategy we apply in implementing as_strided backward:
// 0. ??? (optimization step. we will talk about this later)
// 1. Create some underlying flattened tensor as if it is the base tensor
// representing the contiguous memory storage for both input and output.
// 2. Use the output geometry to scatter (or index_add) the gradients into
// this storage tensor.
// 3. ??? (fix for input tensor with overlapping memory. we will talk about
// this later)
// 4. Return the as_strided view of the storage tensor using input geometry.
//
// In step (2), if the output tensor doesn't have overlapping memory, we can
// safely scatter (`storage.as_strided(output_geometry).copy_(grad)`);
// otherwise, we must use `index_add` as gradients at different indices may need
// to be summed to a single location.
//
// For example, in this case:
//
// x = torch.randn(3)
// y = x.as_strided([3, 3], [1, 0]) # "expand" case
// # size [ 3, 3]
// # stride [ 1, 0]
// y.backward() # step (1): contiguous storage tensor `s` of size 3, which
// is large enough to be used as underlying storage
// for `x` and `y`.
// s = [ 0, 0, 0]
// # step (2): since `y` has overlapping memory, index_add grad
// into `s` based on `y`'s geometry, i.e.,
// s[i * y.stride(0) + j * y.stride(1)] += gy[i, j].
// s = [ 3, 3, 3]
// # step (4): as_strided view `s` using `x`'s geometry
// s = [ 3, 3, 3]
// grad_input = s.as_strided(x.size(), x.stride())
// = s.as_strided([3], [1])
// = [ 3, 3, 3]
//
// This is exactly what we would get if using `expand`. However, here the input
// tensor doesn't have overlapping memory. If it does, we must add an extra step
// before (4). Considering this case:
//
// t = torch.randn(3)
// x = t.expand(3, 3) # input with overlapping memory
// # size [3, 3]
// # stride [0, 1]
// y = x.as_strided([1], [1]) # contiguous output
// # size [1]
// # stride [1]
// y.backward() # step (1): contiguous storage tensor `s` of size 3, which
// is large enough to be used as underlying storage
// for `x` and `y`.
// s = [ 0, 0, 0]
// # step (2): scatter grad into `s` based on `y`'s geometry
// s = [ 1, 0, 0]
// # step (4): as_strided view `s` using `x`'s geometry
// s = [ 1, 0, 0]
// grad_input = s.as_strided([3, 3], [0, 1])
// = s.as_strided([3, 3], [0, 1])
// = [[ 1, 0, 0],
// [ 1, 0, 0],
// [ 1, 0, 0]]
// Is this result correct?
//
// `x.as_strided([1], [1])` call is obviously equivalent with
// `x[(0,) * x.dim()].view(1)` for any `x`. But autograd through the second
// gives gradient `[ [ 1, 0, 0], [ 0, 0, 0], [ 0, 0, 0]]`. For this specific
// case, indexing `x` at any index in first column is also equivalent, and
// yields a gradient of shape `[3 x 3]` containing eight 0's and one 1. There is
// an `x.size(1)`-times difference between these gradients computed from other
// PyTorch ops and the gradient we got from as_strided.
//
// You might conclude that the gradients from as_strided are wrong. However,
// let's first see why they are actually reasonable. Consider the pointwise
// perturbations by `delta` anywhere in the first column of `x`. It will lead to
// a `delta` change in the same memory location, and then `y` will change by
// `delta`. So one can say the gradient should be exactly 1 at the first column,
// as given by our above procedure.
//
// In the above computation of numerical gradients, they only match the
// analytical results because strides and memory locations are considered in the
// forward pass, i.e., this op (including both forward and backward) is
// layout-aware.
//
// However, in PyTorch, most (probably all) other ops (forward and backward) are
// layout-agnostic. E.g.,
//
// t = torch.randn(1)
// x = t.expand(2)
// y = x.sum()
// y.backward()
//
// Layout-agnostic autograd (as it is currently in PyTorch) will give you
//
// gy = 1
// gx = [ 1, 1] # SumBackward: torch.ones_like(x)
// gt = [ 2] # ExpandBackward: gx.sum()
//
// Note that `gx = [ 1, 1]`. However, if you perturb any value in `x` by `delta`
// (the other will also change by `delta`), `y` will change by `2 * delta`. So
// the gradients, if strides are taken into consideration, should be 2.
//
// Layout-aware autograd should give you
//
// gy = 1
// gx = [ 2, 2] # Because the backward considers the fact that the input `x`
// # is already expanded.
// gt = [ 2] # Layout-aware backward of expand is just a slicing because
// # the previous backward should have already taken care of
// # strides and made sure that gradients are the same along the
// # expanded dimension.
//
// As shown above, these two types are not compatible. Therefore, we must either
// make as_strided layout-agnostic, or make all other ops layout-aware.
//
// It is difficult to support layout-aware autograd (at least in the current
// codebase structure), because it would mean
// 1. storing tensor geometries of every input tensor for backward
// 2. depending on input geometry, the gradient computed from backward changes
// 3. ideally enforcing gradient of T to always have same strides as T
// (although these two methods only differ when it comes to overlapping memory)
//
// Therefore, we must formulate `as_strided` in a layout-agnostic way, i.e.,
// giving the same output regardless of the input layout. We consider
// `input.stride()` as a separate independent fixed argument `input_stride`.
// Then, `as_strided(input, size, stride)` can be thought of as:
// 1. "Scatter" each value of `input` into a "storage" using storage location
// computed from the value's index in `input`, `input.size()` and
// `input_stride`, but if N values end up in the same location, the value
// is average of those N values (they will be the same value anyways).
//
// Formal description:
// Denote the set of all input indices that pointing to the same storage
// location `storage[n]` as `S(n)`, i.e.,
//
// S(n) = { index : <index, input_stride> == n, index is valid given input.size() },
//
// where `<x, y>` is the dot product between `x` and `y`.
//
// Then, the process is:
//
// storage[n] = Avg { S(n) }
//
// Note that all values in `S(n)` are the same (they point to the same
// memory location anyways), so this step doesn't change anything, but
// effectively avoids having a dependency on the layout of `input`.
// I.e., the result holds fixed regardless of the layout of `input`, as
// long as `input_stride` is fixed.
//
// NOTE: for the forward pass, we can equivalently simply select any one of
// `S(n)` as `storage[n]`. However, considering this as an average
// operation makes backward easier (so all values in set
// `{ grad_input[i] : i in S(n) }` are the same, and it can use the
// same geometry as input).
// 2. As usual, return the as_strided view of `storage` using required output
// `size` and `stride`.
//
// To backward through this layout-agnostic version, we simply add the following
// step:
// .... (scatter gradients into the storage tensor using output geometry)
// 3. For all storage location n, `storage[n] /= |S(n)|`.
// .... (return as_strided view of the storage tensor using input geometry)
//
// Finally, we note that these general operations are expensive, so we apply the
// following optimizations:
// Add step (0): For all output dimension `d` with output stride 0, sum the
// gradients along dimension `d` (don't keepdim), and remove
// dimension `d` from output size and stride.
// (An optimization for "expand" cases so we may avoid step (3))
// Only apply step (3) when input tensor has overlapping memory.
//
// FULL ALGORITHM:
// 0. For all output dimension `d` with output stride 0, sum the gradients
// along dimension `d` (don't keepdim), and remove dimension `d` from
// output size and stride.
// 1. Create some underlying flattened tensor as if it is the base tensor
// representing the contiguous memory storage for both input and output.
// 2. Use the output geometry to scatter (or index_add) the gradients into
// this storage tensor `storage`.
// 3. If input tensor has overlapping memory,
// For all storage location `i`, `storage[i] /= N(i)`, where `N(i)` is the
// number of indices in input geometry pointing to the same storage
// location `i` (i.e., `|S(i)|` in equations above).
// 4. Return the as_strided view of the storage tensor using input geometry.
//
// See NOTE [ Detecting Memory Overlap Within A Strided Tensor ] on how to
// roughly detect overlapping memory.
// NOTE [ Detecting Memory Overlap Within A Strided Tensor ]
//
// Checking memory overlap within a strided tensor is the special case of
// detecting memory overlap of two strided tensors, where the two tensors start
// at the same memory address. The latter is HARD (see #8212).
//
// But even this special case isn't simple. This note describes a check for an
// even more constrained simple case where we can be certain that there is no
// overlap.
//
// The checking algorithm can be described as:
// 0. Return [ pass check ] if any dimension has size 0
// 1. Ignore all dimensions that have size 1
// 2. If no remaining dimensions, return [ pass check ]
// 3. Sort the remaining dimensions according to the strides decreasingly
// 4. Check that for each dimension k,
//
// stride[k] > \sum_{ i > k } (size[i] - 1) * stride[i]
//
// That is equivalent to, after reordering the dimensions so strides are
// in decreasing order, checking that stride of each dimension is larger
// than the maximum memory offset in a slice at that dimension.
//
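// A made-up illustration of the check:
//   sizes [2, 3], strides [3, 1] (contiguous): sorted by stride the dims are
//     (stride 3, size 2), (stride 1, size 3); 3 > (3 - 1) * 1 = 2  -> pass.
//   sizes [3, 3], strides [1, 1]: 1 > (3 - 1) * 1 = 2 fails  -> overlap cannot
//     be ruled out (and indeed different indices alias the same memory).
//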
// Obviously this check passes for contiguous tensors ( the dimensions will be
// already sorted with LHS = stride[0] = \prod size[i] being exactly 1 larger
// than RHS ). Similarly, the check passes for tensors contiguous in all but
// the last dimension, and LHS = stride[0] = stride[-1] * \prod size[i] being
// exactly stride[-1] larger than RHS. (*)
//
// We will show that these view operations, including all our view operations
// *except for* general as_strided and unfold, also preserve this invariant:
//
// alias: Obviously preserves
//
// expand: All changed dimensions are removed in step (1)
//
// view: Consider the input dimensions as grouped into consecutive
// dimension "blocks", where dimensions are contiguous in each one.
// view only works when the output dimensions can also be
// grouped into the same consecutive blocks of same ordering.
//
// NB: this means that the number of elements and stride of the
// last dimension in each block is the same in input and
// output. (**)
//
// Notation:
// Consider a single such block B,
// ... B_prev[-1]], [ B[0], ..., B[i], ..., B[k] = B[-1] ], [ B_next[0], ...
// start--^^^^ ^^^^^^^^^^^^--end
// Each B[i] denotes a dimension index such that B[i] = B[0] + i.
//
// We first show that if a tensor (i.e., the input) satisfies the
// invariant, then after sorting, the dimensions within each block
// still remain consecutive. (***)
//
// After removing dimensions of size 1, the dimensions within a
// block are already sorted by strides in descending order. So
// sorting all dimensions will not change the relative ordering
// among them.
//
// Assume that some block B is not consecutive after sorting,
// i.e., there exists a dimension d between B[0] and B[-1] in
// sorted order.
//
// By (*), we know that
// stride[B[0]]
// = \sum_{i > 0} (size[B[i]] - 1) * stride[B[i]] + stride[B[-1]]
// < \sum_{i > 0} (size[B[i]] - 1) * stride[B[i]] + stride[d]
// <= \sum_{i > 0} (size[B[i]] - 1) * stride[B[i]] + (size[d] - 1) * stride[d]
// <= \sum{j > B[0]} (size[j] - 1) * stride[j],
//
// where the first < comes from sorting and
// the second <= comes from the fact that dimension d
// exists after step (1) and
// thus must have size greater
// than 1
// the third <= comes from the fact that each term in
// the sum is non-negative
//
// Then we have a contradiction as the invariant must not be
// satisfied at B[0]. So the original proposition is true.
//
// Now that we established the above claim (***), we consider the
// view operation as first sorting the dimensions (i.e., blocks),
// apply the original view (since it only cares about dimensions being
// consecutive and contiguous within each block), and then undo
// the sort.
//
// Consider a single block B in the output,
// ... ], [ B[0], ..., B[i], ..., B[k] = B[-1] ], [ ...
// start--^^^^ ^^^^^^^^^^^^--end
//
// By (*), we know that for all i
// stride[i] = stride[B[-1]] +
// \sum_{j=i+1}^{k} (size[B[j]] - 1) * stride[B[j]]
//
// Then the invariant is obviously satisfied at every dimension
// in this block if it is satisfied at dimension B[-1]. It only
// remains to show that it is satisfied at the last dimension in
// each block.
//
// Since the same blocks are present in both input and output
// with the same ordering, we will abuse the notation in the
// following statements.
//
// By (*), we know that the following holds for both input and
// output, for any block B:
// \sum_{i > B[-1]} (size[i] - 1) * stride[i]
// = \sum_{block B' after B} ((\prod_{j in B'} size[j]) - 1) * stride[B'[-1]]
// = \sum_{block B' after B} (numel(B') - 1) * stride[B'[-1]].
//
// By (**), numel(B') and stride[B'[-1]] are the same in input and
// output for each block B', so the right-hand side above remains
// the same in input and output. So both
// \sum_{i > B[-1]} (size[i] - 1) * stride[i]
// and
// stride[B[-1]]
// are the same in input and output.
//
// These two quantities are exactly the LHS and RHS of the
// invariant inequality. Since by assumption the invariant is
// satisfied in input at B[-1], it is also satisfied in output at
// B[-1]. This concludes the proof.
//
// squeeze: Special case of view
//
// unsqueeze: Special case of view
//
// slice: Consider slicing dimension i with step = k >= 1.
//
// Let stride' and size' be the output strides and sizes. We have
//
// stride'[i] = k * stride[i]
// size'[i] <= ceil(size[i] / k) = 1 + (size[i] - 1) / k (integer division)
//
// If size'[i] = 1, the invariant is obviously satisfied as we are
// just removing a dimension (after step (1)).
//
// Assume size'[i] > 1.
//
// By assumption, the invariant is satisfied at every dimension
// in input.
//
// For any dimension j, if stride[j] > stride[i], we have
// stride'[j] = stride[j]
// > (size[i] - 1) * stride[i]
// >= k * (size'[i] - 1) * stride[i] (since size'[i] - 1 <= (size[i] - 1) / k)
// = (size'[i] - 1) * stride'[i]
// >= stride'[i].
//
// If stride[j] < stride[i], we have
// stride'[j] = stride[j] < stride[i] <= stride'[i].
//
// So the sorting order remains unchanged after slice.
//
// Since
// (size'[i] - 1) * stride'[i]
// = k * (size'[i] - 1) * stride[i]
// <= (size[i] - 1) * stride[i],
// the term from this dimension i in the invariant inequality at
// other dimensions cannot increase after slice. So the
// invariant is preserved.
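//
// For example (illustrative): sizes [4, 6], strides [6, 1], slicing
// dimension 1 with step k = 2 gives size' = [4, 3], stride' = [6, 2].
// The invariant at dimension 0 becomes 6 > (3 - 1) * 2 = 4, which
// still holds (in the input it was 6 > (6 - 1) * 1 = 5).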
//
// narrow: Special case of slice
//
// select: narrow + squeeze
//
// permute: Sorting makes permutation of dimensions irrelevant
//
// transpose: Sorting makes swapping dimensions irrelevant
//
// diagonal: Effectively merging two dimensions i and j into a new
// dimension k s.t.
// stride'[k] = stride[i] + stride[j]
// size'[k] <= min(size[i], size[j]),
// where stride and size are on the input, and stride' and size'
// are on the output.
//
// Assume that size[i] > 1 and size[j] > 1. If either has size 1,
// then this is an unsqueeze on that dimension.
//
// WLOG, say stride[i] <= stride[j].
//
// Each dimension d in input with stride[d] > stride[j] has
// stride'[d] = stride[d]
// > (size[i] - 1) * stride[i] + (size[j] - 1) * stride[j]
// >= stride[i] + stride[j]
// = stride[k].
// So, considering the sorted dimensions, this is effectively
// removing i, and replacing j with k.
//
// For dimensions d with stride[i] < stride[d] < stride[j], the
// term from dimension i is removed in the invariant inequality.
// For dimensions d with stride[d] > stride[j], we have
// (size'[k] - 1) * stride'[k]
// <= (min(size[i], size[j]) - 1) * (stride[i] + stride[j])
// <= (size[i] - 1) * stride[i] + (size[j] - 1) * stride[j],
// so the term from i and j in the invariant can only decrease.
//
// The invariant at k itself follows from the invariant at j in the
// input: stride'[k] = stride[i] + stride[j] > stride[j], and (by the
// first chain above) the dimensions sorted below k in the output are
// a subset of those sorted below j in the input.
//
// So this generally relaxes the constraint, and thus the invariant
// is preserved.
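//
// For example (illustrative): sizes [2, 3, 3], strides [9, 3, 1],
// taking the diagonal of the last two dimensions gives sizes'
// [2, 3] and strides' [9, 4] (the merged dimension k has
// stride'[k] = 3 + 1 = 4 and size'[k] = 3). The invariant at the
// stride-9 dimension still holds: 9 > (3 - 1) * 4 = 8, just as
// 9 > (3 - 1) * 3 + (3 - 1) * 1 = 8 held in the input.
//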
// This implements steps (2)~(4) of the algorithm in
// NOTE [ Detecting Memory Overlap Within A Strided Tensor ]
// Helper for as_strided_backward
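// For example (illustrative): sizes [2, 3] with strides [3, 1] pass the check
// (1 > 0 and 3 > (3 - 1) * 1 = 2), so no overlap is reported; sizes [2, 2]
// with strides [1, 1] fail it (the second stride 1 <= (2 - 1) * 1 = 1), so the
// memory may overlap.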
static inline bool _maybe_overlapping_memory(IntArrayRef sizes, IntArrayRef strides) {
if (sizes.size() > 0) {
std::vector<std::size_t> argsort(sizes.size());
std::iota(argsort.begin(), argsort.end(), 0);
std::sort(argsort.begin(), argsort.end(),
[&](std::size_t i, std::size_t j){ return strides[i] < strides[j]; });
int64_t max_index_in_slice = 0;
for (auto i : argsort) {
auto stride_ = strides[i];
if (stride_ <= max_index_in_slice) {
return true;
}
max_index_in_slice += stride_ * (sizes[i] - 1);
}
}
return false;
}
// Returns the minimum storage size needed to contain a tensor of sizes, strides, and storage_offset
// Helper for as_strided_backward
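// For example (illustrative): sizes [2, 3], strides [3, 1], storage_offset 2
// gives 2 + 1 + (2 - 1) * 3 + (3 - 1) * 1 = 8.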
static inline int64_t _min_storage_size(IntArrayRef sizes, IntArrayRef strides, int64_t storage_offset) {
int64_t storage_size = storage_offset + 1;
int64_t dim = sizes.size();
for (int64_t i = 0; i < dim; i++) {
auto size_i = sizes[i];
if (size_i == 0) {
return storage_offset;
}
storage_size += (size_i - 1) * strides[i];
}
return storage_size;
}
// See NOTE [ as_strided Backward and layout-aware/agnostic autograd ] for explanation
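// For example (illustrative): if the input is a contiguous 1-D tensor of size 4
// (storage_offset 0) and the forward call was input.as_strided([3], [1], 1),
// then neither geometry overlaps, so the backward creates a zero "storage"
// tensor of size 4, copies grad into storage.as_strided([3], [1], 1), and
// returns storage.as_strided([4], [1], 0), i.e., [0, grad[0], grad[1], grad[2]].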
Tensor as_strided_backward(Tensor grad, TensorGeometry input_geometry, IntArrayRef sizes, IntArrayRef strides, optional<int64_t> storage_offset_) {
// For output geometry,
// check for size 0 dimensions,
// skip size 1 dimensions,
// reduce grad on expanded dims (stride=0, size>1)
// Step (0) for the algorithm in NOTE [ as_strided Backward and layout-aware/agnostic autograd ]
// Step (0)~(1) for the algorithm in NOTE [ Detecting Memory Overlap Within A Strided Tensor ]
// on output geometry
auto storage_offset = storage_offset_.value_or(input_geometry.storage_offset());
auto odim = grad.dim();
std::vector<int64_t> out_sizes_, out_strides_;
out_sizes_.reserve(odim);
out_strides_.reserve(odim);
for (int64_t i = odim - 1; i >= 0; i--) {
auto size_i = sizes[i];
auto stride_i = strides[i];
if (size_i == 0) {
return at::zeros(input_geometry.sizes(), grad.options());
} else if (size_i == 1) {
grad = grad.squeeze(i);
} else if (stride_i == 0) {
grad = grad.sum(i, false);
} else {
out_sizes_.insert(out_sizes_.begin(), size_i);
out_strides_.insert(out_strides_.begin(), stride_i);
}
}
// Step (2)~(4) for the algorithm in NOTE [ Detecting Memory Overlap Within A Strided Tensor ]
// on output geometry
auto out_maybe_overlap = _maybe_overlapping_memory(out_sizes_, out_strides_);
// For input geometry,
// check for size 0 dimensions,
// skip size 1 dimensions,
// Step (0)~(1) for the algorithm in NOTE [ Detecting Memory Overlap Within A Strided Tensor ]
// on input geometry
auto idim = input_geometry.dim();
IntArrayRef inp_sizes = input_geometry.sizes(), inp_strides = input_geometry.strides();
std::vector<int64_t> inp_sizes_, inp_strides_;
inp_sizes_.reserve(idim);
inp_strides_.reserve(idim);
for (int64_t i = idim - 1; i >= 0; i--) {
auto size_i = inp_sizes[i];
auto stride_i = inp_strides[i];
if (size_i == 0) {
return at::zeros(input_geometry.sizes(), grad.options());
} else if (size_i != 1) {
inp_sizes_.insert(inp_sizes_.begin(), size_i);
inp_strides_.insert(inp_strides_.begin(), stride_i);
}
}
// Step (1)~(4) for the algorithm in NOTE [ Detecting Memory Overlap Within A Strided Tensor ]
// on input geometry
auto inp_maybe_overlap = _maybe_overlapping_memory(inp_sizes_, inp_strides_);
// Rest of this function implements
// Step (1)~(4) for the algorithm in NOTE [ as_strided Backward and layout-aware/agnostic autograd ]
// TODO: Raise if not all output values are visible in input geometry.
// Technically speaking, if you treat those values as constants, not
// raising is fine, and mathematically correct. However, these values
// really are contained in some base tensor, and by treating them as
// constants we are ignoring this tight dependency. Therefore, it is
// more sensible to raise here.
// Step (1): create underlying tensor as "storage"
auto shared_offset = std::min(input_geometry.storage_offset(), storage_offset);
auto inp_effective_offset = input_geometry.storage_offset() - shared_offset;
auto out_effective_offset = storage_offset - shared_offset;
auto base_size = std::max(
_min_storage_size(inp_sizes_, inp_strides_, inp_effective_offset),
_min_storage_size(out_sizes_, out_strides_, out_effective_offset)
);
auto storage = grad.new_zeros({base_size});
// prepare indices tensor if we will do index_add_ later
c10::optional<at::Tensor> flatten_full_indices;
if (inp_maybe_overlap || out_maybe_overlap) {
flatten_full_indices = at::arange(0, base_size, grad.options().dtype(at::kLong));
}
// Step (2): use output geometry to scatter gradients into storage
if (out_maybe_overlap) {
auto out_indices = flatten_full_indices->as_strided(out_sizes_, out_strides_, out_effective_offset);
storage.index_add_(0, out_indices.reshape(-1), grad.reshape(-1));
} else {
// assume that new tensors have 0 storage offset
storage.as_strided(out_sizes_, out_strides_, out_effective_offset).copy_(grad);
}
// Step (3): if input tensor has overlapping memory, divide scattered gradient
// at storage[i] by the number of times i shows up in input geometry
if (inp_maybe_overlap) {
auto count = at::zeros_like(storage, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto inp_indices = flatten_full_indices->as_strided(inp_sizes_, inp_strides_, inp_effective_offset).reshape(-1);
count.index_add_(0, inp_indices, at::ones({1}, grad.options()).expand_as(inp_indices));
storage.div_(count); // this will give nan outside visible range
}
// Step (4): return as_strided view of the storage tensor with input geometry
return storage.as_strided(inp_sizes, inp_strides, inp_effective_offset);
}
std::tuple<Tensor, Tensor> atan2_backward(const Tensor& grad, const Tensor& self, const Tensor& other, std::array<bool, 2> output_mask) {
if (!grad.defined()) {
return std::tuple<Tensor, Tensor>{Tensor(), Tensor()};
}
auto recip = (self * self + other * other).reciprocal();
return std::tuple<Tensor,Tensor>{
output_mask[0] ? grad * other * recip : Tensor(),
output_mask[1] ? grad * -self * recip : Tensor() };
}
// TODO: Seriously consider writing the derivative formulas for
// each output separately; there is not all that much sharing
// of computation going on here.
std::tuple<Tensor, Tensor, Tensor> prelu_double_backward(
const Tensor & grad_grad_input,
const Tensor & grad_grad_weight,
const Tensor & grad_out,
const Tensor & input_,
const Tensor & weight_) {
if (!(grad_grad_input.defined() || grad_grad_weight.defined() || grad_out.defined())) {
return std::tuple<Tensor, Tensor, Tensor>(Tensor(), Tensor(), Tensor());
}
auto input = input_.contiguous();
auto weight = weight_.contiguous();
// Zero-fill undefined grads (TODO: do this more efficiently)
auto ggI = grad_grad_input.defined() ? grad_grad_input.contiguous() : at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto ggW = grad_grad_weight.defined() ? grad_grad_weight.contiguous() : at::zeros_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto gO = grad_out.defined() ? grad_out.contiguous() : at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto positive_mask = (input > 0).type_as(ggI);
auto nonpositive_mask = (input <= 0).type_as(ggW);
// Explanation: Let input be i, weight be w, grad_output be gO.
// f(i, w) = i if i > 0
// = w * i if i <= 0
// gI = df/di * gO = gO      if i > 0
//                 = gO * w  if i <= 0
// gW = df/dw * gO = 0       if i > 0
//                 = gO * i  if i <= 0
// The rest is taking derivatives of these wrt i, w, gO and summing/expanding properly.
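// In short, the three values returned below are (when defined), with 1[.]
// denoting the masks above:
//   ggO = ggI * (1[i > 0] + w * 1[i <= 0]) + ggW * i * 1[i <= 0]
//   gI' = ggW * gO * 1[i <= 0]                          (grad w.r.t. input)
//   gW' = ggI * gO * 1[i <= 0], summed to w's shape     (grad w.r.t. weight)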
if (weight.numel() == 1) {
// from PReLU.forward: num_parameters == 0 is used to indicate that a
// single weight is shared among all input channels.
// this is a little tricky because PReLU currently doesn't take a shape so the weight may be
// 1-d when the input is a scalar (and there isn't a good Parameter API for that anyway until Variable
// and tensor are merged). So, use weight and ggW as 0-dim in this case.
bool scalar_input_1d_weight = (positive_mask.dim() == 0 && weight.dim() == 1);
auto weight_maybe_squeeze = scalar_input_1d_weight ? weight.squeeze() : weight;
auto ggW_maybe_squeeze = scalar_input_1d_weight ? ggW.squeeze() : ggW;
auto mask = positive_mask + nonpositive_mask * weight_maybe_squeeze.expand_as(input);
auto ggO = ggI * mask + ggW_maybe_squeeze.expand_as(gO) * (nonpositive_mask * input);
return std::tuple<Tensor, Tensor, Tensor>(
ggO,
ggW_maybe_squeeze.expand_as(gO) * gO * nonpositive_mask,
(ggI * gO * nonpositive_mask).sum().expand_as(weight)
);
} else {
// Expand ggW to match size of ggI; a simple expand doesn't work because
// ggW is the size of the input channel (dim==1 unless there is only 1 dimension). For example,
// let ggI be size (3,4,5,6,7) and ggW be size (4). Then we unsqueeze ggW to be size (4,1,1,1)
// so the expand succeeds.
auto dims_to_unsqueeze = std::max<int64_t>(input.dim() - 2, 0);
auto ggW_expanded = ggW;
for (int64_t i = 0; i < dims_to_unsqueeze; i++) {
ggW_expanded = ggW_expanded.unsqueeze(1);
}
ggW_expanded = ggW_expanded.expand_as(ggI);
auto gI = ggW_expanded * gO * nonpositive_mask;
auto gW = ggI * gO * nonpositive_mask;
if (input.dim() > 1) {
gW = gW.sum(0);
}
while (gW.dim() > 1) {
gW = gW.sum(1);
}
Tensor ggO;
if (gO.requires_grad()) {
// expand weight as input as in ggW/ggI above
auto weight_expanded = weight;
for (int64_t i = 0; i < dims_to_unsqueeze; i++) {
weight_expanded = weight_expanded.unsqueeze(1);
}
weight_expanded = weight_expanded.expand_as(input);
auto mask = positive_mask + nonpositive_mask * weight_expanded;
ggO = ggI * mask + ggW_expanded * nonpositive_mask * input;
}
return std::tuple<Tensor,Tensor,Tensor>{ggO, gI, gW};
}
}
// https://j-towns.github.io/papers/svd-derivative.pdf
//
// This makes no assumption on the signs of sigma.
Tensor svd_backward(const std::vector<torch::autograd::Variable> &grads, const Tensor& self,
bool some, bool compute_uv, const Tensor& raw_u, const Tensor& sigma, const Tensor& raw_v) {
TORCH_CHECK(compute_uv,
"svd_backward: Setting compute_uv to false in torch.svd doesn't compute singular matrices, ",
"and hence we cannot compute backward. Please use torch.svd(compute_uv=True)");
auto m = self.size(-2);
auto n = self.size(-1);
auto k = sigma.size(-1);
auto gsigma = grads[1];
auto u = raw_u;
// Currently torch.svd for complex dtypes returns the conjugate of V,
// while the backward formula is derived with just V (without the conjugation)
// therefore here we need to conjugate the V output of SVD and grads[2].
// Once https://github.com/pytorch/pytorch/issues/45821 is resolved
// extra .conj(), that are marked below in the code, shall be removed.
auto v = raw_v.conj(); // TODO: remove .conj()
auto gu = grads[0];
auto gv = grads[2].conj(); // TODO: remove .conj()
if (!some) {
// We ignore the free subspace here because possible base vectors cancel
// each other, e.g., both -v and +v are valid basis vectors for a dimension.
// Don't assume behavior of any particular implementation of svd.
u = raw_u.narrow(-1, 0, k);
v = raw_v.narrow(-1, 0, k).conj(); // TODO: remove .conj()
if (gu.defined()) {
gu = gu.narrow(-1, 0, k);
}
if (gv.defined()) {
gv = gv.narrow(-1, 0, k);
}
}
auto vh = v.conj().transpose(-2, -1);
Tensor sigma_term;
if (gsigma.defined()) {
gsigma = gsigma.to(self.dtype());
// computes u @ diag(gsigma) @ vh
sigma_term = at::matmul(u * gsigma.unsqueeze(-2), vh);
} else {
sigma_term = at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
// in case that there are no gu and gv, we can avoid the series of kernel
// calls below
if (!gv.defined() && !gu.defined()) {
return sigma_term;
}
auto uh = u.conj().transpose(-2, -1);
auto im = at::eye(m, self.options());
auto in = at::eye(n, self.options());
auto sigma_mat = sigma.diag_embed(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).to(self.dtype());
auto sigma_mat_inv = sigma.pow(-1).diag_embed(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).to(self.dtype());
auto sigma_sq = sigma.pow(2);
auto F = sigma_sq.unsqueeze(-2) - sigma_sq.unsqueeze(-1);
// The following two lines invert the values of F and fill the diagonal with 0s.
// Notice that F currently has 0s on its diagonal, so we fill the diagonal with
// +inf first to prevent nan from appearing in the backward of this function.
F.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(INFINITY);
F = F.pow(-1);
Tensor u_term, v_term;
if (gu.defined()) {
auto guh = gu.conj().transpose(-2, -1);
u_term = at::matmul(u, at::matmul(F.mul(at::matmul(uh, gu) - at::matmul(guh, u)), sigma_mat));
if (m > k) {
u_term = u_term + at::matmul(im - at::matmul(u, uh), at::matmul(gu, sigma_mat_inv));
}
u_term = at::matmul(u_term, vh);
} else {
u_term = at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
if (gv.defined()) {
auto gvh = gv.conj().transpose(-2, -1);
v_term = at::matmul(sigma_mat, at::matmul(F.mul(at::matmul(vh, gv) - at::matmul(gvh, v)), vh));
if (n > k) {
v_term = v_term + at::matmul(sigma_mat_inv, at::matmul(gvh, in - at::matmul(v, vh)));
}
v_term = at::matmul(u, v_term);
} else {
v_term = at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
// for complex-valued input there is an additional term
// https://giggleliu.github.io/2019/04/02/einsumbp.html
// https://arxiv.org/abs/1909.02659
if (self.is_complex() && gu.defined()) {
// computes L = Identity.mul(uh @ gu)
Tensor L = at::matmul(uh, gu).diagonal(0, -2, -1).diag_embed(0, -2, -1);
L = L - L.conj().transpose(-2, -1);
Tensor imag_term = 0.5 * at::matmul(at::matmul(at::matmul(u, L), sigma_mat_inv), vh);
return u_term + sigma_term + v_term + imag_term;
}
return u_term + sigma_term + v_term;
}
// "An extended collection of matrix derivative results for forward and reverse mode algorithmic differentiation"
// https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf
Tensor eig_backward(const std::vector<torch::autograd::Variable> &grads, const Tensor& self,
bool eigenvectors, const Tensor& lambda, const Tensor& v) {
// This gradient only works for real eigenvalues at the moment.
TORCH_CHECK(eigenvectors,
"eig_backward: Setting eigenvectors to false in torch.eig doesn't compute eigenvectors ",
"and hence we cannot compute backward. Please use torch.eig(eigenvectors=True)");
auto zeros = at::zeros({1}, lambda.options());
TORCH_CHECK(
at::allclose(lambda.slice(/*dim=*/-1, /*start=*/1, /*end=*/2), zeros),
"eig_backward: Backward calculation does not support complex eigenvalues at the moment.");
auto glambda = grads[0];
auto gv = grads[1];
auto vt = v.transpose(-2, -1);
Tensor result;
// contribution from the eigenvectors
if (gv.defined()) {
auto rlambda = lambda.slice(/*dim=*/-1, /*start=*/0, /*end=*/1);
auto hm = rlambda.transpose(-2,-1) - rlambda;
hm.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(INFINITY);
hm.pow_(-1.0);
auto gvortho = gv - at::sum(gv * v, /*dim=*/-2, /*keepdim=*/true) * v;
auto B = hm * at::matmul(vt, gvortho);
auto A = at::matmul(B, vt);
std::tie(result, std::ignore) = at::solve(A, vt);
}
// contribution from eigenvalues
if (glambda.defined()) {
auto grlambda = glambda.slice(/*dim=*/-1, /*start=*/0, /*end=*/1) * vt;
auto A = at::matmul(v, grlambda);
auto vvt = at::matmul(v, vt);
if (result.defined()) {
Tensor result1;
std::tie(result1, std::ignore) = at::solve(A, vvt);
result = result.add(result1);
}
else {
std::tie(result, std::ignore) = at::solve(A, vvt);
}
}
return result;
}
// http://eprints.maths.ox.ac.uk/1079/1/NA-08-01.pdf
Tensor symeig_backward(const std::vector<torch::autograd::Variable> &grads, const Tensor& self,
bool eigenvectors, bool upper, const Tensor& lambda, const Tensor& v) {
// This gradient is symmetric, and not triangular.
// symeig operates only on symmetric inputs, which is a subspace of
// R^{n x n}, and hence the derivative is not well-defined for off-diagonal
// elements. We resolve this by taking the gradient of the functionally independent
// elements of the matrix (i.e., the lower triangular portion of the input) and then
// reflect it on the upper triangular portion, thereby symmetrizing the gradient of
// the symeig operation. The motivation behind this choice is that symmetric gradient
// leads to stable gradient updates, and retains symmetry of the updated matrix if it
// were updated by a gradient based algorithm.
TORCH_CHECK(eigenvectors,
"symeig_backward: Setting eigenvectors to false in torch.symeig doesn't compute eigenvectors ",
"and hence we cannot compute backward. Please use torch.symeig(eigenvectors=True)");
auto glambda = grads[0];
auto gv = grads[1];
auto vh = v.conj().transpose(-2, -1);
Tensor result;
if (gv.defined()) {
Tensor F = lambda.unsqueeze(-2) - lambda.unsqueeze(-1);
F.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(INFINITY);
F.pow_(-1);
result = at::matmul(v, at::matmul(F * at::matmul(vh, gv), vh));
} else {
result = at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
if (glambda.defined()) {
glambda = glambda.to(self.dtype());
// computes v @ diag(glambda) @ vh
Tensor glambda_term = at::matmul(v * glambda.unsqueeze(-2), vh);
if (at::inplaceIsVmapCompatible(result, glambda_term)) {
result.add_(glambda_term);
} else {
result = result + glambda_term;
}
}
return result.add(result.conj().transpose(-2, -1)).mul_(0.5);
}
Tensor linalg_qr_backward(const std::vector<torch::autograd::Variable> &grads, const Tensor& self,
std::string mode, const Tensor& q, const Tensor& r){
bool compute_q, reduced;
std::tie(compute_q, reduced) = at::native::_parse_qr_mode(mode);
TORCH_CHECK(compute_q, "The derivative of qr is not implemented when mode='r'. "
"Please use torch.linalg.qr(..., mode='reduced')");
auto square_deep_case_backward = [](const Tensor& grad_Q,
const Tensor& grad_R,
const Tensor& A,
const Tensor& Q,
const Tensor& R) -> Tensor {
// For the square and deep (tall) case we refer to:
// Matthias Seeger, Asmus Hetzel, Zhenwen Dai, Eric Meissner, Neil D. Lawrence (2018). Auto-Differentiating Linear Algebra.
// https://arxiv.org/abs/1710.08717 Section 4.3 LQ Decomposition (Note that LQ decomposition is the transpose of QR decomposition)
// Hai-Jun Liao, Jin-Guo Liu, Lei Wang, Tao Xiang (2019). Differentiable Programming Tensor Networks.
// https://arxiv.org/abs/1903.09650 Section 3. QR factorization
// For derivations of complex-valued input case, see https://giggleliu.github.io/2019/04/02/einsumbp.html
// Compute R grad_R^H
Tensor R_term;
if (grad_R.defined()) {
R_term = at::matmul(R, grad_R.conj().transpose(-2, -1));
} else {
// R is ... x N x N, grad_R is ... x N x N and grad_R.T is ... x N x N
R_term = at::zeros_like(R, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
// Compute grad_Q^H Q
Tensor Q_term;
if (grad_Q.defined()) {
Q_term = at::matmul(grad_Q.conj().transpose(-2, -1), Q);
} else {
// Q is ... x M x N, Q.T is ... x N x M and grad_Q is ... x M x N
Q_term = at::zeros_like(R, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
Tensor M = R_term - Q_term;
// Symmetrize: M = tril(M) + tril(M).conj().transpose(-2, -1), then halve its diagonal
Tensor M_tril = at::tril(M);
M = M_tril + M_tril.conj().transpose(-2, -1);
M.diagonal(0, -2, -1).mul_(0.5);
Tensor rhs_term;
if (grad_Q.defined()) {
rhs_term = grad_Q + at::matmul(Q, M);
} else {
rhs_term = at::matmul(Q, M);
}
// We want to compute: (rhs_term @ R^{-H})
// Note that (rhs_term @ R^{-H}) = (R^{-1} @ rhs_term^H)^H
// Since R is upper triangular, we can do this using
// triangular_solve(rhs_term^H, R)^H
Tensor grad_A;
std::tie(grad_A, std::ignore) = at::triangular_solve(
rhs_term.conj().transpose(-2, -1),
R,
/*upper=*/true,
/*transpose=*/false,
/*unitriangular=*/false);
return grad_A.conj().transpose(-2, -1);
};
auto m = self.size(-2);
auto n = self.size(-1);
TORCH_CHECK(
((m <= n && (!reduced)) || reduced),
"The derivative of qr is not implemented when mode='complete' and nrows > ncols.");
auto grad_Q = grads[0];
auto grad_R = grads[1];
if (m >= n) {
return square_deep_case_backward(grad_Q, grad_R, self, q, r);
} else {
// For wide (m < n) input matrices A, partition A = [X|Y] and R = [U|V]
// X and U are square full rank matrices. We will partition grads,
// grad_R = [grad_U | grad_V] and grad_A = [grad_X | grad_Y].
// To obtain grad_X we reuse the gradient formula from the square case.
// Formulae: grad_X = square_case_grad(grad_Q_prime, grad_U, Q, U),
// where grad_Q_prime = grad_Q + Y @ grad_V^H
// and grad_Y = Q @ grad_V.
// Then concatenate grads to get grad_A = [grad_X | grad_Y].
auto Y = self.narrow(-1, m, n - m);
auto U = r.narrow(-1, 0, m);
Tensor grad_Y, grad_X, grad_V, grad_Q_prime;
if (grad_R.defined()) {
grad_V = grad_R.narrow(-1, m, n - m);
// reuse grad_R to store grad_U
grad_R = grad_R.narrow(-1, 0, m);
// grad_Q_prime starts with the value of Y @ grad_V^H
grad_Q_prime = at::matmul(Y, grad_V.conj().transpose(-2, -1));
} else {
// when grad_R is not defined then grad_V and grad_Q_prime
// get initialized with zeros
grad_V = at::zeros_like(Y, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
grad_Q_prime = at::zeros_like(q, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
if (grad_Q.defined()) {
// add the grad_Q term into grad_Q_prime when defined o/w is 0
grad_Q_prime = grad_Q_prime + grad_Q;
}
// Calculate grad_X using the helper. Grad_R contains the grad_U value
grad_X = square_deep_case_backward(grad_Q_prime, grad_R, self, q, U);
grad_Y = at::matmul(q, grad_V);
// Concatenate grad_X and grad_Y to get grad_A.
return at::cat({grad_X, grad_Y}, -1);
}
}
// Invertible case is derived from Jacobi's formula, and also can be found at:
// http://eprints.maths.ox.ac.uk/1079/1/NA-08-01.pdf
Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) {
auto singular_case_backward = [&](const Tensor& grad, const Tensor& self, const Tensor& det) -> Tensor {
Tensor u, sigma, v;
std::tie(u, sigma, v) = self.svd();
auto gsigma = prod_backward(grad.unsqueeze(-1), sigma, det.unsqueeze(-1));
return svd_backward({{}, gsigma, {}}, self, true, true, u, sigma, v);
};
auto nonsingular_case_backward = [&](const Tensor& grad, const Tensor& self, const Tensor& det) -> Tensor {
return unsqueeze_multiple(grad * det, {-1, -2}, self.dim()) * self.inverse().transpose(-2, -1);
};
if (self.dim() == 2) {
if (det.item<double>() == 0) {
return singular_case_backward(grad, self, det);
} else {
return nonsingular_case_backward(grad, self, det);
}
} else {
auto nonzero_det_indices = at::native::toListOfOptionalTensors(at::where(det));
c10::optional<Tensor> first_nonzero_det_index = nonzero_det_indices[0];
if (first_nonzero_det_index->size(0) == det.numel()) { // all determinants are nonzero (non-singular)
return nonsingular_case_backward(grad, self, det);
}
auto zero_det_indices = at::native::toListOfOptionalTensors(at::where(det == 0));
c10::optional<Tensor> first_zero_det_index = zero_det_indices[0];
if (first_zero_det_index->size(0) == det.numel()) { // all determinants are zero (singular)
return singular_case_backward(grad, self, det);
}
Tensor grad_det = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
// invertible case
grad_det.index_put_(/*indices=*/nonzero_det_indices,
/*value=*/nonsingular_case_backward(grad.index(nonzero_det_indices),
self.index(nonzero_det_indices),
det.index(nonzero_det_indices)));
// non-invertible case, uses SVD
grad_det.index_put_(/*indices=*/zero_det_indices,
/*value=*/singular_case_backward(grad.index(zero_det_indices),
self.index(zero_det_indices),
det.index(zero_det_indices)));
return grad_det;
}
}
Tensor logdet_backward(const Tensor & grad, const Tensor& self, const Tensor& logdet) {
auto singular_case_backward = [&](const Tensor& grad, const Tensor& self) -> Tensor {
Tensor u, sigma, v;
std::tie(u, sigma, v) = self.svd();
// logdet = \sum log(sigma)
auto gsigma = grad.unsqueeze(-1).div(sigma);
return svd_backward({{}, gsigma, {}}, self, true, true, u, sigma, v);
};
auto nonsingular_case_backward = [&](const Tensor& grad, const Tensor& self) -> Tensor {
return unsqueeze_multiple(grad, {-1, -2}, self.dim()) * self.inverse().transpose(-2, -1);
};
if (self.dim() == 2) {
if (logdet.item<double>() != -INFINITY) {
return nonsingular_case_backward(grad, self);
} else {
return singular_case_backward(grad, self);
}
} else {
auto finite_logdet_indices = at::native::toListOfOptionalTensors(at::where(logdet != -INFINITY));
c10::optional<Tensor> first_finite_logdet_index = finite_logdet_indices[0];
if (first_finite_logdet_index->size(0) == logdet.numel()) { // all log determinants are finite (non-singular)
return nonsingular_case_backward(grad, self);
}
auto neginf_logdet_indices = at::native::toListOfOptionalTensors(at::where(logdet == -INFINITY));
c10::optional<Tensor> first_neginf_logdet_index = neginf_logdet_indices[0];
if (first_neginf_logdet_index->size(0) == logdet.numel()) { // all log determinants are -inf (singular)
return singular_case_backward(grad, self);
}
Tensor grad_logdet = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
// invertible case
grad_logdet.index_put_(/*indices=*/finite_logdet_indices,
/*value=*/nonsingular_case_backward(grad.index(finite_logdet_indices),
self.index(finite_logdet_indices)));
// non-invertible case, uses SVD
grad_logdet.index_put_(/*indices=*/neginf_logdet_indices,
/*value=*/singular_case_backward(grad.index(neginf_logdet_indices),
self.index(neginf_logdet_indices)));
return grad_logdet;
}
}
Tensor slogdet_backward(const Tensor& grad_logabsdet,
const Tensor& self,
const Tensor& signdet, const Tensor& logabsdet) {
auto singular_case_backward = [&](const Tensor& grad_logabsdet, const Tensor& self) -> Tensor {
Tensor u, sigma, v;
std::tie(u, sigma, v) = self.svd();
// sigma has all non-negative entries (with at least one zero entry, since
// det = 0), so logabsdet = \sum log(abs(sigma)) = \sum log(sigma),
// and we backprop through \sum log(sigma).
auto gsigma = grad_logabsdet.unsqueeze(-1).div(sigma);
return svd_backward({{}, gsigma, {}}, self, true, true, u, sigma, v);
};
auto nonsingular_case_backward = [&](const Tensor& grad_logabsdet, const Tensor& self) -> Tensor {
return unsqueeze_multiple(grad_logabsdet, {-1, -2}, self.dim()) * self.inverse().transpose(-2, -1);
};
if (self.dim() == 2) {
if (signdet.item<double>() == 0) {
return singular_case_backward(grad_logabsdet, self);
} else {
return nonsingular_case_backward(grad_logabsdet, self);
}
} else {
auto nonzero_signdet_indices = at::native::toListOfOptionalTensors(at::where(signdet));
c10::optional<Tensor> first_nonzero_signdet_index = nonzero_signdet_indices[0];
if (first_nonzero_signdet_index->size(0) == logabsdet.numel()) { // all determinants are nonzero (non-singular)
return nonsingular_case_backward(grad_logabsdet, self);
}
auto zero_signdet_indices = at::native::toListOfOptionalTensors(at::where(signdet == 0));
c10::optional<Tensor> first_zero_signdet_index = zero_signdet_indices[0];
if (first_zero_signdet_index->size(0) == logabsdet.numel()) { // all determinants are zero (singular)
return singular_case_backward(grad_logabsdet, self);
}
Tensor grad_slogdet = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
// invertible case
grad_slogdet.index_put_(/*indices=*/nonzero_signdet_indices,
/*value=*/nonsingular_case_backward(grad_logabsdet.index(nonzero_signdet_indices),
self.index(nonzero_signdet_indices)));
// non-invertible case, uses SVD
grad_slogdet.index_put_(/*indices=*/zero_signdet_indices,
/*value=*/singular_case_backward(grad_logabsdet.index(zero_signdet_indices),
self.index(zero_signdet_indices)));
return grad_slogdet;
}
}
// Reference:
// https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf
// Sec. 2.3.1 Matrix inverse product
std::tuple<Tensor, Tensor> triangular_solve_backward(
const Tensor & grad_x, const Tensor & grad_m,
const Tensor & b, const Tensor & a, const Tensor & x,
const bool upper, const bool transpose, const bool unitriangular,
std::array<bool, 2> output_mask) {
Tensor grad_b, grad_a;
if (grad_x.defined() || grad_m.defined()) {
if (grad_x.defined()) {
grad_b = std::get<0>(grad_x.triangular_solve(a.conj(), upper, !transpose, unitriangular));
if (output_mask[1]) {
grad_a = transpose ? -x.conj().matmul(grad_b.transpose(-1, -2)) : -grad_b.matmul(x.transpose(-1, -2).conj());
if (upper) {
grad_a = grad_a.triu((int) unitriangular);
} else {
grad_a = grad_a.tril(-((int) unitriangular));
}
}
}
if (!grad_a.defined()) {
grad_a = at::zeros({1}, a.options()).expand_as(a);
}
if (!grad_b.defined()) {
grad_b = at::zeros({1}, b.options()).expand_as(b);
}
if (output_mask[1] && grad_m.defined()) {
grad_a = grad_a.add(grad_m);
}
}
return std::tuple<Tensor, Tensor>{grad_b, grad_a};
}
std::tuple<Tensor, Tensor> cholesky_solve_backward(
const Tensor& grad_x, const Tensor& self,
const Tensor& input2, const Tensor& result, const bool upper) {
Tensor grad_self, grad_input2;
if (grad_x.defined()) {
grad_self = grad_x.cholesky_solve(input2, /*upper=*/upper);
Tensor common_term = at::matmul(grad_self, result.conj().transpose(-2, -1));
common_term = common_term + common_term.conj().transpose(-2, -1);
if (upper) {
grad_input2 = -at::matmul(input2, common_term);
} else {
grad_input2 = -at::matmul(common_term, input2);
}
}
return std::tuple<Tensor, Tensor>{grad_self, grad_input2};
}
Tensor fft_c2r_backward(const Tensor& grad, IntArrayRef dim, int64_t normalization) {
// Forward is C2R (onesided)
// Think of onesided C2R irfft as
// 1. fill the other half by conjugate symmetry
// 2. inverse C2C ifft
// 3. discard the complex dimension
// So backward is
// 1. R2C rfft (essentially add dummy complex dimension, and dft)
// 2. accumulate gradient by conjugate symmetry
// since rfft results follow conjugate symmetry, we only need to
// double some entries from onesided rfft results, i.e., the ones with
// their reflected indices also landing out of the onesided range. So
// consider the index of last dim:
// i. idx = 0.
// Reflected to (N - 0) % N = 0. Not doubled.
// ii. 0 < idx < floor(N/2) (last).
// Reflected to N - idx, with N > N - idx > ceil(N/2), i.e.,
// outside the onesided range. Doubled.
// iii. idx = floor(N/2) = N/2 (last) when N even.
// Reflected to (N - N/2) % N = N/2. Not doubled.
// iv. idx = floor(N/2) = (N-1)/2 (last) when N odd.
// Reflected to (N - (N-1)/2) % N = (N+1)/2. Doubled.
// Therefore, needs to double
// idx = 1, 2, ..., N/2 - 1 when N even
// idx = 1, 2, ..., (N-1)/2 when N odd
// that is
// idx = 1, 2, ..., N - (floor(N/2) + 1)
// = 1, 2, ..., N - onesided_length
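// For example (illustrative): with N = 6 the onesided length is 4 and
// indices 1, 2 (= N - 4 of them) are doubled; with N = 5 the onesided
// length is 3 and indices 1, 2 (= N - 3 of them) are doubled.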
auto gI = at::_fft_r2c(grad, dim, normalization, /*onesided=*/true);
auto double_length = grad.size(dim.back()) - gI.size(dim.back());
if (double_length > 0) { // also covers case when signal size is zero
gI.narrow(dim.back(), 1, double_length).mul_(2);
}
return gI;
}
Tensor fft_r2c_backward(const Tensor& grad, IntArrayRef dim, int64_t normalization,
bool onesided, int64_t last_dim_size) {
if (!onesided) {
return at::real(at::_fft_c2c(grad, dim, normalization, /*forward=*/false));
}
// Forward is R2C (onesided)
// Think of onesided R2C rfft as
// 1. view as complex numbers (fill complex dim with zeros)
// 2. C2C fft
// 3. discard half of results
// So backward is
// 1. fill the other half with zeros (with `zero_grad_shape` below)
// (C2C ifft only takes twosided inputs so we need to fill here)
// 2. inverse C2C ifft
// 3. discard the complex dim
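// For example (illustrative): if last_dim_size = 5, the onesided grad has
// size 3 along that dimension, so zero_length = 2 and we pad grad with zeros
// back to size 5 before the inverse C2C transform.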
auto half_sizes = grad.sizes();
at::DimVector new_grad_shape(half_sizes.begin(), half_sizes.end());
const auto last_dim = at::maybe_wrap_dim(dim.back(), half_sizes.size());
new_grad_shape[last_dim] = last_dim_size;
const auto zero_length = last_dim_size - grad.size(dim.back());
auto complex_full_grad = zero_length > 0 ? at::zeros(new_grad_shape, grad.options()) : grad;
if (zero_length > 0) {
complex_full_grad.slice(last_dim, 0, half_sizes[last_dim]).copy_(grad);
}
return at::real(at::_fft_c2c(complex_full_grad, dim, normalization, /*forward=*/false));
}
// Helper for batchnorm_double_backward
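// e.g. (illustrative): an [N, C, H, W] input is reduced to [1, C, 1, 1] with
// keepdim=true, or to [C] with keepdim=false.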
Tensor sum_exclude_dim1(const Tensor& to_sum, bool keepdim=true) {
auto r = to_sum.sum(0, keepdim);
int64_t start_point_exclusive = keepdim ? 1 : 0;
for (int64_t dim = r.dim() - 1; dim > start_point_exclusive; dim--) {
r = r.sum(dim, keepdim);
}
return r;
}
// Helper for batchnorm_double_backward
// similar to expand_as_dim1 below, but doesn't do the expand_as; operates as if
// reductions were done with keepdim=True
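// e.g. (illustrative): src of shape [C] with an [N, C, H, W] target becomes
// [1, C, 1, 1].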
Tensor unsqueeze_dim1(const Tensor& src, const Tensor& target) {
auto src_expanded = src;
while (src_expanded.sizes().size() < target.sizes().size() - 1) {
src_expanded = src_expanded.unsqueeze(1);
}
if (src_expanded.sizes().size() == target.sizes().size() - 1) {
src_expanded = src_expanded.unsqueeze(0);
}
return src_expanded;
}
// Helper for batchnorm_double_backward
// because gamma/ggG/ggB are 1-dimensional and represent dim==1, we can't
// do a straight expansion because it won't follow the broadcasting rules.
Tensor expand_as_dim1(const Tensor& src, const Tensor& target) {
auto src_expanded = src;
while (src_expanded.sizes().size() < target.sizes().size() - 1) {
src_expanded = src_expanded.unsqueeze(1);
}
return src_expanded.expand_as(target);
}
std::tuple<Tensor, Tensor, Tensor> batchnorm_double_backward(
const Tensor & input,
const c10::optional<Tensor> & gamma,
const Tensor & ggI,
const Tensor & ggG,
const Tensor & ggB,
const Tensor & gO,
const c10::optional<Tensor> & running_mean,
const c10::optional<Tensor> & running_var,
bool training,
double eps,
const c10::optional<Tensor> & save_mean,
const c10::optional<Tensor> & save_invstd,
std::array<bool,3> output_mask) {
bool affine = isDefined(gamma);
// TODO: Do we have a ScalarOrTensor type? Would such a thing exist?
Tensor gamma_expanded;
Tensor ggG_expanded, ggB_expanded;
if (affine) {
gamma_expanded = expand_as_dim1(*gamma, input);
if (ggG.defined()) {
ggG_expanded = expand_as_dim1(ggG, input);
}
if (ggB.defined()) {
ggB_expanded = expand_as_dim1(ggB, input);
}
} else {
gamma_expanded = at::ones({}, input.options());
}
// define some terms we will reuse
auto M = input.size(0);
for (auto s : input.sizes().slice(2)) {
M *= s;
}
// for half inputs, save_mean, save_invstd are float (ideally, we would cast
// everything else, but not now)
auto mu = unsqueeze_dim1(training ? toLegacyTensor(save_mean).to(input.scalar_type()) : toLegacyTensor(running_mean), input);
auto input_sub_mu = input - mu;
auto sigma2_eps_neg_1_2 = unsqueeze_dim1(
training ? toLegacyTensor(save_invstd).to(input.scalar_type())
: toLegacyTensor(running_var).add(Scalar(eps)).pow(-0.5),
input);
auto sigma2_eps_neg_1 = sigma2_eps_neg_1_2.pow(2);
auto sigma2_eps_neg_3_2 = sigma2_eps_neg_1_2.pow(3);
// calculate gI
auto input_mu_sigma2_neg_3_2 = input_sub_mu * sigma2_eps_neg_3_2;
auto gOinmu_sum = sum_exclude_dim1(gO * input_sub_mu);
auto gO_sum = sum_exclude_dim1(gO);
Tensor gI;
if (ggI.defined() && training) {
auto ggI_sum = sum_exclude_dim1(ggI);
auto ggIinmu_sum = sum_exclude_dim1(ggI * input_sub_mu);
auto all_sub = ((ggI_sum * gO_sum).div_(M)).sub_(sum_exclude_dim1(gO * ggI)).add_(
(sigma2_eps_neg_1 * gOinmu_sum * ggIinmu_sum).mul_(3. / M));
auto gI_0t = (input_mu_sigma2_neg_3_2 * all_sub).div_(M);
auto gI_1t = (ggIinmu_sum * sigma2_eps_neg_3_2).div_(M) * (gO_sum.div(M) - gO);
auto gI_2t = (gOinmu_sum * sigma2_eps_neg_3_2).div_(M) * (ggI_sum.div(M) - ggI);
gI = gamma_expanded * (gI_0t.add_(gI_1t).add_(gI_2t));
}
// add contribution of gamma term to gI
Tensor gI_G_term;
if (affine && ggG.defined()) {
if (training) {
auto t0 = gO * sigma2_eps_neg_1_2;
auto t1 = (sigma2_eps_neg_1_2 * gO_sum).div_(-M);
auto t2 = (input_mu_sigma2_neg_3_2 * sum_exclude_dim1(gO * input_sub_mu)).div_(-M);
gI_G_term = ggG_expanded * (t0.add_(t1).add_(t2));
gI = gI.defined() ? gI.add_(gI_G_term) : gI_G_term;
} else {
gI_G_term = ggG_expanded * sigma2_eps_neg_1_2 * gO;
gI = gI.defined() ? gI.add_(gI_G_term) : gI_G_term;
}
}
// this is the first backward's grad_input
auto first_back_grad_input = [&](const Tensor& gO, const Tensor& gamma) -> Tensor {
auto h0 = (gamma * sigma2_eps_neg_1_2).div_(M);
auto h1 = (M * gO).sub_(sum_exclude_dim1(gO)).sub_(
input_sub_mu.mul(sigma2_eps_neg_1) * sum_exclude_dim1(gO * input_sub_mu));
return h0 * h1;
};
// calculate gG
Tensor gG;
if (affine && ggI.defined()) {
if (training) {
// gG is just the first backwards with the gamma term removed (then shaped properly)
gG = ggI * first_back_grad_input(gO, at::ones({}, sigma2_eps_neg_1_2.options()));
gG = sum_exclude_dim1(gG, false);
} else {
gG = sum_exclude_dim1(ggI * gO * sigma2_eps_neg_1_2, false);
}
}
// calculate ggO
Tensor ggO;
// contribution of input term
if (ggI.defined()) {
if (training) {
ggO = first_back_grad_input(ggI, gamma_expanded);
} else {
ggO = ggI * sigma2_eps_neg_1_2 * gamma_expanded;
}
}
if (ggG.defined()) {
auto ggO_G_term = ggG_expanded * input_sub_mu * sigma2_eps_neg_1_2;
ggO = ggO.defined() ? ggO.add_(ggO_G_term) : ggO_G_term;
}
if (ggB.defined()) {
auto ggO_B_term = ggB_expanded;
ggO = ggO.defined() ? ggO.add_(ggO_B_term) : ggO_B_term;
}
if (output_mask[1] && !gG.defined()) {
AT_ASSERTM(affine, "gamma should always be defined when it requires grad");
}
return std::tuple<Tensor, Tensor, Tensor>{gI, gG, ggO};
}
std::tuple<Tensor, Tensor, Tensor>
infinitely_differentiable_native_layer_norm_backward(
const Tensor& dY,
const Tensor& dmean,
const Tensor& drstd,
const Tensor& X,
const Tensor& mean,
const Tensor& rstd,
const c10::optional<Tensor>& gamma,
IntArrayRef normalized_shape,
double eps,
std::array<bool, 3> grad_input_mask) {
const int normalized_ndim = normalized_shape.size();
const auto input_shape = X.sizes();
const auto input_ndim = X.dim();
const int axis = input_ndim - normalized_ndim;
const int64_t M =
at::prod_intlist(input_shape.cbegin(), input_shape.cbegin() + axis);
const int64_t N =
at::prod_intlist(input_shape.cbegin() + axis, input_shape.cend());
Tensor dX;
Tensor dgamma;
Tensor dbeta;
const Tensor X_tensor = X.reshape({M, N});
const Tensor mean_tensor = mean.reshape({M, 1});
const Tensor rstd_tensor = rstd.reshape({M, 1});
const double s = 1.0 / static_cast<double>(N);
Tensor dY_tensor;
if (dY.defined()) {
dY_tensor = dY.reshape({M, N});
}
if (grad_input_mask[0]) {
Tensor gamma_tensor;
if (isDefined(gamma)) {
gamma_tensor = gamma->reshape({1, N});
}
Tensor rstd_cube = rstd_tensor * rstd_tensor * rstd_tensor;
Tensor var;
Tensor dvar;
if (drstd.defined()) {
var = ((rstd_tensor * rstd_tensor).reciprocal_() - eps).clamp_min(0);
dvar = -0.5 * rstd_cube * drstd.view({M, 1});
}
Tensor ds;
Tensor db;
if (dY.defined()) {
ds = (isDefined(gamma) ? dY_tensor * X_tensor * gamma_tensor
: dY_tensor * X_tensor)
.sum(1)
.unsqueeze_(-1);
db = (isDefined(gamma) ? dY_tensor * gamma_tensor : dY_tensor)
.sum(1)
.unsqueeze_(-1);
const Tensor& a = rstd_tensor;
const Tensor b = (db * mean_tensor - ds) * rstd_cube * s;
const Tensor c = -b * mean_tensor - db * rstd_tensor * s;
if (isDefined(gamma)) {
dX = a * dY_tensor * gamma_tensor + b * X_tensor + c;
} else {
dX = a * dY_tensor + b * X_tensor + c;
}
if (dmean.defined() && drstd.defined()) {
dX += var_std_mean_backward(
{dvar, dmean.view({M, 1})},
X_tensor,
var,
mean_tensor,
{1},
false,
true,
false);
}
dX = dX.reshape_as(X);
} else if (dmean.defined() && drstd.defined()) {
dX = var_std_mean_backward(
{dvar, dmean.view({M, 1})},
X_tensor,
var,
mean_tensor,
{1},
false,
true,
false)
.reshape_as(X);
}
}
if (grad_input_mask[1] && dY.defined()) {
dgamma = (dY_tensor * (X_tensor - mean_tensor) * rstd_tensor)
.sum(0)
.reshape_as(toLegacyTensor(gamma));
}
if (grad_input_mask[2] && dY.defined()) {
dbeta = dY_tensor.sum(0).reshape_as(toLegacyTensor(gamma));
}
return std::make_tuple(dX, dgamma, dbeta);
}
std::tuple<Tensor, Tensor, Tensor>
infinitely_differentiable_native_group_norm_backward(
const Tensor& dY,
const Tensor& dmean,
const Tensor& drstd,
const Tensor& X,
const Tensor& mean,
const Tensor& rstd,
const c10::optional<Tensor>& gamma,
int64_t N,
int64_t C,
int64_t HxW,
int64_t group,
double eps,
std::array<bool, 3> grad_input_mask) {
const int64_t G = group;
const int64_t D = C / G;
const double s = 1.0 / static_cast<double>(D * HxW);
Tensor dX;
Tensor dgamma;
Tensor dbeta;
const Tensor X_tensor = X.reshape({N, G, D, HxW});
const Tensor mean_tensor = mean.reshape({N, G, 1, 1});
const Tensor rstd_tensor = rstd.reshape({N, G, 1, 1});
Tensor dY_tensor;
Tensor ds;
Tensor db;
if (dY.defined()) {
dY_tensor = dY.reshape({N, G, D, HxW});
ds = (dY_tensor * X_tensor).sum(3).unsqueeze_(-1);
db = dY_tensor.sum(3).unsqueeze_(-1);
}
if (grad_input_mask[0]) {
Tensor gamma_tensor;
if (isDefined(gamma)) {
gamma_tensor = gamma->reshape({1, G, D, 1});
}
const Tensor var =
((rstd_tensor * rstd_tensor).reciprocal_() - eps).clamp_min(0);
const Tensor rstd_cube = rstd_tensor * rstd_tensor * rstd_tensor;
Tensor dvar;
if (drstd.defined()) {
dvar = -0.5 * rstd_cube * drstd.view({N, G, 1, 1});
}
if (dY.defined()) {
const Tensor a =
isDefined(gamma) ? rstd_tensor * gamma_tensor : rstd_tensor;
Tensor b = (isDefined(gamma) ? (ds * gamma_tensor).sum(2) : ds.sum(2))
.unsqueeze_(-2);
Tensor c = (isDefined(gamma) ? (db * gamma_tensor).sum(2) : db.sum(2))
.unsqueeze_(-2);
b = (c * mean_tensor - b) * rstd_cube * s;
c = -b * mean_tensor - c * rstd_tensor * s;
dX = a * dY_tensor + b * X_tensor + c;
if (dmean.defined() && drstd.defined()) {
dX += var_std_mean_backward(
{dvar, dmean.view({N, G, 1, 1})},
X_tensor,
var,
mean_tensor,
{2, 3},
false,
true,
false);
}
dX = dX.reshape_as(X);
} else if (dmean.defined() && drstd.defined()) {
dX = var_std_mean_backward(
{dvar, dmean.view({N, G, 1, 1})},
X_tensor,
var,
mean_tensor,
{2, 3},
false,
true,
false)
.reshape_as(X);
}
}
if (grad_input_mask[1] && dY.defined()) {
dgamma = ((ds - db * mean_tensor) * rstd_tensor).sum(0).reshape_as(toLegacyTensor(gamma));
}
if (grad_input_mask[2] && dY.defined()) {
dbeta = db.sum(0).reshape_as(toLegacyTensor(gamma));
}
return std::make_tuple(dX, dgamma, dbeta);
}
std::tuple<Tensor, Tensor, Tensor> _trilinear_backward(const Tensor& grad_out, const Tensor& i1, const Tensor& i2, const Tensor& i3,
IntArrayRef expand1, IntArrayRef expand2, IntArrayRef expand3,
IntArrayRef sumdim, int64_t unroll_dim, std::array<bool, 3> grad_mask) {
Tensor grad_i1, grad_i2, grad_i3;
if (grad_out.defined()) {
if (grad_mask[0])
grad_i1 = at::_trilinear(grad_out, i2, i3, sumdim, expand2, expand3, expand1);
if (grad_mask[1])
grad_i2 = at::_trilinear(i1, grad_out, i3, expand1, sumdim, expand3, expand2);
if (grad_mask[2])
grad_i3 = at::_trilinear(i1, i2, grad_out, expand1, expand2, sumdim, expand3);
}
return std::tuple<Tensor, Tensor, Tensor>(grad_i1, grad_i2, grad_i3);
}
Tensor log1p_backward(const Tensor& grad, const Tensor& self) {
if (self.is_sparse()) {
AT_ERROR(
"log1p of a sparse tensor is made to be non-differentiable since ",
"local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. ",
"Use a different mathematical operation which preserves sparsity of gradients, ",
"or report a bug if you think this is an error.");
}
return grad / (self + 1).conj();
}
Tensor sparse_constructor_values_backward(const Tensor& sparse_grad_out, const Tensor& indices, IntArrayRef values_shape) {
// TODO: improve this backward by writing a kernel (maybe)
auto dense_grad = sparse_grad_out.is_sparse() ? sparse_grad_out.to_dense() : sparse_grad_out;
auto full_size = sparse_grad_out.sizes();
auto flattened_grad_shape = values_shape.vec();
flattened_grad_shape[0] = at::prod_intlist(full_size.slice(0, indices.size(0)));
auto flattened_dense_grad = dense_grad.view(flattened_grad_shape);
auto flattened_indices = at::sparse::flatten_indices(indices, full_size);
return flattened_dense_grad.index_select(0, flattened_indices);
}
// Because the backward of pad(input, pads) is just pad(grad_output, [-p for p in pads])
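// e.g. (illustrative): pad = [1, 2] on the last dimension => the backward is
// constant_pad_nd(grad, [-1, -2]), which trims 1 element at the front and 2 at
// the back of that dimension.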
Tensor constant_pad_nd_backward(const Tensor& grad, IntArrayRef pad) {
auto negated_pad = pad.vec();
std::transform(negated_pad.cbegin(), negated_pad.cend(), negated_pad.begin(), std::negate<int64_t>());
return at::constant_pad_nd(grad, negated_pad, 0);
}
Tensor embedding_dense_double_backward(const Tensor & grad, const Tensor & indices, int64_t padding_idx) {
// since first backward takes care of scaling by frequency,
// we don't need to worry about it here.
auto gg_weight = grad.index_select(0, indices.reshape(-1));
// reshape gradient as per the shape of indices
auto size = indices.sizes().vec();
size.push_back(-1);
if (padding_idx >= 0) {
gg_weight.masked_fill_((indices == padding_idx).reshape({-1, 1}), 0);
}
return gg_weight.view(size);
}
Tensor index_backward(Tensor zeros_like_self, const torch::List<c10::optional<Tensor>>& indices, const Tensor& grad) {
return at::_index_put_impl_(zeros_like_self, indices, grad, true, true);
}
Tensor _cudnn_ctc_loss_backward(const Tensor& grad_out, const Tensor& loss, const Tensor& raw_grad, bool zero_infinity) {
if (zero_infinity) {
return at::where(
loss.unsqueeze(0).unsqueeze(2) == 0,
at::zeros({0}, raw_grad.options()),
raw_grad * grad_out.unsqueeze(0).unsqueeze(2));
} else {
return raw_grad * grad_out.unsqueeze(0).unsqueeze(2);
}
}
bool any_variable_defined(variable_list& variables) {
for (const auto& variable : variables) {
if (variable.defined()) {
return true;
}
}
return false;
}
} // namespace details
} // namespace generated
} // namespace autograd
} // namespace torch