#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/core/grad_mode.h>
#include <ATen/ExpandUtils.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/TensorOperators.h>
#include <ATen/native/Distance.h>
#include <c10/util/accumulate.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_cdist_backward_native.h>
#include <ATen/ops/_cdist_forward.h>
#include <ATen/ops/_cdist_forward_native.h>
#include <ATen/ops/_euclidean_dist.h>
#include <ATen/ops/_euclidean_dist_native.h>
#include <ATen/ops/_pdist_backward_native.h>
#include <ATen/ops/_pdist_forward.h>
#include <ATen/ops/_pdist_forward_native.h>
#include <ATen/ops/cat.h>
#include <ATen/ops/cdist_native.h>
#include <ATen/ops/cosine_similarity_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#include <ATen/ops/linalg_vector_norm.h>
#include <ATen/ops/norm.h>
#include <ATen/ops/ones_like.h>
#include <ATen/ops/pairwise_distance_native.h>
#include <ATen/ops/pdist_native.h>
#include <ATen/ops/pow.h>
#include <ATen/ops/result_type.h>
#include <ATen/ops/sum.h>
#include <ATen/ops/zeros.h>
#include <ATen/ops/zeros_like.h>
#include <utility>
#endif
namespace at::native {
DEFINE_DISPATCH(pdist_forward_stub);
DEFINE_DISPATCH(pdist_backward_stub);
DEFINE_DISPATCH(cdist_stub);
DEFINE_DISPATCH(cdist_backward_stub);
Tensor pairwise_distance(const Tensor& x1, const Tensor& x2, double p, double eps, bool keepdim) {
// Since either x1 or x2 could be broadcast, take the larger of the two ranks for the output
auto x1_dim = x1.dim();
auto x2_dim = x2.dim();
auto output_dim = x1_dim > x2_dim ? x1_dim : x2_dim;
auto innermost_dim = output_dim - 1;
return at::norm(x1 - x2 + eps, p, innermost_dim, keepdim);
}
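// Illustrative usage (a sketch, assuming the generated at::pairwise_distance
// binding; not compiled here). With p = 2 this yields the per-row Euclidean
// distance; eps keeps the norm subgradient finite when a row of x1 equals x2.
//
//   at::Tensor a = at::randn({128, 3});
//   at::Tensor b = at::randn({128, 3});
//   at::Tensor d = at::pairwise_distance(a, b, /*p=*/2.0, /*eps=*/1e-6,
//                                        /*keepdim=*/false);  // shape [128]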
// This is to guarantee that contiguous memory is passed to the backward pass
Tensor pdist(const Tensor& self, const double p) {
TORCH_CHECK(self.dim() == 2,
"pdist only supports 2D tensors, got: ", self.dim(), "D");
TORCH_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes");
TORCH_CHECK(p >= 0, "pdist only supports non-negative p values");
return at::_pdist_forward(self.contiguous(), p);
}
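// Illustrative usage (a sketch, assuming the generated at::pdist binding; not
// compiled here). pdist returns the condensed distance vector over all row
// pairs of a 2D input:
//
//   at::Tensor x = at::randn({5, 4});
//   at::Tensor d = at::pdist(x, /*p=*/2.0);  // shape [5 * 4 / 2] = [10]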
Tensor _euclidean_dist(const Tensor& x1, const Tensor& x2) {
/** This function does the first part of the Euclidean distance calculation.
* We split it into two steps to simplify dealing with subgradients in the
* backward step */
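/* The padding and concatenation below encode the identity
*   ||a - b||^2 = <a, a> - 2 <a, b> + <b, b>
* as a single matmul: with
*   x1_ = [-2 * x1 | ||x1||^2 | 1]  and  x2_ = [x2 | 1 | ||x2||^2],
* the product x1_ x2_^T equals -2 x1 x2^T + ||x1||^2 + ||x2||^2, i.e. the
* matrix of squared distances. clamp_min_(0) guards against small negative
* values from round-off before the in-place sqrt_. */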
Tensor x1_norm = x1.pow(2).sum(-1, true);
Tensor x1_pad = at::ones_like(x1_norm, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
Tensor x2_norm = x2.pow(2).sum(-1, true);
Tensor x2_pad = at::ones_like(x2_norm, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
Tensor x1_ = at::cat({x1.mul(-2), std::move(x1_norm), std::move(x1_pad)}, -1);
Tensor x2_ = at::cat({x2, std::move(x2_pad), std::move(x2_norm)}, -1);
Tensor result = x1_.matmul(x2_.mT());
result.clamp_min_(0).sqrt_();
return result;
}
static Tensor cdist_impl(const Tensor& x1, const Tensor& x2, const double p, c10::optional<int64_t> compute_mode) {
TORCH_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X1 got: ", x1.scalar_type());
auto device1 = x1.device().type();
TORCH_CHECK(at::isFloatingType(x2.scalar_type()), "cdist only supports floating-point dtypes, X2 got: ", x2.scalar_type());
auto device2 = x2.device().type();
TORCH_CHECK(p >= 0, "cdist only supports non-negative p values");
TORCH_CHECK(device1 == device2, "X1 and X2 must have the same device type. X1: ", device1, " X2: ", device2);
// TODO: This is bad; this test should apply universally
TORCH_CHECK(!x1.is_cuda() || x1.get_device() == x2.get_device(), "device of X1 (", x1.get_device(), ") must match device of X2 (", x2.get_device(), ")");
SymInt c1 = x1.sym_size(-1);
SymInt c2 = x2.sym_size(-1);
// 0 - default value. If p = 2 and either r1 > 25 or r2 > 25 (thresholds chosen from
// performance measurements), distances are computed via the matrix multiplication approach.
// 1 - always use the matrix multiplication approach for p = 2
// 2 - never use the matrix multiplication approach for p = 2
int64_t mode = compute_mode.value_or(0);
TORCH_CHECK(mode >= 0 && mode <= 2, "possible modes: 0, 1, 2, but was: ", mode);
SymInt r1 = x1.sym_size(-2);
SymInt r2 = x2.sym_size(-2);
// See Note [cdist relies on cdist_impl redispatching]
// Keep this condition in sync with the condition at the Note
if (!(p == 2 && (mode == 1 || (mode == 0 && (r1 > 25 || r2 > 25))))) {
TORCH_CHECK(device1 == kCPU || device1 == kCUDA, "cdist only supports CPU and CUDA devices, X1 got: ", device1);
TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "cdist only supports CPU and CUDA devices, X2 got: ", device2);
}
auto dim1 = x1.dim();
auto dim2 = x2.dim();
// For batch calculation we flatten all batch dimensions (everything except the last two)
// into a single dimension whose size equals their product; the last two dimensions stay the same.
SymIntArrayRef batch_tensor1(x1.sym_sizes().data(), dim1 - 2);
SymIntArrayRef batch_tensor2(x2.sym_sizes().data(), dim2 - 2);
std::vector<SymInt> expand_batch_portion = infer_size_symint(batch_tensor1, batch_tensor2);
std::vector<SymInt> tensor1_expand_size(expand_batch_portion);
tensor1_expand_size.insert(tensor1_expand_size.end(), {r1, c1});
std::vector<SymInt> tensor2_expand_size(expand_batch_portion);
tensor2_expand_size.insert(tensor2_expand_size.end(), {r2, c2});
const SymInt expand_batch_product = c10::multiply_integers(expand_batch_portion);
std::vector<SymInt> tensor1_view{expand_batch_product, r1, c1};
std::vector<SymInt> tensor2_view{expand_batch_product, r2, c2};
Tensor tensor1_expanded = x1.expand_symint(tensor1_expand_size).contiguous().view_symint(tensor1_view);
Tensor tensor2_expanded = x2.expand_symint(tensor2_expand_size).contiguous().view_symint(tensor2_view);
std::vector<SymInt> output_shape(std::move(expand_batch_portion));
output_shape.insert(output_shape.end(), {r1, r2});
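// Worked shape example (illustrative): x1 of size [2, 1, 5, 3] and x2 of size
// [4, 7, 3] broadcast to a batch of [2, 4]; the flattened views become
// [8, 5, 3] and [8, 7, 3], and output_shape is [2, 4, 5, 7].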
Tensor result;
if (r1 == 0 || r2 == 0 || expand_batch_product == 0) {
result = at::empty_symint(output_shape, x1.options());
} else if (c1 == 0) {
result = at::zeros_symint(output_shape, x1.options());
} else if (p == 2 && (mode == 1 || (mode == 0 && (r1 > 25 || r2 > 25)))) {
// See Note [cdist relies on cdist_impl redispatching]
// Keep the condition above in sync with the condition at the Note
Tensor dist = (expand_batch_product == 1) ? at::_euclidean_dist(x1, x2) :
at::_euclidean_dist(tensor1_expanded, tensor2_expanded);
result = dist.view_symint(output_shape);
} else {
result = at::empty_symint(output_shape, x1.options());
cdist_stub(device1, result, tensor1_expanded, tensor2_expanded, p);
}
return result;
}
Tensor cdist(const Tensor& x1, const Tensor& x2, const double p, c10::optional<int64_t> compute_mode) {
TORCH_CHECK(x1.dim() >= 2, "cdist only supports at least 2D tensors, X1 got: ", x1.dim(), "D");
TORCH_CHECK(x2.dim() >= 2, "cdist only supports at least 2D tensors, X2 got: ", x2.dim(), "D");
TORCH_CHECK(x1.sym_size(-1) == x2.sym_size(-1), "X1 and X2 must have the same number of columns. X1: ", x1.sym_size(-1), " X2: ", x2.sym_size(-1));
auto maybe_outnames = namedinference::compute_cdist_outnames(x1, x2);
auto result = [&]() {
NoNamesGuard guard;
SymInt r1 = x1.sym_size(-2);
SymInt r2 = x2.sym_size(-2);
// Special case for empty input: always call the version with explicit autograd to ensure the graph is properly connected
if (x1.sym_numel() == 0 || x2.sym_numel() == 0) {
return at::_cdist_forward(x1, x2, p, compute_mode);
}
int64_t mode = compute_mode.value_or(0);
// Note [cdist relies on cdist_impl redispatching]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// This lets PyTorch derive the backward pass itself when p = 2.
// Keep this condition in sync with the conditions that reference this Note.
if (p == 2 && (mode == 1 || (mode == 0 && (r1 > 25 || r2 > 25)))) {
return cdist_impl(x1, x2, p, compute_mode);
} else {
return at::_cdist_forward(x1, x2, p, compute_mode);
}
}();
namedinference::propagate_names_if_nonempty(result, maybe_outnames);
return result;
}
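// Illustrative usage (a sketch, assuming the generated at::cdist binding; not
// compiled here). Since r1 = 50 > 25, the default compute_mode takes the
// matrix-multiplication (_euclidean_dist) path for p = 2; passing
// compute_mode = 2 would force the direct kernel instead.
//
//   at::Tensor a = at::randn({10, 50, 3});
//   at::Tensor b = at::randn({10, 40, 3});
//   at::Tensor d = at::cdist(a, b, /*p=*/2.0);  // shape [10, 50, 40]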
Tensor _cdist_forward(const Tensor& x1, const Tensor& x2, const double p, c10::optional<int64_t> compute_mode) {
TORCH_CHECK(x1.dim() >= 2, "cdist only supports at least 2D tensors, X1 got: ", x1.dim(), "D");
TORCH_CHECK(x2.dim() >= 2, "cdist only supports at least 2D tensors, X2 got: ", x2.dim(), "D");
TORCH_CHECK(x1.size(-1) == x2.size(-1), "X1 and X2 must have the same number of columns. X1: ", x1.size(-1), " X2: ", x2.size(-1));
auto maybe_outnames = namedinference::compute_cdist_outnames(x1, x2);
auto result = [&]() {
NoNamesGuard guard;
return cdist_impl(x1, x2, p, compute_mode);
}();
namedinference::propagate_names_if_nonempty(result, maybe_outnames);
return result;
}
Tensor _cdist_backward(const Tensor& _grad, const Tensor& _x1, const Tensor& _x2, const double p, const Tensor& _cdist) {
// Broadcasting might generate non-contiguous Tensors, so handle it before doing checks
int64_t c1 = _x1.size(-1);
int64_t c2 = _x2.size(-1);
int64_t r1 = _x1.size(-2);
int64_t r2 = _x2.size(-2);
auto dim1 = _x1.dim();
auto dim2 = _x2.dim();
IntArrayRef batch_tensor1(_x1.sizes().data(), dim1 - 2);
IntArrayRef batch_tensor2(_x2.sizes().data(), dim2 - 2);
std::vector<int64_t> expand_batch_portion = infer_size(batch_tensor1, batch_tensor2);
std::vector<int64_t> tensor1_expand_size(expand_batch_portion);
tensor1_expand_size.insert(tensor1_expand_size.end(), {r1, c1});
std::vector<int64_t> tensor2_expand_size(expand_batch_portion);
tensor2_expand_size.insert(tensor2_expand_size.end(), {r2, c2});
// Compute the linearized batch size
const int64_t batch_product = c10::multiply_integers(expand_batch_portion);
// Gracefully handle empty Tensors
if (r1 == 0 || r2 == 0 || c1 == 0 || batch_product == 0) {
return at::zeros_like(_x1, _x1.options());
}
Tensor x1 = _x1;
if (tensor1_expand_size != x1.sizes()) {
x1 = x1.expand(tensor1_expand_size);
}
Tensor x2 = _x2;
if (tensor2_expand_size != x2.sizes()) {
x2 = x2.expand(tensor2_expand_size);
}
x1 = x1.contiguous();
x2 = x2.contiguous();
auto cdist = _cdist.contiguous();
auto grad = _grad.contiguous();
int64_t n = x1.size(-2);
int64_t m = x1.size(-1);
auto device1 = x1.device().type();
TORCH_CHECK(device1 == kCPU || device1 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X1 got: ", device1);
auto device2 = x2.device().type();
TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X2 got: ", device2);
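// Shape note (illustrative): x1 and x2 have been expanded to the broadcast
// batch, so grad and cdist carry shape [*batch, n, r2]; since everything is
// contiguous, the kernel can treat them as [batch_product, n, r2]. grad_x1 is
// allocated in that flattened form and viewed back to x1.sizes() below.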
Tensor grad_x1 =
at::empty({batch_product, n, m}, x1.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT);
cdist_backward_stub(device1, grad_x1, grad, x1, x2, p, cdist);
// Use x1.sizes() here, and not the original _x1.sizes(), because this gradient does not take broadcasting into account.
// Broadcasting will be handled automatically by the autograd engine.
return grad_x1.view(x1.sizes());
}
Tensor _pdist_forward(const Tensor& self, const double p) {
TORCH_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input");
auto device = self.device().type();
TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device);
Tensor result = at::empty({0}, self.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT);
if (self.size(0) <= 1) {
result.resize_({0});
} else {
int64_t n = self.size(0);
int64_t c = n * (n - 1) / 2;
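// (Condensed layout: one entry per unordered pair (i, j) with i < j, in
//  row-major upper-triangular order; e.g. n = 4 gives the 6 entries
//  (0,1), (0,2), (0,3), (1,2), (1,3), (2,3).)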
result.resize_({c});
if (self.size(1) == 0) {
result.fill_(0);
} else {
pdist_forward_stub(device, result, self, p);
}
}
return result;
}
Tensor _pdist_backward(const Tensor& grad, const Tensor& self, const double p, const Tensor& pdist) {
TORCH_CHECK(self.is_contiguous(), "_pdist_backward requires self to be contiguous");
TORCH_CHECK(pdist.is_contiguous(), "_pdist_backward requires pdist to be contiguous");
auto device = self.device().type();
TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_backward only supports CPU and CUDA devices, got: ", device);
Tensor result = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
pdist_backward_stub(device, result, grad, self, p, pdist);
return result;
}
Tensor cosine_similarity(const Tensor& x1_, const Tensor& x2_, int64_t dim, double eps) {
/*
* cosine_similarity(x1, x2) = <x1, x2> / (||x1|| * ||x2||)
*
* The current implementation is an improvement over the previous version.
*
* Previous implementation:
* 1. Compute num = <x1, x2>,
* 2. Compute denom = ||x1|| * ||x2||,
* 3. Compute denom = max(denom, eps) to avoid division by zero,
* 4. Return num / denom.
*
* The previous implementation had the following issues:
* 1. Possible loss of precision in <x1, x2> when ||x1|| and ||x2|| are large.
* 2. Possible loss of precision in ||x1|| * ||x2|| when ||x1|| and ||x2|| are large.
* 3. The loss of precision may cause |cosine_similarity(x1, x2)| > 1.0.
*
* Current implementation:
* 1. Compute x1_normalized = x1 / max(||x1||, eps),
* x2_normalized = x2 / max(||x2||, eps),
* 2. Return <x1_normalized, x2_normalized>.
*
* The current implementation improves over the previous one by:
* 1. Ensuring that <x1, x2> and ||x1|| * ||x2|| are never computed explicitly,
* hence avoiding floating point overflows.
* 2. Both methods may lose precision when computing ||x1|| and ||x2||, but for
* the current method this is the only source of floating point imprecision.
* 3. Guaranteeing |cosine_similarity(x1, x2)| <= 1.0.
*
*/
auto commonDtype = at::result_type(x1_, x2_);
TORCH_CHECK(at::isFloatingType(commonDtype), "expected common dtype to be floating point, yet common dtype is ", commonDtype);
// We accept integral types (and even bool), but vector_norm does not
auto x1_is_int = c10::isIntegralType(x1_.scalar_type(), /*includeBool=*/true);
auto x2_is_int = c10::isIntegralType(x2_.scalar_type(), /*includeBool=*/true);
auto x1_t = x1_is_int ? x1_.to(commonDtype) : x1_;
auto x2_t = x2_is_int ? x2_.to(commonDtype) : x2_;
auto [x1, x2] = expand_outplace(x1_t, x2_t);
// We want to divide each tensor by its norm first, as it's more numerically stable.
// This keeps the result between -1.0 and 1.0
// We clone them, as we're going to modify them in-place
// This allows the gradients to propagate properly all the way to x1 and x2
auto x1_norm = at::linalg_vector_norm(*x1, 2, /*dim=*/dim, /*keepdim=*/true).clone();
auto x2_norm = at::linalg_vector_norm(*x2, 2, /*dim=*/dim, /*keepdim=*/true).clone();
{
at::NoGradGuard guard;
x1_norm.clamp_min_(eps);
x2_norm.clamp_min_(eps);
}
return ((*x1 / x1_norm) * (*x2 / x2_norm)).sum(dim);
}
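// Illustrative usage (a sketch, assuming the generated at::cosine_similarity
// binding; not compiled here). Each row is normalized before the dot product,
// so the result stays within [-1, 1]:
//
//   at::Tensor a = at::randn({32, 128});
//   at::Tensor b = at::randn({32, 128});
//   at::Tensor s = at::cosine_similarity(a, b, /*dim=*/1, /*eps=*/1e-8);  // shape [32]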
} // namespace at::native