#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/ExpandUtils.h>
#include <ATen/ExpandBase.h>
#include <c10/util/irange.h>
namespace at {
namespace internal {
TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size) {
return OptionalTensorRef(self)->expand(size);
}
} // namespace internal
namespace {
// NOTE: are_expandable does a similar check; please keep the two in sync if a change is needed
template <typename Container, typename ArrayType>
Container infer_size_impl(ArrayType a, ArrayType b) {
// Use ptrdiff_t to ensure signed comparison.
auto dimsA = static_cast<ptrdiff_t>(a.size());
auto dimsB = static_cast<ptrdiff_t>(b.size());
auto ndim = dimsA > dimsB ? dimsA : dimsB;
Container expandedSizes(ndim);
for (ptrdiff_t i = ndim - 1; i >= 0; --i) {
ptrdiff_t offset = ndim - 1 - i;
ptrdiff_t dimA = dimsA - 1 - offset;
ptrdiff_t dimB = dimsB - 1 - offset;
auto sizeA = (dimA >= 0) ? a[dimA] : 1;
auto sizeB = (dimB >= 0) ? b[dimB] : 1;
TORCH_CHECK(
sizeA == sizeB || sizeA == 1 || sizeB == 1,
"The size of tensor a (", sizeA,
") must match the size of tensor b (", sizeB,
") at non-singleton dimension ", i);
// 1s map to the other size (even 0).
expandedSizes[i] = sizeA == 1 ? std::move(sizeB) : std::move(sizeA);
}
return expandedSizes;
}
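// Worked example (illustrative only): with a = (2, 1, 4) and b = (3, 1), trailing dimensions
// are aligned, missing leading dimensions are treated as 1, and every size-1 dimension
// broadcasts to the other operand's size, so expandedSizes = (2, 3, 4). A pair such as
// a = (2,) and b = (3,) would instead trip the TORCH_CHECK above.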
} // namespace
std::vector<int64_t> infer_size(IntArrayRef a, IntArrayRef b) {
return infer_size_impl<std::vector<int64_t>>(a, b);
}
std::vector<SymInt> infer_size_symint(SymIntArrayRef a, SymIntArrayRef b) {
return infer_size_impl<std::vector<SymInt>>(a, b);
}
DimVector infer_size_dimvector(IntArrayRef a, IntArrayRef b) {
return infer_size_impl<DimVector, IntArrayRef>(a, b);
}
SymDimVector infer_size_symdimvector(SymIntArrayRef a, SymIntArrayRef b) {
return infer_size_impl<SymDimVector, SymIntArrayRef>(a, b);
}
template<typename Container>
C10_ALWAYS_INLINE InferExpandGeometryResult<Container> inferExpandGeometryImpl(
IntArrayRef tensor_sizes,
IntArrayRef tensor_strides,
IntArrayRef sizes) {
int64_t ndim = static_cast<int64_t>(sizes.size());
int64_t tensor_dim = static_cast<int64_t>(tensor_sizes.size());
if (tensor_dim == 0) {
return InferExpandGeometryResult<Container>(sizes, ndim);
}
InferExpandGeometryResult<Container> result(ndim);
auto& expandedSizes = result.sizes;
auto& expandedStrides = result.strides;
// create a new geometry for the tensors
for (int64_t i = ndim - 1; i >= 0; --i) {
int64_t offset = ndim - 1 - i;
int64_t dim = tensor_dim - 1 - offset;
int64_t size = (dim >= 0) ? tensor_sizes[dim] : 1;
int64_t stride = (dim >= 0) ? tensor_strides[dim]
: expandedSizes[i + 1] * expandedStrides[i + 1];
int64_t targetSize = sizes[i];
if (targetSize == -1) {
TORCH_CHECK(
dim >= 0,
"The expanded size of the tensor (",
targetSize,
") isn't allowed in a leading, non-existing dimension ",
i);
targetSize = size;
}
if (size != targetSize) {
TORCH_CHECK(
size == 1,
"The expanded size of the tensor (",
targetSize,
") must match the existing size (",
size,
") at non-singleton dimension ",
i,
". Target sizes: ",
sizes,
". Tensor sizes: ",
tensor_sizes);
size = targetSize;
stride = 0;
}
expandedSizes[i] = size;
expandedStrides[i] = stride;
}
return result;
}
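// Worked example (illustrative only): expanding a tensor with sizes (1, 3) and strides (3, 1)
// to target sizes (2, 3) yields expandedSizes = (2, 3) and expandedStrides = (0, 1): the
// size-1 dimension is broadcast by giving it stride 0, while the matching dimension keeps its
// original stride. A target size of -1 keeps the existing size, and is rejected for leading
// dimensions the tensor does not have.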
std::tuple<std::vector<int64_t>, std::vector<int64_t>> inferExpandGeometry(
IntArrayRef tensor_sizes,
IntArrayRef tensor_strides,
IntArrayRef sizes) {
auto result = inferExpandGeometryImpl<std::vector<int64_t>>(
tensor_sizes, tensor_strides, sizes);
return std::make_tuple(std::move(result.sizes), std::move(result.strides));
}
InferExpandGeometryResult<DimVector> inferExpandGeometry_dimvector(
IntArrayRef tensor_sizes,
IntArrayRef tensor_strides,
IntArrayRef sizes) {
return inferExpandGeometryImpl<DimVector>(
tensor_sizes, tensor_strides, sizes);
}
// This function returns dense, non-overlapping strides that keep the same layout permutation
// as the input `tensor_strides`, computed based on the input `tensor_sizes`.
// Note:
// 1. This function expects the inputs `tensor_strides` and `tensor_sizes` to be non-dense or overlapping.
// If the inputs are already dense and non-overlapping, the output strides will be the same as `tensor_strides`.
// However, this function won't check whether the inputs are dense or overlapping, so the whole function will
// still be executed even if the inputs are already dense and non-overlapping, which causes slowness.
//
// Please verify whether the inputs are non-dense or overlapping before calling this function if possible;
// if the inputs come from a tensor, you can check this through `is_non_overlapping_and_dense()`.
//
// 2. The stride propagation rule used in this function is exactly the same as what is used in
// TensorIterator. Please refer to https://github.com/pytorch/pytorch/pull/42922 for more details
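// Illustrative example (assumed input, not taken from a test): sizes (2, 3, 4, 5) with
// strides (120, 1, 30, 3) describe a non-dense, channels-last-like layout (e.g. obtained by
// slicing a wider channels-last tensor); infer_dense_strides({2, 3, 4, 5}, {120, 1, 30, 3})
// returns (60, 1, 15, 3), the dense strides for the same layout permutation.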
std::vector<int64_t> infer_dense_strides(IntArrayRef tensor_sizes, IntArrayRef tensor_strides) {
TORCH_CHECK(tensor_sizes.size() == tensor_strides.size(),
"Input sizes and strides should have same size but got ", tensor_sizes.size(), " and ", tensor_strides.size());
size_t ndim = tensor_sizes.size();
if (ndim == 0) {
return {};
}
if (ndim == 1) {
return {1};
}
std::vector<int64_t> perm(ndim);
// initialize perm with n-1, n-2, ..., 1, 0
std::iota(perm.rbegin(), perm.rend(), 0);
// The following sorting algorithm has exactly the same behavior as TensorIterator
// This is to make sure we have the same stride propagation everywhere.
// return -1 if dim0 should come before dim1
// return 1 if dim0 should come after dim1
// return 0 if comparison is ambiguous
auto should_swap = [&](size_t dim0, size_t dim1) {
int64_t stride0 = tensor_strides[dim0];
int64_t stride1 = tensor_strides[dim1];
// if either stride is 0, treat the comparison as ambiguous to
// keep the same behavior as TensorIterator
if (stride0 == 0 || stride1 == 0) {
return 0;
}
if (stride0 < stride1) {
return -1;
}
if (stride0 > stride1) {
return 1;
}
// for equal strides, the dimension with the smaller size comes first
if (tensor_sizes[dim0] > tensor_sizes[dim1]) {
return 1;
}
return 0;
};
// Insertion-sort (stable) the indices in `perm` based on the input tensor's strides and shape;
// dimensions with stride 0 won't move. This is the same behavior as TensorIterator.
// e.g. given a tensor with size/stride (6, 5, 4, 3, 2)/(6, 0, 120, 0, 1), the initial `perm`
// is (4, 3, 2, 1, 0) and the sorted `perm` will be (4, 3, 0, 1, 2).
for (const auto i : c10::irange(1, ndim)) {
auto dim1 = i;
for (const auto j : c10::irange(1, i + 1)) {
auto dim0 = i - j;
int comparison = should_swap(perm[dim0], perm[dim1]);
if (comparison > 0) {
std::swap(perm[dim0], perm[dim1]);
dim1 = dim0;
}
else if (comparison < 0) {
break;
}
}
}
// compute output strides which preserve the input tensor's memory layout
std::vector<int64_t> out_strides(ndim);
int64_t curr_stride = 1;
for (const auto i : c10::irange(ndim)) {
int64_t idx = perm[i];
out_strides[idx] = curr_stride;
// Note: for a size of 0 we simply treat it as 1; it really doesn't matter here
// since the total number of elements is 0.
if (tensor_sizes[idx] > 1) {
curr_stride *= tensor_sizes[idx];
}
}
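// Continuing the example above (illustrative): with size/stride (6, 5, 4, 3, 2)/(6, 0, 120, 0, 1)
// and sorted `perm` (4, 3, 0, 1, 2), this loop assigns strides 1, 2, 6, 36, 180 in that order,
// so out_strides becomes (6, 36, 180, 2, 1).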
return out_strides;
}
} // namespace at