#pragma once

#include <limits>
#include <numeric>
#include <ATen/core/Tensor.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/NonEmptyUtils.h>
#include <ATen/WrapDimUtilsMulti.h>
#include <c10/core/ScalarType.h>
#include <c10/util/irange.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/scalar_tensor.h>
#endif

namespace at::native {

// Maximum and minimum possible scalar values, including infinities
template <typename scalar_t>
constexpr scalar_t upper_bound() {
  using lim = std::numeric_limits<scalar_t>;
  return lim::has_infinity ? lim::infinity() : lim::max();
}

template <typename scalar_t>
constexpr scalar_t lower_bound() {
  using lim = std::numeric_limits<scalar_t>;
  return lim::has_infinity ? -lim::infinity() : lim::lowest();
}
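
// For example, upper_bound<float>() is +infinity while upper_bound<int64_t>()
// is std::numeric_limits<int64_t>::max(); reduction kernels typically use
// these as initial accumulator values for min/max-style reductions.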

static inline Tensor restride_dim(
  const Tensor& src, int64_t dim,
  IntArrayRef replacement_shape
) {
  auto strides = ensure_nonempty_vec(src.strides().vec());
  strides[dim] = 0;
  return src.as_strided(replacement_shape, strides);
}

inline void _dimreduce_setup(const Tensor &result, const Tensor &self,
                             int64_t dim) {
  IntArrayRef self_sizes = self.sizes();
  std::vector<int64_t> result_sizes;
  result_sizes.insert(result_sizes.end(), self_sizes.begin(), self_sizes.end());
  result_sizes[dim] = 1;
  result.resize_(result_sizes);
}

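// Handles the trivial cases of a dim-reduction so callers can skip the real
// kernel: a 0-dim input is copied straight into `result`, and an empty input
// is filled with the reduction identity `ident` (squeezing the reduced dim
// unless `keepdim` is set). Returns true iff the reduction was handled here.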
inline bool _dimreduce_return_trivial(const Tensor &result, const Tensor &self,
                                      const Scalar& ident, int64_t dim, bool keepdim) {
  if (self.numel() == 1 && self.ndimension() == 0) {
    result.resize_({});
    result.fill_(self);
    return true;
  }
  // Return identity
  if (self.numel() == 0) {
    _dimreduce_setup(result, self, dim);
    result.fill_(ident);
    if (!keepdim) result.squeeze_(dim);
    return true;
  }
  return false;
}

inline bool _dimreduce_return_trivial_no_ident(Tensor &result, const Tensor &self,
                                               int64_t /*dim*/, bool /*keepdim*/, const char* /*fn_name*/) {
  if (self.numel() == 1 && self.ndimension() == 0) {
    result.resize_({});
    result.fill_(self);
    return true;
  }

  return false;
}

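// All-reduce analogue of the above: for an empty input, returns a 0-dim tensor
// holding the identity `ident`; otherwise returns nullopt so the caller runs
// the actual reduction.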
inline c10::optional<Tensor> _allreduce_return_trivial(
    const Tensor& self,
    const Scalar& ident) {
  // Return identity
  if (self.numel() == 0) {
    return at::scalar_tensor(ident, self.options());
  }
  return c10::nullopt;
}

#define OPTION_TYPE_EQUALITY_CHECK(option, out, self) \
{ \
  TORCH_CHECK(\
    out.option() == self.option(),\
    "expected ", #option, " ",\
    self.option(),\
    " but found ", out.option())\
}

static inline void check_scalar_type_device_layout_equal(const Tensor& out, const Tensor& self) {
  OPTION_TYPE_EQUALITY_CHECK(scalar_type, out, self);
  OPTION_TYPE_EQUALITY_CHECK(device, out.options(), self.options());
  OPTION_TYPE_EQUALITY_CHECK(layout, out.options(), self.options());
}

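// Promotes integral (including bool) inputs to int64 unless an explicit dtype
// is given, matching the default accumulation type of sum/prod-style
// reductions.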
static inline Tensor integer_upcast(const Tensor& self, c10::optional<ScalarType> dtype) {
  ScalarType scalarType = self.scalar_type();
  TORCH_CHECK(!isBarebonesUnsignedType(scalarType), "integer upcasting for uint16, uint32 and uint64 is not currently implemented");
  ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType, /*includeBool=*/true) ? ScalarType::Long : scalarType);
  return self.toType(upcast_scalarType);
}

using DimMask = TensorIterator::DimMask;

static DimVector make_dim_vector(OptionalIntArrayRef opt_dims, int64_t ndim) {
  if (opt_dims.has_value()) {
    return DimVector(opt_dims.value());
  } else {
    std::vector<int64_t> all_dims(ndim);
    std::iota(all_dims.begin(), all_dims.end(), 0);
    return DimVector(all_dims);
  }
}

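// Builds a bitset marking the dimensions to reduce over. An absent dim list
// (and, unless allow_empty_dims is set, an empty one) selects every dimension,
// i.e. an all-reduce.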
static DimMask make_dim_mask(OptionalIntArrayRef opt_dims, int64_t ndim, bool allow_empty_dims=false) {
  DimMask mask;
  if (opt_dims.has_value()) {
    auto dims = opt_dims.value();
    if (dims.empty() && !allow_empty_dims) {
      mask = DimMask().flip();
    } else {
      mask = at::dim_list_to_bitset(dims, ndim);
    }
  } else {
    mask = DimMask().flip();
  }
  return mask;
}

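// Computes the output shape of a reduction: masked dims become size 1 when
// keepdim is true and are dropped otherwise. E.g. reducing dim 1 of a
// {2, 3, 4} input yields {2, 1, 4} with keepdim and {2, 4} without.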
inline DimVector shape_from_dim_mask(const Tensor& self, DimMask mask, bool keepdim) {
  auto shape = DimVector(self.sizes());
  for (int dim = shape.size() - 1; dim >= 0; dim--) {
    if (mask[dim]) {
      if (keepdim) {
        shape[dim] = 1;
      } else {
        shape.erase(shape.begin() + dim);
      }
    }
  }
  return shape;
}

static void resize_reduction_result(
    Tensor& result, const Tensor& self, DimMask mask, bool keepdim,
    ScalarType /*dtype*/)
{
  auto shape = shape_from_dim_mask(self, mask, keepdim);
  TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. You likely tried to call an operator with an out argument but the out argument was an undefined tensor.");
  at::native::resize_output(result, shape);
}

inline Tensor create_reduction_result(
  const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, ScalarType dtype
) {
  DimMask mask = make_dim_mask(dim, self.dim());
  auto shape = shape_from_dim_mask(self, mask, keepdim);
  return at::empty(shape, self.options().dtype(dtype));
}

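// Undoes keepdim=false for iteration purposes: re-inserts each reduced dim into
// `result` as a size-1, stride-0 dimension so that TensorIterator::reduce_op
// can broadcast the output against the full-rank input.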
static Tensor review_reduce_result(const Tensor& result, int ndim, DimMask mask, bool keepdim) {
  if (keepdim) {
    return result;
  }
  auto shape = DimVector(result.sizes());
  auto stride = DimVector(result.strides());
  for (const auto dim : c10::irange(ndim)) {
    if (mask[dim]) {
      shape.insert(shape.begin() + dim, 1);
      stride.insert(stride.begin() + dim, 0);
    }
  }
  return result.as_strided(shape, stride);
}

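// Builds a TensorIterator for a single-output reduction: validates the result
// dtype, resizes `result` to the reduced shape, and views it back at full rank
// for reduce_op. Illustrative usage from an out-variant op (a sketch only;
// `my_sum_out` and `my_sum_stub` are hypothetical, not actual ATen APIs):
//
//   Tensor& my_sum_out(const Tensor& self, OptionalIntArrayRef dim,
//                      bool keepdim, c10::optional<ScalarType> opt_dtype,
//                      Tensor& result) {
//     ScalarType dtype = get_dtype_from_result(result, opt_dtype);
//     auto iter = make_reduction("my_sum", result, self, dim, keepdim, dtype);
//     if (iter.numel() == 0) {
//       result.zero_();
//     } else {
//       my_sum_stub(iter.device_type(), iter);  // hypothetical DispatchStub
//     }
//     return result;
//   }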
static TensorIterator make_reduction(
    const char* name, Tensor& result, const Tensor& self,
    at::OptionalIntArrayRef dim_opt,
    bool keepdim, ScalarType in_dtype, ScalarType out_dtype) {
  // check that result type and dtype match if provided
  TORCH_CHECK(
      !result.defined() || result.scalar_type() == out_dtype,
      name, ": provided dtype must match dtype of result. Got ",
      toString(result.scalar_type()),
      " and ",
      toString(out_dtype),
      ".");
  // dim={} performs an all-reduce, same as dim=None
  IntArrayRef dim = dim_opt.value_or(IntArrayRef{});
  int64_t ndim = self.dim();
  auto mask = make_dim_mask(dim, ndim);
  resize_reduction_result(result, self, mask, keepdim, out_dtype);
  auto viewed_result = review_reduce_result(result, ndim, mask, keepdim);
  namedinference::propagate_names_for_reduction(result, self, dim, keepdim);
  if (self.scalar_type() == in_dtype) {
    return TensorIterator::reduce_op(viewed_result, self);
  }
  return TensorIterator::reduce_op(viewed_result, self.to(in_dtype));
}

static C10_UNUSED TensorIterator make_reduction(
    const char* name, Tensor& result, const Tensor& self,
    at::OptionalIntArrayRef dim, bool keepdim, ScalarType out_dtype) {
  // special case for type promotion in mixed precision, improves computational
  // efficiency.
  // We don't generalize this to common mismatched input/output types to avoid cross
  // product of templated kernel launches.
  const bool gpu_lowp_to_f32 = (
    self.is_cuda() && (self.scalar_type() == kHalf || self.scalar_type() == kBFloat16) && out_dtype == kFloat);
  auto in_dtype = gpu_lowp_to_f32 ? self.scalar_type()
                                  : self.is_complex() ? c10::toComplexType(out_dtype)
                                                      : out_dtype;
  return make_reduction(name, result, self, dim, keepdim, in_dtype, out_dtype);
}

static TensorIterator make_reduction(
    const char* name, Tensor& result1, Tensor& result2, const Tensor& self,
    at::OptionalIntArrayRef dim_opt, bool keepdim, ScalarType dtype1,
    ScalarType dtype2) {
  // check that result type and dtype match if provided
  TORCH_CHECK(
      (!result1.defined() || result1.scalar_type() == dtype1) && (!result2.defined() || result2.scalar_type() == dtype2),
      name, ": provided dtype must match dtype of result. Got ",
      toString(result1.scalar_type()), toString(result2.scalar_type()),
      " and ",
      toString(dtype1), toString(dtype2),
      ".");

  // dim={} performs an all-reduce, same as dim=None
  auto dim = dim_opt.value_or(IntArrayRef{});
  int64_t ndim = self.dim();
  DimMask mask = make_dim_mask(dim, ndim);
  resize_reduction_result(result1, self, mask, keepdim, dtype1);
  auto viewed_result1 = review_reduce_result(result1, ndim, mask, keepdim);

  resize_reduction_result(result2, self, mask, keepdim, dtype2);
  auto viewed_result2 = review_reduce_result(result2, ndim, mask, keepdim);

  namedinference::propagate_names_for_reduction(result1, self, dim, keepdim);
  namedinference::propagate_names_for_reduction(result2, self, dim, keepdim);

  // special case for type promotion in mixed precision, improves computational
  // efficiency.
  // We don't generalize this to common mismatched input/output types to avoid cross
  // product of templated kernel launches.
  if (self.scalar_type() == dtype1 ||
      (self.is_cuda() && self.scalar_type() == kHalf && dtype1 == kFloat)) {
    return TensorIterator::reduce_op(viewed_result1, viewed_result2, self);
  }
  return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1));
}

static C10_UNUSED TensorIterator make_reduction(
    const char* name, Tensor& result1, Tensor& result2, const Tensor& self,
    at::OptionalIntArrayRef dim, bool keepdim, ScalarType dtype) {
  return make_reduction(name, result1, result2, self, dim, keepdim, dtype, dtype);
}

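// Checks that a reduction over `dim` is well-defined for an empty tensor: a
// 0-dim input only accepts dim 0 or -1, and otherwise the reduced dimension
// itself must have non-zero size (ops like max/argmax have no identity value
// to fall back on).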
static void zero_numel_check_dims(const Tensor& self, const int64_t dim, const char *fn_name) {
  if (self.ndimension() == 0) {
    TORCH_CHECK_INDEX(dim == 0 || dim == -1, fn_name,
      ": Expected reduction dim -1 or 0 for scalar but got ", dim);
  }
  else {
    TORCH_CHECK_INDEX(self.size(dim) != 0, fn_name,
      ": Expected reduction dim ", dim, " to have non-zero size.");
  }
}

static void zero_numel_check_dims(const Tensor& self, const IntArrayRef dim, const char *fn_name) {
  TORCH_CHECK(
      !dim.empty(),
      fn_name, ": Expected reduction dim to be specified for input.numel() == 0. ",
      "Specify the reduction dim with the 'dim' argument.");
  for (const int64_t d : dim) {
    zero_numel_check_dims(self, d, fn_name);
  }
}

static std::vector<int64_t> get_zero_numel_tensor_size(
    const Tensor& self,
    const int64_t dim,
    const bool keepdim,
    const char* fn_name) {
  TORCH_INTERNAL_ASSERT(self.numel() == 0, fn_name, ": Expected self.numel() == 0.");
  zero_numel_check_dims(self, dim, fn_name);
  std::vector<int64_t> sizes;
  if (keepdim) {
    sizes = self.sizes().vec();
    sizes[dim] = 1;
  }
  else {
    for (const auto d : c10::irange(self.dim())) {
      if (d != dim) {
        sizes.push_back(self.sizes()[d]);
      }
    }
  }
  return sizes;
}

// Resizes `result` and `result_indices` for a reduction over dim `dim` of a
// zero-numel `self`, taking `keepdim` into account. Call this when reducing a
// zero-numel tensor whose reduction dimension itself has non-zero size and the
// appropriately-shaped (empty) outputs should simply be returned.
static C10_UNUSED void zero_numel_tensor_resize(Tensor& result, Tensor& result_indices,
                                                const Tensor& self, const int64_t dim,
                                                const bool keepdim, const char *fn_name) {
  auto sizes = get_zero_numel_tensor_size(self, dim, keepdim, fn_name);
  at::native::resize_output(result, sizes);
  at::native::resize_output(result_indices, sizes);
}

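// Resolves the dtype a reduction computes in when the caller did not pass one
// explicitly: integral (and bool) inputs are promoted to int64 when
// `promote_integers` is set; otherwise the input dtype is used as-is.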
inline ScalarType get_dtype_from_self(
    const Tensor& self,
    const c10::optional<ScalarType>& dtype,
    bool promote_integers) {
  if (dtype.has_value()) {
    return dtype.value();
  }
  ScalarType src_type = self.scalar_type();
  if (promote_integers && at::isIntegralType(src_type, /*includeBool=*/true)) {
    return kLong;
  }
  return src_type;
}

inline ScalarType get_dtype_from_result(Tensor& result, c10::optional<ScalarType> dtype) {
  TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. You likely tried to call an operator with an out argument but the out argument was an undefined tensor.");
  if (dtype.has_value()) {
    return dtype.value();
  } else {
    return result.scalar_type();
  }
}

} // namespace at::native

namespace at::meta {

static C10_UNUSED DimVector get_reduction_shape(
    const Tensor& self,
    IntArrayRef dims,
    bool keepdim,
    bool allow_empty_dims=false) {
  auto mask = native::make_dim_mask(dims, self.dim(), allow_empty_dims);
  return native::shape_from_dim_mask(self, mask, keepdim);
}

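// Meta/structured-kernel counterpart of the eager helpers above: wraps
// negative dims, computes the reduced output shape, declares output 0 with
// `out_dtype`, and propagates names for named-tensor inputs.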
static void resize_reduction(
    impl::MetaBase& meta,
    const Tensor& self,
    OptionalIntArrayRef opt_dims,
    bool keepdim,
    ScalarType out_dtype,
    bool allow_empty_dims=false) {
  DimVector dims_ = at::native::make_dim_vector(opt_dims, self.dim());
  maybe_wrap_dims(dims_, self.dim());
  auto shape = get_reduction_shape(self, dims_, keepdim, allow_empty_dims);
  meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype));
  namedinference::propagate_names_for_reduction(
      meta.maybe_get_output(), self, dims_, keepdim);
}

static void resize_reduction_with_indices(
    impl::MetaBase& meta,
    const Tensor& self,
    IntArrayRef dims,
    bool keepdim,
    ScalarType out_dtype) {
  DimVector dims_(dims);
  maybe_wrap_dims(dims_, self.dim());
  auto shape = get_reduction_shape(self, dims_, keepdim);
  meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype));
  meta.set_output_raw_strided(1, shape, {}, self.options().dtype(kLong));
  namedinference::propagate_names_for_reduction(
      meta.maybe_get_output(0), self, dims_, keepdim);
  namedinference::propagate_names_for_reduction(
      meta.maybe_get_output(1), self, dims_, keepdim);
}

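// Unlike the at::native overloads, the output here is assumed to have been
// sized already (e.g. by resize_reduction in the op's meta function), so this
// only re-views it at full rank and builds the reduce_op iterator, converting
// `self` when in_dtype differs from its dtype.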
static TensorIterator make_reduction(
    const Tensor& self,
    const Tensor& result,
    OptionalIntArrayRef opt_dims,
    bool keepdim,
    ScalarType in_dtype) {
  int64_t ndim = self.dim();
  auto mask = at::native::make_dim_mask(opt_dims, ndim);
  auto viewed_result =
      at::native::review_reduce_result(result, ndim, mask, keepdim);
  if (self.scalar_type() == in_dtype) {
    return TensorIterator::reduce_op(viewed_result, self);
  }
  return TensorIterator::reduce_op(viewed_result, self.to(in_dtype));
}

static TensorIterator make_reduction(
    const Tensor& self,
    const Tensor& result1,
    const Tensor& result2,
    IntArrayRef dims,
    bool keepdim,
    ScalarType dtype1,
    ScalarType /*dtype2*/) {
  int64_t ndim = self.dim();
  auto mask = at::native::make_dim_mask(dims, ndim);
  auto viewed_result1 = at::native::review_reduce_result(result1, ndim, mask, keepdim);
  auto viewed_result2 = at::native::review_reduce_result(result2, ndim, mask, keepdim);
  // special case for type promotion in mixed precision, improves computational efficiency.
  // We don't generalize this to common mismatched input/output types to avoid cross product
  // of templated kernel launches.
  if (self.scalar_type() == dtype1 ||
      (self.is_cuda() && self.scalar_type() == kHalf && dtype1 == kFloat)) {
    return TensorIterator::reduce_op(viewed_result1, viewed_result2, self);
  }
  return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1));
}

static C10_UNUSED TensorIterator make_reduction_from_out_ty(
    const Tensor& self,
    const Tensor& result,
    OptionalIntArrayRef opt_dims,
    bool keepdim,
    ScalarType out_dtype) {
  // special case for type promotion in mixed precision, improves computational
  // efficiency.
  // We don't generalize this to common mismatched input/output types to avoid cross
  // product of templated kernel launches.
  const bool gpu_lowp_to_f32 =
      (self.is_cuda() &&
       (self.scalar_type() == kHalf || self.scalar_type() == kBFloat16) &&
       out_dtype == kFloat);
  auto in_dtype = gpu_lowp_to_f32 ? self.scalar_type() : out_dtype;
  return make_reduction(self, result, opt_dims, keepdim, in_dtype);
}

} // namespace at::meta