#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/EmptyTensor.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/Context.h>
#include <ATen/detail/PrivateUse1HooksInterface.h>
#include <c10/core/CPUAllocator.h>
#include <c10/util/safe_numerics.h>
#include <limits>

namespace at::detail {
namespace {

c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
  if (pin_memory) {
    // NB: This is not quite right: if you somehow had both CUDA and PrivateUse1
    // initialized in the same PyTorch build, you would ONLY ever get the CUDA
    // pinned memory allocator. To properly support this, see
    // https://github.com/pytorch/pytorch/issues/14560
    if (at::globalContext().hasCUDA()) {
      return at::detail::getCUDAHooks().getPinnedMemoryAllocator();
    } else if (at::isPrivateUse1HooksRegistered()) {
      return at::GetPrivateUse1HooksInterface()->getPinnedMemoryAllocator();
    } else {
      TORCH_CHECK(false, "Need to provide pin_memory allocator to use pin memory.");
    }
  }
  return c10::GetCPUAllocator();
}

constexpr uint64_t storage_max() {
  // int64_t and size_t are used somewhat inconsistently throughout ATen.
  // To be safe, storage size calculations must fit in both types.
  constexpr auto int64_max = static_cast<uint64_t>(
      std::numeric_limits<int64_t>::max());
  constexpr auto size_max = static_cast<uint64_t>(
      std::numeric_limits<size_t>::max());
  return std::min(int64_max, size_max);
}
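// Note (added for context, not in the original source): on typical 64-bit
// builds size_t's maximum exceeds int64_t's, so this evaluates to INT64_MAX;
// on 32-bit builds SIZE_MAX is the smaller of the two and becomes the limit.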

inline void raise_warning_for_complex_half(ScalarType dtype) {
  if (dtype == kComplexHalf) {
    TORCH_WARN_ONCE(
        "ComplexHalf support is experimental and many operators don't support it yet.");
  }
}

} // namespace (anonymous)

size_t computeStorageNbytesContiguous(
    IntArrayRef sizes,
    size_t itemsize_bytes,
    size_t storage_offset
) {
  // Ignore overflow checks on mobile
#ifndef C10_MOBILE
  uint64_t size = 1;
  bool overflowed = c10::safe_multiplies_u64(sizes, &size);
  overflowed |= c10::add_overflows(size, storage_offset, &size);
  overflowed |= c10::mul_overflows(size, itemsize_bytes, &size);
  overflowed |= size > storage_max();
  TORCH_CHECK(!overflowed,
              "Storage size calculation overflowed with sizes=", sizes);
  return static_cast<size_t>(size);
#else
  const auto numel = c10::multiply_integers(sizes);
  return itemsize_bytes * (storage_offset + numel);
#endif
}
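// Worked example for the contiguous case above (illustrative, not part of the
// original source): sizes = {2, 3}, itemsize_bytes = 4 (e.g. float) and
// storage_offset = 0 give (0 + 2*3) * 4 = 24 bytes, with every intermediate
// value checked for overflow and compared against storage_max().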

size_t computeStorageNbytes(
    IntArrayRef sizes,
    IntArrayRef strides,
    size_t itemsize_bytes,
    size_t storage_offset
) {
  TORCH_CHECK(
      sizes.size() == strides.size(),
      "dimensionality of sizes (",
      sizes.size(),
      ") must match dimensionality of strides (",
      strides.size(),
      ")");
  // Ignore overflow checks on mobile
#ifndef C10_MOBILE
  // size of the underlying storage is 1 bigger than the offset
  // of the last element according to stride
  uint64_t size = storage_offset + 1;
  bool overflowed = false;
  for (const auto i : c10::irange(sizes.size())) {
    if (sizes[i] == 0) {
      return 0;
    }
    uint64_t strided_size = 0;
    overflowed |= c10::mul_overflows(strides[i], sizes[i] - 1, &strided_size);
    overflowed |= c10::add_overflows(size, strided_size, &size);
  }
  overflowed |= c10::mul_overflows(size, itemsize_bytes, &size);
  overflowed |= size > storage_max();
  TORCH_CHECK(!overflowed,
              "Storage size calculation overflowed with sizes=",
              sizes, " and strides=", strides);
  return static_cast<size_t>(size);
#else
  // size of the underlying storage is 1 bigger than the offset
  // of the last element according to stride
  uint64_t size = 1;
  for (const auto i : c10::irange(sizes.size())) {
    if (sizes[i] == 0) {
      return 0;
    }
    size += strides[i] * (sizes[i] - 1);
  }
  return itemsize_bytes * (storage_offset + size);
#endif
}
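// Worked example for the strided calculation above (illustrative, not part of
// the original source): with sizes = {2, 3}, strides = {3, 1}, itemsize_bytes = 4
// and storage_offset = 0, size = 1 + 3*(2-1) + 1*(3-1) = 6 elements, i.e. 24
// bytes. A broadcast-style zero stride shrinks this: strides = {0, 1} gives
// 1 + 0*(2-1) + 1*(3-1) = 3 elements (12 bytes) because the two rows alias the
// same memory; this is why the stride-based bound is used rather than numel.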

SymInt computeStorageNbytesContiguous(
    SymIntArrayRef sizes,
    const SymInt& itemsize_bytes,
    const SymInt& storage_offset
) {
  const auto numel = c10::multiply_integers(sizes);
  return itemsize_bytes * (storage_offset + numel);
}

// not including mobile-only macros in this function,
// since mobile shouldn't be using symints.
SymInt computeStorageNbytes(
    SymIntArrayRef sizes,
    SymIntArrayRef strides,
    const SymInt& itemsize_bytes,
    const SymInt& storage_offset
) {
  TORCH_CHECK(
      sizes.size() == strides.size(),
      "dimensionality of sizes (",
      sizes.size(),
      ") must match dimensionality of strides (",
      strides.size(),
      ")");
  // size of the underlying storage is 1 bigger than the offset
  // of the last element according to stride
  SymInt size = 1;
  for (const auto i : c10::irange(sizes.size())) {
    if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[i].sym_eq(0))) {
      return 0;
    }
    size += strides[i] * (sizes[i] - 1);
  }
  return itemsize_bytes * (storage_offset + size);
}
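
// _empty_generic is the shared implementation behind empty_generic and
// empty_symint_meta below: it allocates an uninitialized, resizable byte
// storage of the required size through the given allocator and wraps it in a
// TensorImpl carrying the supplied dispatch keys, dtype and (optional) memory
// format. (Summary comment added for readability; not in the original source.)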
template <typename T>
TensorBase _empty_generic(
    ArrayRef<T> size,
    c10::Allocator* allocator,
    c10::DispatchKeySet ks,
    ScalarType scalar_type,
    c10::optional<c10::MemoryFormat> memory_format_opt) {
  at::detail::check_size_nonnegative(size);
  at::detail::raise_warning_for_complex_half(scalar_type);
  caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type);
  auto size_bytes = computeStorageNbytesContiguous(size, dtype.itemsize());
  auto storage_impl = c10::make_intrusive<StorageImpl>(
      c10::StorageImpl::use_byte_size_t(),
      size_bytes,
      allocator,
      /*resizeable=*/true);
  auto tensor = detail::make_tensor_base<TensorImpl>(
      std::move(storage_impl), ks, dtype);
  // Default TensorImpl has size [0]
  // NB: test for meta dispatch key to avoid guarding on zero-ness
  if (ks.has(c10::DispatchKey::Meta) || size.size() != 1 || size[0] != 0) {
    tensor.unsafeGetTensorImpl()->generic_set_sizes_contiguous(size);
  }
  if (memory_format_opt.has_value()) {
    // Restriding a just-created empty contiguous tensor does nothing.
    if (*memory_format_opt != MemoryFormat::Contiguous) {
      tensor.unsafeGetTensorImpl()->empty_tensor_restride(*memory_format_opt);
    }
  }
  return tensor;
}

TensorBase empty_generic(
    IntArrayRef size,
    c10::Allocator* allocator,
    c10::DispatchKeySet ks,
    ScalarType scalar_type,
    c10::optional<c10::MemoryFormat> memory_format_opt) {
  return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt);
}

template <typename T>
TensorBase _empty_strided_generic(
    T size,
    T stride,
    c10::Allocator* allocator,
    c10::DispatchKeySet ks,
    ScalarType scalar_type) {
  at::detail::check_size_nonnegative(size);
  at::detail::raise_warning_for_complex_half(scalar_type);
  caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type);
  auto size_bytes = computeStorageNbytes(size, stride, dtype.itemsize());
  auto storage_impl = c10::make_intrusive<StorageImpl>(
      c10::StorageImpl::use_byte_size_t(),
      size_bytes,
      allocator,
      /*resizeable=*/true);
  auto tensor = detail::make_tensor_base<TensorImpl>(
      std::move(storage_impl), ks, dtype);
  tensor.unsafeGetTensorImpl()->set_sizes_and_strides(size, stride);
  return tensor;
}

TensorBase empty_strided_generic(
    IntArrayRef size,
    IntArrayRef stride,
    c10::Allocator* allocator,
    c10::DispatchKeySet ks,
    ScalarType scalar_type) {
  return _empty_strided_generic<IntArrayRef>(size, stride, allocator, ks, scalar_type);
}

TensorBase empty_strided_symint_generic(
    SymIntArrayRef size,
    SymIntArrayRef stride,
    c10::Allocator* allocator,
    c10::DispatchKeySet ks,
    ScalarType scalar_type) {
  return _empty_strided_generic<SymIntArrayRef>(size, stride, allocator, ks, scalar_type);
}

TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory,
                     c10::optional<c10::MemoryFormat> memory_format_opt) {
  auto allocator = GetCPUAllocatorMaybePinned(pin_memory);
  constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU);
  return empty_generic(size, allocator, cpu_ks, dtype, memory_format_opt);
}
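// Illustrative call, not taken from this file: at::detail::empty_cpu({2, 3},
// at::kFloat, /*pin_memory=*/false, c10::nullopt) returns an uninitialized
// 2x3 float tensor backed by the default CPU allocator; passing pin_memory=true
// would route the allocation through GetCPUAllocatorMaybePinned's pinned path.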

TensorBase empty_cpu(
    IntArrayRef size,
    c10::optional<ScalarType> dtype_opt,
    c10::optional<Layout> layout_opt,
    c10::optional<Device> device_opt,
    c10::optional<bool> pin_memory_opt,
    c10::optional<c10::MemoryFormat> memory_format_opt) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::CPU);
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
  auto pin_memory = pinned_memory_or_default(pin_memory_opt);
  auto dtype = dtype_or_default(dtype_opt);
  return empty_cpu(size, dtype, pin_memory, memory_format_opt);
}

TensorBase empty_cpu(
    IntArrayRef size, const TensorOptions &options) {
  return at::detail::empty_cpu(
      size,
      optTypeMetaToScalarType(options.dtype_opt()),
      options.layout_opt(),
      options.device_opt(),
      options.pinned_memory_opt(),
      options.memory_format_opt());
}

TensorBase empty_strided_cpu(IntArrayRef size, IntArrayRef stride,
                             ScalarType dtype, bool pin_memory) {
  auto allocator = at::detail::GetCPUAllocatorMaybePinned(pin_memory);
  constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU);
  return at::detail::empty_strided_generic(
      size, stride, allocator, cpu_ks, dtype);
}

TensorBase empty_strided_cpu(
    IntArrayRef size,
    IntArrayRef stride,
    c10::optional<ScalarType> dtype_opt,
    c10::optional<Layout> layout_opt,
    c10::optional<Device> device_opt,
    c10::optional<bool> pin_memory_opt) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::CPU);
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
  auto pin_memory = pinned_memory_or_default(pin_memory_opt);
  auto dtype = dtype_or_default(dtype_opt);
  return at::detail::empty_strided_cpu(size, stride, dtype, pin_memory);
}

TensorBase empty_strided_cpu(
    IntArrayRef size,
    IntArrayRef stride,
    const TensorOptions &options) {
  return at::detail::empty_strided_cpu(
      size,
      stride,
      optTypeMetaToScalarType(options.dtype_opt()),
      options.layout_opt(),
      options.device_opt(),
      options.pinned_memory_opt());
}

// The meta allocator ignores whatever allocation is requested and always
// gives you nullptr
struct MetaAllocator final : public at::Allocator {
  MetaAllocator() = default;
  ~MetaAllocator() override = default;
  static void deleter(void* const pointer) {
    TORCH_INTERNAL_ASSERT(!pointer);
  }
  DataPtr allocate(const size_t nbytes) const override {
    return {nullptr, nullptr, &deleter, at::Device(DeviceType::Meta)};
  }
  DeleterFnPtr raw_deleter() const override {
    return deleter;
  }
  void copy_data(void* dest, const void* src, std::size_t count) const final {}
};

static MetaAllocator g_meta_alloc;

REGISTER_ALLOCATOR(kMeta, &g_meta_alloc);
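// Note (added for context, not in the original source): REGISTER_ALLOCATOR
// records g_meta_alloc in c10's per-device-type allocator table, which is what
// the GetAllocator(kMeta) calls below retrieve.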

TensorBase empty_meta(IntArrayRef size, ScalarType dtype,
                      c10::optional<c10::MemoryFormat> memory_format_opt) {
  auto *allocator = GetAllocator(kMeta);
  constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta);
  return at::detail::empty_generic(
      size, allocator, meta_dks, dtype, memory_format_opt);
}
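// Illustrative call, not taken from this file: at::detail::empty_meta({2, 3},
// at::kFloat, c10::nullopt) records shape, dtype and contiguous strides but,
// because MetaAllocator::allocate returns nullptr, owns no actual data; this
// lets shape and stride propagation run without allocating real memory.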

TensorBase empty_meta(
    IntArrayRef size,
    c10::optional<ScalarType> dtype_opt,
    c10::optional<Layout> layout_opt,
    c10::optional<Device> device_opt,
    c10::optional<bool> pin_memory_opt,
    c10::optional<c10::MemoryFormat> memory_format_opt
) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta);
  // NB: because there is no SparseMeta (yet), a non-strided layout can
  // actually be requested here, so reject it with a user-facing error
  // rather than a debug assert.
  TORCH_CHECK_NOT_IMPLEMENTED(
      layout_or_default(layout_opt) == Layout::Strided,
      "non-strided meta tensors not supported yet"
  );
  auto dtype = dtype_or_default(dtype_opt);
  return empty_meta(size, dtype, memory_format_opt);
}

TensorBase empty_symint_meta(
    SymIntArrayRef size,
    c10::optional<ScalarType> dtype_opt,
    c10::optional<Layout> layout_opt,
    c10::optional<Device> device_opt,
    c10::optional<bool> pin_memory_opt,
    c10::optional<c10::MemoryFormat> memory_format_opt
) {
  auto *allocator = GetAllocator(kMeta);
  constexpr c10::DispatchKeySet ks(c10::DispatchKey::Meta);
  auto scalar_type = dtype_or_default(dtype_opt);
  return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt);
}

TensorBase empty_meta(
    IntArrayRef size, const TensorOptions &options) {
  return at::detail::empty_meta(
      size,
      optTypeMetaToScalarType(options.dtype_opt()),
      options.layout_opt(),
      options.device_opt(),
      options.pinned_memory_opt(),
      options.memory_format_opt());
}

TensorBase empty_strided_meta(IntArrayRef size, IntArrayRef stride,
                              ScalarType dtype) {
  auto *allocator = GetAllocator(kMeta);
  constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta);
  return at::detail::empty_strided_generic(
      size, stride, allocator, meta_dks, dtype);
}

TensorBase empty_strided_meta(
    IntArrayRef size,
    IntArrayRef stride,
    c10::optional<ScalarType> dtype_opt,
    c10::optional<Layout> layout_opt,
    c10::optional<Device> device_opt,
    c10::optional<bool> pin_memory_opt) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta);
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
  auto dtype = dtype_or_default(dtype_opt);
  return at::detail::empty_strided_meta(size, stride, dtype);
}

TensorBase empty_strided_meta(
    IntArrayRef size,
    IntArrayRef stride,
    const TensorOptions &options) {
  return at::detail::empty_strided_meta(
      size,
      stride,
      optTypeMetaToScalarType(options.dtype_opt()),
      options.layout_opt(),
      options.device_opt(),
      options.pinned_memory_opt());
}

TensorBase empty_strided_symint_meta(SymIntArrayRef size, SymIntArrayRef stride,
                                     ScalarType dtype) {
  auto *allocator = GetAllocator(kMeta);
  constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta);
  return at::detail::empty_strided_symint_generic(
      size, stride, allocator, meta_dks, dtype);
}

TensorBase empty_strided_symint_meta(
    SymIntArrayRef size,
    SymIntArrayRef stride,
    c10::optional<ScalarType> dtype_opt,
    c10::optional<Layout> layout_opt,
    c10::optional<Device> device_opt,
    c10::optional<bool> pin_memory_opt) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta);
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
  auto dtype = dtype_or_default(dtype_opt);
  return at::detail::empty_strided_symint_meta(size, stride, dtype);
}

TensorBase empty_strided_symint_meta(
    SymIntArrayRef size,
    SymIntArrayRef stride,
    const TensorOptions &options) {
  return at::detail::empty_strided_symint_meta(
      size,
      stride,
      optTypeMetaToScalarType(options.dtype_opt()),
      options.layout_opt(),
      options.device_opt(),
      options.pinned_memory_opt());
}

} // namespace at::detail