#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <c10/util/Optional.h>
#include <ATen/quantized/Quantizer.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
namespace at {
namespace native {
// Take a Device that may not have a device_index set (i.e., index -1,
// representing the current device) and return the corresponding Device with
// the index of the device that is actually current at the time of this call.
// No-op if the device_index is already set.
static inline Device ensure_has_index(Device device) {
if (device.is_cpu() || device.has_index()) {
return device;
}
const c10::impl::DeviceGuardImplInterface* impl = c10::impl::getDeviceGuardImpl(device.type());
return impl->getDevice();
}
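// Overload for optional<Device>: resolves the index when a device is present
// and passes nullopt through untouched.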
static inline optional<Device> ensure_has_index(optional<Device> device) {
if (!device.has_value()) {
return nullopt;
}
return ensure_has_index(device.value());
}
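// Materializes a copy of `self` with the requested dtype/device/pin_memory and
// memory format. Layout changes are not supported; any option that is not given
// defaults to self's, because the requested options are merged into
// self.options() below.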
Tensor _to_copy(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
c10::optional<bool> pin_memory,
bool non_blocking,
c10::optional<c10::MemoryFormat> optional_memory_format) {
TORCH_CHECK(!layout.has_value() || self.layout() == layout.value(),
"to(options) doesn't support converting to a different layout, "
"but got self.layout being ", self.layout(),
" and options.layout set as ", layout.value());
auto options = TensorOptions()
.dtype(dtype)
.layout(layout)
.device(device)
.pinned_memory(pin_memory);
if (options.has_device()) {
options = options.device(ensure_has_index(options.device()));
}
// memory_format is handled separately due to MemoryFormat::Preserve logic
options = self.options().merge_in(options).memory_format(c10::nullopt);
auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve);
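// Pin the CPU destination of a non_blocking CUDA -> CPU copy so the
// asynchronous device-to-host transfer writes into page-locked memory.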
bool pin_out = (non_blocking && self.is_cuda() && options.device().is_cpu() &&
(options.layout() == c10::kStrided));
if (memory_format == MemoryFormat::Preserve) {
if (self.is_non_overlapping_and_dense() && options.device().supports_as_strided()) {
Tensor r;
if (self.is_quantized()) {
r = at::empty_quantized(self.sizes(), self, options);
at::QuantizerPtr quantizer = r.quantizer();
r.copy_(self, non_blocking);
set_quantizer_(r, quantizer);
} else {
r = at::empty_strided(
self.sizes(),
self.strides(),
options.pinned_memory(pin_out));
r.copy_(self, non_blocking);
}
return r;
} else {
memory_format = self.suggest_memory_format();
}
}
// See Note [Explicit nullopt MemoryFormat argument]
// TODO: empty_quantized does not work here: it raises an exception in
// CheckMemoryFormat.h before the empty_affine_quantized /
// _empty_per_channel_affine_quantized calls are reached.
// at::empty does not work either, because there is no proper at::empty support
// for quantized tensors; it would return a quantized tensor with an
// UnknownQuantizer.
auto r = self.is_quantized() ? at::empty_like(self, memory_format)
: at::empty(self.sizes(),
options.memory_format(memory_format).pinned_memory(pin_out), c10::nullopt);
r.copy_(self, non_blocking);
return r;
}
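// Returns true if `test` is nullopt or holds exactly `value`.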
template <typename T>
static inline bool is_null_or_equal_to(const c10::optional<T>& test, const T& value) {
if (!test.has_value()) {
return true;
}
return test.value() == value;
}
// NOTE: static runtime's to_maybe_copy_out relies on details of this
// check; if you change how it works, please update static runtime as
// well.
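// Returns true when a Tensor::to call with these arguments would be a no-op
// and simply alias (return) `self`.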
bool to_will_alias(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
bool copy,
c10::optional<c10::MemoryFormat> optional_memory_format) {
auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve);
return is_null_or_equal_to(dtype, self.dtype().toScalarType()) &&
is_null_or_equal_to(layout, self.layout()) &&
is_null_or_equal_to(device, self.device()) &&
!copy &&
(memory_format == MemoryFormat::Preserve ||
self.suggest_memory_format() == memory_format);
}
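// Shared implementation behind the Tensor::to overloads below: returns `self`
// unchanged when no conversion is needed, otherwise dispatches to at::_to_copy.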
static inline Tensor to_impl(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
c10::optional<bool> pin_memory,
bool non_blocking,
bool copy,
c10::optional<c10::MemoryFormat> optional_memory_format) {
// fast path
if (to_will_alias(self, dtype, layout, device, copy, optional_memory_format)) {
return self;
}
return at::_to_copy(
self, dtype, layout, device, pin_memory, non_blocking, optional_memory_format);
}
// If the input tensor is fp32, cast it to the reduced-precision dtype given
// for its device (cuda_dtype or cpu_dtype); otherwise leave it alone.
// (This is intended to be used internally by the JIT autocast implementation.)
Tensor _autocast_to_reduced_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) {
if (self.dtype() == at::ScalarType::Float &&
((self.device().is_cuda() && cuda_enabled) ||
(self.device().is_cpu() && cpu_enabled))
) {
at::ScalarType target = at::ScalarType::Undefined;
if (self.device().is_cuda()) {
target = cuda_dtype;
} else if (self.device().is_cpu()) {
target = cpu_dtype;
}
TORCH_INTERNAL_ASSERT(target != at::ScalarType::Undefined, "_autocast_to_reduced_precision requires legit ScalarType argument for given device");
return to_impl(
self, target, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
} else {
return self;
}
}
// If the input tensor is fp16 or bf16, cast it to fp32; otherwise leave it alone.
// (This is intended to be used internally by the JIT autocast implementation.)
Tensor _autocast_to_full_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled) {
if ((self.dtype() == at::ScalarType::Half || self.dtype() == at::ScalarType::BFloat16) &&
((self.device().is_cuda() && cuda_enabled) ||
(self.device().is_cpu() && cpu_enabled))
) {
return to_impl(
self, at::ScalarType::Float, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
} else {
return self;
}
}
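// The Tensor::to overloads: explicit dtype/layout/device options, a
// device + dtype pair, a dtype alone, and "same options as another tensor".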
Tensor to(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
c10::optional<bool> pin_memory,
bool non_blocking,
bool copy,
c10::optional<c10::MemoryFormat> optional_memory_format
) {
return to_impl(
self,
dtype,
layout,
ensure_has_index(device),
pin_memory,
non_blocking,
copy,
optional_memory_format);
}
Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
return to_impl(
self,
dtype,
nullopt,
ensure_has_index(device),
nullopt,
non_blocking,
copy,
optional_memory_format);
}
Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
return to_impl(
self,
dtype,
nullopt,
nullopt,
nullopt,
non_blocking,
copy,
optional_memory_format);
}
Tensor to(const Tensor& self, const Tensor& other, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
auto options = other.options();
return to_impl(
self,
options.dtype().toScalarType(),
options.layout(),
options.device(),
options.pinned_memory(),
non_blocking,
copy,
optional_memory_format);
}
// This op is important primarily for lazy / graph-based backends.
// While this vanilla implementation loops through each tensor and converts it
// to CPU independently, a lazy backend like XLA might need to batch the
// conversion and synchronize updates across the tensors.
std::vector<Tensor> _to_cpu(TensorList tensors) {
std::vector<Tensor> cpu_tensors;
for (const auto& t : tensors) {
cpu_tensors.push_back(t.cpu());
}
return cpu_tensors;
}
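// Backward of to_dense: route the gradient back to the layout of the original
// input (sparse COO via sparse_mask, MKLDNN via to_mkldnn).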
Tensor to_dense_backward(const Tensor& grad, const Tensor& input_) {
AT_ASSERT(input_.layout() != c10::kStrided);
if (input_.layout() == c10::kSparse) {
auto input = input_.coalesce();
return grad.sparse_mask(input);
}
if (input_.layout() == c10::kMkldnn) {
return grad.to_mkldnn(input_.scalar_type());
}
if (input_.layout() == c10::kStrided) {
return grad.to_dense();
}
AT_ERROR("Unsupported input layout: ", input_.layout());
}
Tensor to_mkldnn_backward(const Tensor& grad, const Tensor& input_) {
AT_ASSERT(input_.layout() == c10::kStrided);
return grad.to_dense(input_.scalar_type());
}
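// Dispatches to the layout-specific _to_dense kernels; strided tensors only
// need an optional dtype conversion.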
Tensor to_dense(const Tensor& tensor, c10::optional<c10::ScalarType> dtype) {
if (tensor.layout() == c10::kSparse) {
return tensor._to_dense(dtype);
}
if (tensor.layout() == c10::kSparseCsr) {
return tensor._to_dense(dtype);
}
if (tensor.layout() == c10::kMkldnn) {
return tensor._to_dense(dtype);
}
TORCH_CHECK(tensor.layout() == c10::kStrided, "to_dense does not support layout ", tensor.layout());
if (dtype) {
return tensor.to(*dtype);
}
return tensor;
}
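// Converts a sparse COO tensor to a dense (strided) one by adding it into a
// zero-initialized strided tensor of the same shape. The same pattern is used
// for CSR below.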
Tensor sparse_to_dense(
const Tensor& self,
c10::optional<ScalarType> dtype) {
TORCH_CHECK(
!dtype.has_value(), "dtype argument is not supported by sparse_to_dense");
Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided));
return dst.add_(self);
}
Tensor sparse_csr_to_dense(
const Tensor& self,
c10::optional<ScalarType> dtype) {
TORCH_CHECK(
!dtype.has_value(), "dtype argument is not supported by sparse_csr_to_dense");
Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided));
return dst.add_(self);
}
// Computes the strides for view_dtype output when the view dtype is
// smaller than the original dtype
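// Example (illustrative): viewing float32 (4 bytes) as int16 (2 bytes) gives
// size_ratio == 2; strides [4, 1] become [8, 1] (the last stride stays 1), and
// view_dtype below multiplies the last size by 2.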
inline DimVector compute_strides_for_view_dtype_downsize(IntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, ScalarType new_dtype) {
const int64_t ndim = old_strides.size();
TORCH_CHECK(
old_strides[ndim - 1] == 1,
"self.stride(-1) must be 1 to view ", old_dtype, " as ", new_dtype,
" (different element sizes), but got ", old_strides[ndim - 1]);
DimVector new_strides(ndim);
for (int64_t dim_idx = 0; dim_idx < ndim - 1; dim_idx++) {
new_strides[dim_idx] = old_strides[dim_idx] * size_ratio;
}
new_strides[ndim - 1] = 1;
return new_strides;
}
// Computes the strides for view_dtype output when the view dtype is
// larger than the original dtype
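// Example (illustrative): viewing int16 (2 bytes) as float32 (4 bytes) gives
// size_ratio == 2; strides [8, 1] become [4, 1], which requires every non-last
// stride to be divisible by 2, and view_dtype below divides the last size by 2.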
inline DimVector compute_strides_for_view_dtype_upsize(IntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, ScalarType new_dtype) {
const int64_t ndim = old_strides.size();
TORCH_CHECK(
old_strides[ndim - 1] == 1,
"self.stride(-1) must be 1 to view ", old_dtype, " as ", new_dtype,
" (different element sizes), but got ", old_strides[ndim - 1]);
DimVector new_strides(ndim);
for (int64_t dim_idx = 0; dim_idx < ndim - 1; dim_idx++) {
TORCH_CHECK(
(old_strides[dim_idx] % size_ratio) == 0,
"self.stride(", dim_idx, ") must be divisible by ", size_ratio,
" to view ", old_dtype, " as ", new_dtype, " (different element sizes), ",
"but got ", old_strides[dim_idx]);
new_strides[dim_idx] = old_strides[dim_idx] / size_ratio;
}
new_strides[ndim - 1] = 1;
return new_strides;
}
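// Reinterprets self's storage as `dtype` without copying. When the element
// sizes match, only the dtype changes; otherwise the last dimension, the
// strides, and the storage offset are rescaled by the element-size ratio (with
// divisibility checks where needed).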
Tensor view_dtype(const Tensor& self, ScalarType dtype) {
if (self.scalar_type() == dtype) {
return self;
}
const auto type_meta = c10::scalarTypeToTypeMeta(dtype);
TORCH_CHECK(!self.is_conj(),
"torch.Tensor.view is not supported for conjugate view tensors when converting to a different dtype.");
TORCH_CHECK(!self.is_neg(),
"torch.Tensor.view is not supported for tensors with negative bit set when converting to a different dtype.");
int64_t self_element_size = self.element_size();
int64_t new_element_size = static_cast<int64_t>(type_meta.itemsize());
Storage storage = self.storage();
auto new_tensor = detail::make_tensor<TensorImpl>(
std::move(storage), self.key_set(), type_meta);
auto* impl = new_tensor.unsafeGetTensorImpl();
if (self_element_size == new_element_size) {
impl->set_storage_offset(self.storage_offset());
impl->set_sizes_and_strides(self.sizes(), self.strides());
} else if (self.dim() == 0) {
TORCH_CHECK(false,
"self.dim() cannot be 0 to view ", self.scalar_type(), " as ",
dtype, " (different element sizes)");
} else if (self_element_size > new_element_size) {
// Downsizing element size
int64_t size_ratio = self_element_size / new_element_size;
auto new_strides = compute_strides_for_view_dtype_downsize(
self.strides(), size_ratio, self.scalar_type(), dtype);
auto old_sizes = self.sizes();
DimVector new_sizes(self.dim());
std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin());
new_sizes[self.dim() - 1] *= size_ratio;
auto new_storage_offset = size_ratio * self.storage_offset();
impl->set_storage_offset(new_storage_offset);
impl->set_sizes_and_strides(new_sizes, new_strides);
} else {
// Upsizing element size
int64_t size_ratio = new_element_size / self_element_size;
TORCH_CHECK(
(self.size(-1) % size_ratio) == 0,
"self.size(-1) must be divisible by ", size_ratio, " to view ",
self.scalar_type(), " as ", dtype, " (different element sizes), ",
"but got ", self.size(-1));
TORCH_CHECK(
(self.storage_offset() % size_ratio) == 0,
"self.storage_offset() must be divisible by ", size_ratio, " to view ",
self.scalar_type(), " as ", dtype, " (different element sizes), but got ",
self.storage_offset());
auto new_strides = compute_strides_for_view_dtype_upsize(
self.strides(), size_ratio, self.scalar_type(), dtype);
auto old_sizes = self.sizes();
DimVector new_sizes(self.dim());
std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin());
new_sizes[self.dim() - 1] /= size_ratio;
auto new_storage_offset = self.storage_offset() / size_ratio;
impl->set_storage_offset(new_storage_offset);
impl->set_sizes_and_strides(new_sizes, new_strides);
}
return new_tensor;
}
Tensor dense_to_sparse_csr(const Tensor& self) {
return self.to_sparse().to_sparse_csr();
}
Tensor csr_to_sparse_csr(const Tensor& self) {
return self;
}
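// COO -> CSR: coalesce the input, compress the sorted row indices into
// crow_indices, and reuse the column indices and values directly.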
Tensor coo_to_sparse_csr(const Tensor& self) {
TORCH_CHECK(
self.dim() == 2,
"Only 2D tensors can be converted to the CSR format but got shape: ",
self.sizes());
auto coalesced_self = self.coalesce();
auto row_indices = coalesced_self.indices()[0];
bool out_int32 = (row_indices.scalar_type() == at::kInt);
auto crow_indices = at::_convert_indices_from_coo_to_csr(
row_indices, self.size(0), out_int32);
return at::native::_sparse_csr_tensor_unsafe(
crow_indices,
coalesced_self.indices()[1].contiguous(),
coalesced_self.values(),
coalesced_self.sizes(),
coalesced_self.scalar_type(),
c10::kSparseCsr,
coalesced_self.device());
}
}} // namespace at::native