#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <c10/util/Optional.h>
#include <ATen/quantized/Quantizer.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
namespace at {
namespace native {
// Take a Device that may not have a device_index set (i.e., index -1,
// representing the current device) and return the corresponding Device with
// the index of the device that is actually current at the time of this call.
// No-op if the device_index is already set.
static inline Device ensure_has_index(Device device) {
if (device.is_cpu() || device.has_index()) {
return device;
}
const c10::impl::DeviceGuardImplInterface* impl = c10::impl::getDeviceGuardImpl(device.type());
return impl->getDevice();
}
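// Overload for optional<Device>: resolves the index when a device is present
// and passes nullopt through untouched.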
static inline optional<Device> ensure_has_index(optional<Device> device) {
if (!device.has_value()) {
return nullopt;
}
return ensure_has_index(device.value());
}
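// Materializes a copy of `self` with the requested dtype/device/pin_memory and
// memory format. Layout changes are not supported; any option that is not given
// defaults to self's, because the requested options are merged into
// self.options() below.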
Tensor _to_copy(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
c10::optional<bool> pin_memory,
bool non_blocking,
c10::optional<c10::MemoryFormat> optional_memory_format) {
TORCH_CHECK(!layout.has_value() || self.layout() == layout.value(),
"to(options) doesn't support converting to a different layout, "
"but got self.layout being ", self.layout(),
" and options.layout set as ", layout.value());
auto options = TensorOptions()
.dtype(dtype)
.layout(layout)
.device(device)
.pinned_memory(pin_memory);
if (options.has_device()) {
options = options.device(ensure_has_index(options.device()));
}
// memory_format is handled separately due to MemoryFormat::Preserve logic
options = self.options().merge_in(options).memory_format(c10::nullopt);
auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve);
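// Pin the CPU destination of a non_blocking CUDA -> CPU copy so the
// asynchronous device-to-host transfer writes into page-locked memory.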
bool pin_out = (non_blocking && self.is_cuda() && options.device().is_cpu() &&
(options.layout() == c10::kStrided));
if (memory_format == MemoryFormat::Preserve) {
if (self.is_non_overlapping_and_dense() && options.device().supports_as_strided()) {
Tensor r;
if (self.is_quantized()) {
r = at::empty_quantized(self.sizes(), self, options);
at::QuantizerPtr quantizer = r.quantizer();
r.copy_(self, non_blocking);
set_quantizer_(r, quantizer);
} else {
r = at::empty_strided(
self.sizes(),
self.strides(),
options.pinned_memory(pin_out));
r.copy_(self, non_blocking);
}
return r;
} else {
memory_format = self.suggest_memory_format();
}
}
// See Note [Explicit nullopt MemoryFormat argument]
// TODO: empty_quantized does not work here: it raises an exception in
// CheckMemoryFormat.h before the empty_affine_quantized /
// _empty_per_channel_affine_quantized calls are reached.
// at::empty does not work either, because there is no proper at::empty support
// for quantized tensors; it would return a quantized tensor with an
// UnknownQuantizer.
auto r = self.is_quantized() ? at::empty_like(self, memory_format)
: at::empty(self.sizes(),
options.memory_format(memory_format).pinned_memory(pin_out), c10::nullopt);
r.copy_(self, non_blocking);
return r;
}
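// Returns true if `test` is nullopt or holds exactly `value`.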
template <typename T>
static inline bool is_null_or_equal_to(const c10::optional<T>& test, const T& value) {
if (!test.has_value()) {
return true;
}
return test.value() == value;
}
// NOTE: static runtime's to_maybe_copy_out relies on details of this
// check; if you change how it works, please update static runtime as
// well.
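// Returns true when a Tensor::to call with these arguments would be a no-op
// and simply alias (return) `self`.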
bool to_will_alias(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
bool copy,
c10::optional<c10::MemoryFormat> optional_memory_format) {
auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve);
return is_null_or_equal_to(dtype, self.dtype().toScalarType()) &&
is_null_or_equal_to(layout, self.layout()) &&
is_null_or_equal_to(device, self.device()) &&
!copy &&
(memory_format == MemoryFormat::Preserve ||
self.suggest_memory_format() == memory_format);
}
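// Shared implementation behind the Tensor::to overloads below: returns `self`
// unchanged when no conversion is needed, otherwise dispatches to at::_to_copy.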
static inline Tensor to_impl(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
c10::optional<bool> pin_memory,
bool non_blocking,
bool copy,
c10::optional<c10::MemoryFormat> optional_memory_format) {
// fast path
if (to_will_alias(self, dtype, layout, device, copy, optional_memory_format)) {
return self;
}
return at::_to_copy(
self, dtype, layout, device, pin_memory, non_blocking, optional_memory_format);
}
// If the input tensor is fp32, cast it to the reduced-precision dtype given
// for its device (cuda_dtype or cpu_dtype); otherwise leave it alone.
// (This is intended to be used internally by the JIT autocast implementation.)
Tensor _autocast_to_reduced_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) {
if (self.dtype() == at::ScalarType::Float &&
((self.device().is_cuda() && cuda_enabled) ||
(self.device().is_cpu() && cpu_enabled))
) {
at::ScalarType target = at::ScalarType::Undefined;
if (self.device().is_cuda()) {
target = cuda_dtype;
} else if (self.device().is_cpu()) {
target = cpu_dtype;
}
TORCH_INTERNAL_ASSERT(target != at::ScalarType::Undefined, "_autocast_to_reduced_precision requires legit ScalarType argument for given device");
return to_impl(
self, target, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
} else {
return self;
}
}
// If the input tensor is fp16 or bf16, cast it to fp32; otherwise leave it alone.
// (This is intended to be used internally by the JIT autocast implementation.)
Tensor _autocast_to_full_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled) {
if ((self.dtype() == at::ScalarType::Half || self.dtype() == at::ScalarType::BFloat16) &&
((self.device().is_cuda() && cuda_enabled) ||
(self.device().is_cpu() && cpu_enabled))
) {
return to_impl(
self, at::ScalarType::Float, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
} else {
return self;
}
}
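// The Tensor::to overloads: explicit dtype/layout/device options, a
// device + dtype pair, a dtype alone, and "same options as another tensor".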
Tensor to(
const Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,
c10::optional<Device> device,
c10::optional<bool> pin_memory,
bool non_blocking,
bool copy,
c10::optional<c10::MemoryFormat> optional_memory_format
) {
return to_impl(
self,
dtype,
layout,
ensure_has_index(device),
pin_memory,
non_blocking,
copy,
optional_memory_format);
}
Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
return to_impl(
self,
dtype,
nullopt,
ensure_has_index(device),
nullopt,
non_blocking,
copy,
optional_memory_format);
}
Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
return to_impl(
self,
dtype,
nullopt,
nullopt,
nullopt,
non_blocking,
copy,
optional_memory_format);
}
Tensor to(const Tensor& self, const Tensor& other, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
auto options = other.options();
return to_impl(
self,
options.dtype().toScalarType(),
options.layout(),
options.device(),
options.pinned_memory(),
non_blocking,
copy,
optional_memory_format);
}
// This op is important primarily for lazy / graph-based backends.
// While this vanilla implementation loops through each tensor and converts it
// to CPU independently, a lazy backend like XLA might need to batch the
// conversion and synchronize updates across the tensors.
std::vector<Tensor> _to_cpu(TensorList tensors) {
std::vector<Tensor> cpu_tensors;
for (const auto& t : tensors) {
cpu_tensors.push_back(t.cpu());
}
return cpu_tensors;
}
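// Backward of to_dense: route the gradient back to the layout of the original
// input (sparse COO via sparse_mask, MKLDNN via to_mkldnn).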
Tensor to_dense_backward(const Tensor& grad, const Tensor& input_) {
AT_ASSERT(input_.layout() != c10::kStrided);
if (input_.layout() == c10::kSparse) {
auto input = input_.coalesce();
return grad.sparse_mask(input);
}
if (input_.layout() == c10::kMkldnn) {
return grad.to_mkldnn(input_.scalar_type());
}
if (input_.layout() == c10::kStrided) {
return grad.to_dense();
}
AT_ERROR("Unsupported input layout: ", input_.layout());
}
Tensor to_mkldnn_backward(const Tensor& grad, const Tensor& input_) {
AT_ASSERT(input_.layout() == c10::kStrided);
return grad.to_dense(input_.scalar_type());
}
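// Dispatches to the layout-specific _to_dense kernels; strided tensors only
// need an optional dtype conversion.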
Tensor to_dense(const Tensor& tensor, c10::optional<c10::ScalarType> dtype) {
if (tensor.layout() == c10::kSparse) {
return tensor._to_dense(dtype);
}
if (tensor.layout() == c10::kSparseCsr) {
return tensor._to_dense(dtype);
}
if (tensor.layout() == c10::kMkldnn) {
return tensor._to_dense(dtype);
}
TORCH_CHECK(tensor.layout() == c10::kStrided, "to_dense does not support layout ", tensor.layout());
if (dtype) {
return tensor.to(*dtype);
}
return tensor;
}
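// Converts a sparse COO tensor to a dense (strided) one by adding it into a
// zero-initialized strided tensor of the same shape. The same pattern is used
// for CSR below.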
Tensor sparse_to_dense(
const Tensor& self,
c10::optional<ScalarType> dtype) {
TORCH_CHECK(
!dtype.has_value(), "dtype argument is not supported by sparse_to_dense");
Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided));
return dst.add_(self);
}
Tensor sparse_csr_to_dense(
const Tensor& self,
c10::optional<ScalarType> dtype) {
TORCH_CHECK(
!dtype.has_value(), "dtype argument is not supported by sparse_csr_to_dense");
Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided));
return dst.add_(self);
}
// Computes the strides for view_dtype output when the view dtype is
// smaller than the original dtype
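// Example (illustrative): viewing float32 (4 bytes) as int16 (2 bytes) gives
// size_ratio == 2; strides [4, 1] become [8, 1] (the last stride stays 1), and
// view_dtype below multiplies the last size by 2.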
inline DimVector compute_strides_for_view_dtype_downsize(IntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, ScalarType new_dtype) {
const int64_t ndim = old_strides.size();
TORCH_CHECK(
old_strides[ndim - 1] == 1,
"self.stride(-1) must be 1 to view ", old_dtype, " as ", new_dtype,
" (different element sizes), but got ", old_strides[ndim - 1]);
DimVector new_strides(ndim);
for (int64_t dim_idx = 0; dim_idx < ndim - 1; dim_idx++) {
new_strides[dim_idx] = old_strides[dim_idx] * size_ratio;
}
new_strides[ndim - 1] = 1;
return new_strides;
}
// Computes the strides for view_dtype output when the view dtype is
// larger than the original dtype
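// Example (illustrative): viewing int16 (2 bytes) as float32 (4 bytes) gives
// size_ratio == 2; strides [8, 1] become [4, 1], which requires every non-last
// stride to be divisible by 2, and view_dtype below divides the last size by 2.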
inline DimVector compute_strides_for_view_dtype_upsize(IntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, ScalarType new_dtype) {
const int64_t ndim = old_strides.size();
TORCH_CHECK(
old_strides[ndim - 1] == 1,
"self.stride(-1) must be 1 to view ", old_dtype, " as ", new_dtype,
" (different element sizes), but got ", old_strides[ndim - 1]);
DimVector new_strides(ndim);
for (int64_t dim_idx = 0; dim_idx < ndim - 1; dim_idx++) {
TORCH_CHECK(
(old_strides[dim_idx] % size_ratio) == 0,
"self.stride(", dim_idx, ") must be divisible by ", size_ratio,
" to view ", old_dtype, " as ", new_dtype, " (different element sizes), ",
"but got ", old_strides[dim_idx]);
new_strides[dim_idx] = old_strides[dim_idx] / size_ratio;
}
new_strides[ndim - 1] = 1;
return new_strides;
}
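// Reinterprets self's storage as `dtype` without copying. When the element
// sizes match, only the dtype changes; otherwise the last dimension, the
// strides, and the storage offset are rescaled by the element-size ratio (with
// divisibility checks where needed).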
Tensor view_dtype(const Tensor& self, ScalarType dtype) {
if (self.scalar_type() == dtype) {
return self;
}
const auto type_meta = c10::scalarTypeToTypeMeta(dtype);
TORCH_CHECK(!self.is_conj(),
"torch.Tensor.view is not supported for conjugate view tensors when converting to a different dtype.");
TORCH_CHECK(!self.is_neg(),
"torch.Tensor.view is not supported for tensors with negative bit set when converting to a different dtype.");
int64_t self_element_size = self.element_size();
int64_t new_element_size = static_cast<int64_t>(type_meta.itemsize());
Storage storage = self.storage();
auto new_tensor = detail::make_tensor<TensorImpl>(
std::move(storage), self.key_set(), type_meta);
auto* impl = new_tensor.unsafeGetTensorImpl();
if (self_element_size == new_element_size) {
impl->set_storage_offset(self.storage_offset());
impl->set_sizes_and_strides(self.sizes(), self.strides());
} else if (self.dim() == 0) {
TORCH_CHECK(false,
"self.dim() cannot be 0 to view ", self.scalar_type(), " as ",
dtype, " (different element sizes)");
} else if (self_element_size > new_element_size) {
// Downsizing element size
int64_t size_ratio = self_element_size / new_element_size;
auto new_strides = compute_strides_for_view_dtype_downsize(
self.strides(), size_ratio, self.scalar_type(), dtype);
auto old_sizes = self.sizes();
DimVector new_sizes(self.dim());
std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin());
new_sizes[self.dim() - 1] *= size_ratio;
auto new_storage_offset = size_ratio * self.storage_offset();
impl->set_storage_offset(new_storage_offset);
impl->set_sizes_and_strides(new_sizes, new_strides);
} else {
// Upsizing element size
int64_t size_ratio = new_element_size / self_element_size;
TORCH_CHECK(
(self.size(-1) % size_ratio) == 0,
"self.size(-1) must be divisible by ", size_ratio, " to view ",
self.scalar_type(), " as ", dtype, " (different element sizes), ",
"but got ", self.size(-1));
TORCH_CHECK(
(self.storage_offset() % size_ratio) == 0,
"self.storage_offset() must be divisible by ", size_ratio, " to view ",
self.scalar_type(), " as ", dtype, " (different element sizes), but got ",
self.storage_offset());
auto new_strides = compute_strides_for_view_dtype_upsize(
self.strides(), size_ratio, self.scalar_type(), dtype);
auto old_sizes = self.sizes();
DimVector new_sizes(self.dim());
std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin());
new_sizes[self.dim() - 1] /= size_ratio;
auto new_storage_offset = self.storage_offset() / size_ratio;
impl->set_storage_offset(new_storage_offset);
impl->set_sizes_and_strides(new_sizes, new_strides);
}
return new_tensor;
}
Tensor dense_to_sparse_csr(const Tensor& self) {
return self.to_sparse().to_sparse_csr();
}
Tensor csr_to_sparse_csr(const Tensor& self) {
return self;
}
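// COO -> CSR: coalesce the input, compress the sorted row indices into
// crow_indices, and reuse the column indices and values directly.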
Tensor coo_to_sparse_csr(const Tensor& self) {
TORCH_CHECK(
self.dim() == 2,
"Only 2D tensors can be converted to the CSR format but got shape: ",
self.sizes());
auto coalesced_self = self.coalesce();
auto row_indices = coalesced_self.indices()[0];
bool out_int32 = (row_indices.scalar_type() == at::kInt);
auto crow_indices = at::_convert_indices_from_coo_to_csr(
row_indices, self.size(0), out_int32);
return at::native::_sparse_csr_tensor_unsafe(
crow_indices,
coalesced_self.indices()[1].contiguous(),
coalesced_self.values(),
coalesced_self.sizes(),
coalesced_self.scalar_type(),
c10::kSparseCsr,
coalesced_self.device());
}
}} // namespace at::native