#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/CPUFallback.h>

#include <ATen/core/ivalue.h>
#include <ATen/core/stack.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <c10/util/irange.h>

#include <sstream>
#include <vector>


#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/_copy_from_and_resize.h>
#include <ATen/ops/_to_cpu.h>
#endif


namespace at::native {

// Convenience helper for converting tensors (or optional tensors) to CPU.

template<typename T, std::enable_if_t<std::is_same_v<T, at::Tensor> || std::is_same_v<T, std::optional<at::Tensor>>, int> = 1>
static std::vector<T> to_cpu(const std::vector<T>& tensors) {
  // We can't just call at::_to_cpu() on the entire list of Tensors, because it
  // would break on undefined tensors. Separate out undefined tensors first.
  const auto num = tensors.size();
  std::vector<T> cpu_tensors(num);
  std::vector<at::Tensor> valid_tensors;
  std::vector<bool> to_translate(num);
  for (const auto i : c10::irange(num)) {
    // Explicitly handle undefined tensors here instead of letting `at::_to_cpu` handle them.
    // Otherwise, we'd need to require all backends with their own implementation of _to_cpu
    // to properly handle undefined tensors.
    if constexpr (std::is_same_v<T, std::optional<at::Tensor>>) {
      if (tensors[i].has_value() && tensors[i].value().defined()) {
        to_translate[i] = true;
        valid_tensors.push_back(tensors[i].value());
      } else {
        cpu_tensors[i] = tensors[i];
      }
    } else {
      if (tensors[i].defined()) {
        to_translate[i] = true;
        valid_tensors.push_back(tensors[i]);
      } else {
        cpu_tensors[i] = tensors[i];
      }
    }
  }
  auto cpu_valid_tensors = at::_to_cpu(valid_tensors);
  size_t defined_pos = 0;
  for (const auto i : c10::irange(num)) {
    if (to_translate[i]) {
      cpu_tensors[i] = std::move(cpu_valid_tensors[defined_pos++]);
    }
  }
  return cpu_tensors;
}
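// Illustrative example (hypothetical tensors): given {xla_tensor, at::Tensor(), another_xla_tensor},
// to_cpu returns {cpu_copy, at::Tensor(), another_cpu_copy}; undefined entries are
// passed through untouched rather than being handed to at::_to_cpu.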

static std::optional<c10::Device> compute_target_device(
    std::vector<at::Tensor>& t_args,
    const std::vector<c10::List<at::Tensor>>& tlist_args) {
  // Decide what device to move the output tensor(s) to.
  // The current convention is that we use the first tensor arg to pick the device.
  // Barring that, we take the first tensor from a TensorList arg.
  if (!t_args.empty()) {
    return t_args[0].device();
  }
  // We need to loop through all of the (potentially multiple) TensorList arguments,
  // in case e.g. the first one is empty but the second is not.
  for (const auto& tens_list : tlist_args) {
    if (tens_list.size() > 0) {
      return tens_list.get(0).device();
    }
  }
  return std::nullopt;
}
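// For example, an op like at::cat only takes a TensorList (no plain Tensor
// arguments), so the device of the first tensor in that list is what the CPU
// results get copied back to.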

// Returns true if the TensorList contains at least one defined tensor.
static bool validate_tensor_list(const c10::List<at::Tensor>& tensorlist) {
  for (const auto i : c10::irange(tensorlist.size())) {
    if (tensorlist[i].defined()) {
      return true;
    }
  }
  return false;
}

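// cpu_fallback is a boxed kernel: it copies the op's inputs to CPU, redispatches
// to the CPU implementation, and copies results (and input mutations) back to
// the original device.
//
// A sketch of how an out-of-tree backend might wire this up. The names below are
// illustrative; the pattern is the standard boxed-fallback registration:
//
//   static void my_backend_cpu_fallback(
//       const c10::OperatorHandle& op, torch::jit::Stack* stack) {
//     at::native::cpu_fallback(op, stack);
//   }
//
//   TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
//     m.fallback(
//         torch::CppFunction::makeFromBoxedFunction<&my_backend_cpu_fallback>());
//   }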
void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool error_on_views) {
  auto& schema_args = op.schema().arguments();
  const auto num_arguments = schema_args.size();
  auto arguments = torch::jit::last(stack, num_arguments);
  const auto arguments_begin = stack->size() - num_arguments;

  std::vector<at::Tensor> tensor_args;
  std::vector<int> tensor_args_indices;

  std::vector<c10::List<at::Tensor>> tensorlist_args;
  std::vector<int> tensorlist_args_indices;

  std::vector<c10::List<std::optional<at::Tensor>>> optional_tensorlist_args;
  std::vector<int> optional_tensorlist_args_indices;

  std::optional<c10::Device> tgt_device = std::nullopt;
  // Save the converted CPU ivalues for TensorList and optional TensorList
  // arguments, so that input mutations can be copied back to the originals later.
  std::vector<c10::IValue> tensorlist_cpu_args;
  std::vector<c10::IValue> optional_tensorlist_cpu_args;

  // Step 1: Convert all non-CPU tensor inputs into CPU tensors
  // and put them on the stack at the correct indices.
  for (const auto idx : c10::irange(arguments.size())) {
    const auto& ivalue = arguments[idx];
    if (ivalue.isTensor()) {
      tensor_args.push_back(ivalue.toTensor());
      tensor_args_indices.push_back(idx);
    } else if (ivalue.isTensorList()) {
      // Note: we copy each TensorList argument to CPU individually out of convenience,
      // but XLA would benefit from materializing all tensor and TensorList args onto the CPU at the same time.
      // We can improve this if we need better perf for XLA's CPU fallbacks.
      tensorlist_args.push_back(ivalue.toTensorList());
      tensorlist_args_indices.push_back(idx);
      auto cpu_ivalue = c10::IValue(c10::List<at::Tensor>(to_cpu(ivalue.toTensorVector())));
      tensorlist_cpu_args.push_back(cpu_ivalue);
      (*stack)[arguments_begin + idx] = std::move(cpu_ivalue);
    } else if (ivalue.isOptionalTensorList()) {
      optional_tensorlist_args.push_back(ivalue.toOptionalTensorList());
      optional_tensorlist_args_indices.push_back(idx);
      auto cpu_ivalue = c10::IValue(c10::List<std::optional<at::Tensor>>(to_cpu(ivalue.toOptionalTensorVector())));
      optional_tensorlist_cpu_args.push_back(cpu_ivalue);
      (*stack)[arguments_begin + idx] = std::move(cpu_ivalue);
    } else if (ivalue.isDevice()) {
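      // Remember the explicit Device argument as the copy-back target, and give
      // the CPU kernel a CPU device in its place.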
      tgt_device = ivalue.toDevice();
      (*stack)[arguments_begin + idx] = c10::IValue(c10::Device(kCPU));
    }
  }
  // XLA requires all of the tensor arguments to be gathered up and converted to CPU together.
  auto cpu_tensors = to_cpu(tensor_args);

  for (const auto i : c10::irange(tensor_args_indices.size())) {
    auto idx = tensor_args_indices[i];
    (*stack)[arguments_begin + idx] = c10::IValue(cpu_tensors[i]);
  }

  // Step 2: Call the underlying CPU implementation of the operator.
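  // Redispatch with a key set containing only the CPU key, so the dispatcher
  // selects the CPU kernel for this op; the arguments on the stack are all CPU
  // tensors at this point.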
  op.redispatchBoxed(c10::DispatchKeySet(c10::DispatchKey::CPU), stack);

  // Step 3: We need to take special care to handle mutable aliases properly:
  // if any input tensors are mutable aliases, we need to directly copy the
  // updated data from the CPU tensors back to the original inputs.
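  // For example, an in-place schema like
  //   add_(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
  // marks `self` as a mutable alias, so the mutation applied to the CPU copy of
  // `self` has to be written back to the original, non-CPU `self`.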
  for (const auto i : c10::irange(tensor_args_indices.size())) {
    auto tensor_idx = tensor_args_indices[i];
    const AliasInfo* alias_info = schema_args[tensor_idx].alias_info();
    if (alias_info != nullptr && alias_info->isWrite()) {
      if (!tensor_args[i].defined()) continue;
      at::_copy_from_and_resize(cpu_tensors[i], tensor_args[i]);
    }
  }

  // We also need to explicitly reapply input mutations to inputs that are
  // lists of tensors.
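  // A typical example is the `self` argument of the in-place _foreach_ ops,
  // which is declared as a mutable Tensor(a!)[] list in their schemas.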
  for (const auto i : c10::irange(tensorlist_args_indices.size())) {
    auto tensorlist_idx = tensorlist_args_indices[i];
    const AliasInfo* alias_info = schema_args[tensorlist_idx].alias_info();
    if (alias_info != nullptr && alias_info->isWrite()) {
      const auto& cpu_tensorlist = tensorlist_cpu_args[i].toTensorVector();
      for (const auto idx : c10::irange(tensorlist_args[i].size())) {
        if (!cpu_tensorlist[idx].defined()) continue;
        at::_copy_from_and_resize(cpu_tensorlist[idx], tensorlist_args[i][idx]);
      }
    }
  }

  // We also need to explicitly reapply input mutations to inputs that are
  // lists of optional tensors.
  for (const auto i : c10::irange(optional_tensorlist_args_indices.size())) {
    auto tensorlist_idx = optional_tensorlist_args_indices[i];
    const AliasInfo* alias_info = schema_args[tensorlist_idx].alias_info();
    if (alias_info != nullptr && alias_info->isWrite()) {
      const auto& cpu_optional_tensorlist = optional_tensorlist_cpu_args[i].toOptionalTensorList();
      for (const auto idx : c10::irange(optional_tensorlist_args[i].size())) {
        if (cpu_optional_tensorlist[idx].has_value() && cpu_optional_tensorlist[idx].value().defined()) {
          const std::optional<at::Tensor>& optional_tensor = optional_tensorlist_args[i][idx];
          at::_copy_from_and_resize(cpu_optional_tensorlist[idx].value(), optional_tensor.value());
        }
      }
    }
  }

  // Step 4: Convert any CPU output tensors back to the original input device.
  // For mutable alias'd outputs, we also need to take special care
  // to move the ORIGINAL input tensor back onto the stack, in place of
  // the temporary CPU output tensor that we created.
  //
  // Note [CPU Fallback Does Not Handle View Operators]
  // Also note that we are incapable of handling immutable aliases properly.
  // Why?
  // Schemas with immutable alias'd tensor outputs correspond to view operators.
  // For example, the `view_as` schema from native_functions.yaml:
  // `view_as(Tensor(a) self, Tensor other) -> Tensor(a)`
  // We can't handle these ops properly, because view ops are supposed to return
  // a NEW tensor that shares the SAME storage as the original tensor.
  // However, the new tensor that we created cannot share the same storage,
  // since it lives on CPU and the original tensor lives on a different device.
  // Because of that, we warn (or raise an error, if error_on_views is set) when
  // someone attempts to call the CPU fallback on a view operator. The warning
  // preserves backward compatibility for XLA view ops that fall back to CPU.
  const auto& schema_returns = op.schema().returns();
  const auto num_returns = schema_returns.size();
  auto returns = torch::jit::last(stack, num_returns);
  const auto returns_begin = stack->size() - num_returns;

  if (!tgt_device.has_value()) {
    tgt_device = compute_target_device(tensor_args, tensorlist_args);
  }

  for (const auto idx : c10::irange(returns.size())) {
    const AliasInfo* alias_info = schema_returns[idx].alias_info();
    if (alias_info != nullptr && alias_info->isWrite()) {
      // Case (1): mutable alias case.
      // Move the input ivalue directly onto the stack in place of
      // the existing cpu output tensor.
      bool found_alias = false;
      if (returns[idx].isTensor() && returns[idx].toTensor().defined()) {
        // We could store some extra metadata on the function schema to avoid
        // the loop here if we need to improve perf.
        for (const auto i : c10::irange(tensor_args_indices.size())) {
          auto input_tensor_idx = tensor_args_indices[i];
          const auto& input_tensor = cpu_tensors[i];
          const AliasInfo* input_alias_info =
              schema_args[input_tensor_idx].alias_info();
          // alias_info was checked for null above; this assert guards the
          // condition below in case the check above ever changes.
          TORCH_INTERNAL_ASSERT_DEBUG_ONLY(alias_info != nullptr);
          if (input_tensor.defined() &&
              (alias_info == input_alias_info ||
               (input_alias_info != nullptr &&
                *alias_info == *input_alias_info))) {
            // We've found the original input tensor that aliases with the
            // current output. Wrap it in an IValue and put it directly on the
            // stack.
            (*stack)[returns_begin + idx] = c10::IValue(tensor_args[i]);
            found_alias = true;
            break;
          }
        }
      } else if (
          returns[idx].isTensorList() &&
          validate_tensor_list(returns[idx].toTensorList())) {
        for (const auto i : c10::irange(tensorlist_args_indices.size())) {
          auto input_tensor_idx = tensorlist_args_indices[i];
          const AliasInfo* input_alias_info =
              schema_args[input_tensor_idx].alias_info();
          // alias_info was checked for null above; this assert guards the
          // condition below in case the check above ever changes.
          TORCH_INTERNAL_ASSERT_DEBUG_ONLY(alias_info != nullptr);
          if (validate_tensor_list(tensorlist_args[i]) &&
              (alias_info == input_alias_info ||
               (input_alias_info != nullptr &&
                *alias_info == *input_alias_info))) {
            // We've found the original input tensor list that aliases with the
            // current output. Wrap it in an IValue and put it directly on the
            // stack.
            (*stack)[returns_begin + idx] = c10::IValue(tensorlist_args[i]);
            found_alias = true;
            break;
          }
        }
      }
      TORCH_CHECK(
          found_alias,
          "The operator ",
          op.schema().operator_name(),
          " appears to have invalid alias information. ",
          "Found a return tensor argument with a mismatched mutable alias: ",
          schema_returns[idx]);
    } else {
      if (alias_info != nullptr && !alias_info->isWrite()) {
        // Case (3): immutable alias (view) case.
        // Warn here, since we're copying and not creating a view.
        // If this operator is needed, the backend should provide a kernel for
        // it. See Note [CPU Fallback Does Not Handle View Operators]
        std::stringstream dev_str;
        if (tgt_device) {
          dev_str << *tgt_device;
        } else {
          dev_str << "<none>";
        }
        if (error_on_views) {
          TORCH_CHECK(
              false,
              "The operator ",
              op.schema().operator_name(),
              " appears to be a view operator, ",
              "but it has no implementation for the backend \"",
              dev_str.str(),
              "\". View operators don't support falling back to run on the CPU, ",
              "since the tensor's storage cannot be shared across devices.");
        } else {
          TORCH_WARN(
              "The operator ",
              op.schema().operator_name(),
              " appears to be a view operator, ",
              "but it has no implementation for the backend \"",
              dev_str.str(),
              "\". View operators don't support falling back to run on the CPU, ",
              "since the tensor's storage cannot be shared across devices.");
        }
      }
      // Case (2): copy case.
      // Copy the cpu output tensor to the original device.

      // We technically might not have a target device, e.g. if you call
      // torch.cat() with an empty list. In that case, there shouldn't be any
      // tensors to schlep across devices anyway.
      if (tgt_device) {
        if (returns[idx].isTensor() && returns[idx].toTensor().defined()) {
          (*stack)[returns_begin + idx] =
              c10::IValue(returns[idx].toTensor().to(*tgt_device));
        } else if (
            returns[idx].isTensorList() &&
            validate_tensor_list(returns[idx].toTensorList())) {
          const auto& cpu_tensors = returns[idx].toTensorList().vec();
          std::vector<at::Tensor> tensors;
          tensors.reserve(cpu_tensors.size());

          for (const auto& tensor : cpu_tensors) {
            tensors.push_back(tensor.to(*tgt_device));
          }
          (*stack)[returns_begin + idx] =
              c10::IValue(c10::List<at::Tensor>(tensors));
        }
      }
    }
  }
}

} // namespace at::native