| #include <ATen/native/TensorIterator.h> |
| |
| #include <array> |
| #include <ATen/ExpandUtils.h> |
| #include <ATen/Parallel.h> |
| #include <ATen/native/TypeProperties.h> |
| #include <ATen/MemoryOverlap.h> |
| #include <ATen/native/Resize.h> |
| #include <ATen/TensorOperators.h> |
| |
| #include <c10/util/irange.h> |
| |
| namespace at { |
| |
| using DimMask = TensorIteratorBase::DimMask; |
| using PtrVector = TensorIteratorBase::PtrVector; |
| using loop2d_t = TensorIteratorBase::loop2d_t; |
| using StrideVector = TensorIteratorBase::StrideVector; |
| |
| /// Construction |
| TensorIteratorConfig& TensorIteratorConfig::add_owned_output(const Tensor& output) { |
| TORCH_INTERNAL_ASSERT( |
| num_inputs_ == 0, |
| "Keep in mind that you have to add all outputs first before adding any input. " |
| "For more details, see https://github.com/pytorch/pytorch/wiki/How-to-use-TensorIterator."); |
| tensors_.push_back(c10::MaybeOwned<Tensor>::owned(c10::in_place, output)); |
| num_outputs_++; |
| return *this; |
| } |
| |
| TensorIteratorConfig& TensorIteratorConfig::add_owned_input(const Tensor& input) { |
| tensors_.push_back(c10::MaybeOwned<Tensor>::owned(c10::in_place, input)); |
| num_inputs_++; |
| return *this; |
| } |
| |
| TensorIteratorConfig& TensorIteratorConfig::add_borrowed_output(const Tensor& output) { |
| TORCH_INTERNAL_ASSERT( |
| num_inputs_ == 0, |
| "Keep in mind that you have to add all outputs first before adding any input. " |
| "For more details, see https://github.com/pytorch/pytorch/wiki/How-to-use-TensorIterator."); |
| tensors_.push_back(c10::MaybeOwned<Tensor>::borrowed(output)); |
| num_outputs_++; |
| return *this; |
| } |
| |
| TensorIteratorConfig& TensorIteratorConfig::add_borrowed_input(const Tensor& input) { |
| tensors_.push_back(c10::MaybeOwned<Tensor>::borrowed(input)); |
| num_inputs_++; |
| return *this; |
| } |
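| |
| // Example (a hedged sketch, not code used in this file) of the add_*_output / |
| // add_*_input calls above: a typical caller registers every output before any |
| // input, then builds the iterator. `out`, `a`, and `b` are placeholder tensors. |
| // |
| //   auto iter = at::TensorIteratorConfig() |
| //       .add_owned_output(out)   // all outputs first... |
| //       .add_owned_input(a)      // ...then all inputs |
| //       .add_owned_input(b) |
| //       .build(); |
| // |
| // The add_borrowed_* variants behave the same way but only keep a reference, |
| // so the caller must keep the tensors alive for the iterator's lifetime. |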
| |
| TensorIteratorConfig& TensorIteratorConfig::declare_static_dtype_and_device(ScalarType dtype, Device device) { |
| TORCH_CHECK(!check_all_same_dtype_, "check_all_same_dtype(false) must be called before declare_static_dtype(...)"); |
| static_dtype_and_device_ = c10::make_optional(std::make_pair(dtype, device)); |
| return *this; |
| } |
| |
| TensorIteratorConfig& TensorIteratorConfig::declare_static_shape(IntArrayRef shape) { |
| // WARNING: |
| // This will bypass all shape checking in the TensorIterator. Kernels which call this method |
| // are expected to check shapes before calling `add_owned_input` or `add_owned_output`. |
| TORCH_CHECK(!resize_outputs_, "resize_outputs() must be called before declare_static_shape(...)") |
| static_shape_ = c10::make_optional(DimVector(shape)); |
| return *this; |
| } |
| |
| TensorIteratorConfig& TensorIteratorConfig::declare_static_shape(IntArrayRef shape, IntArrayRef squash_dims) { |
| declare_static_shape(shape); |
| if (!static_shape_->size()) return *this; |
| for (const auto& squash_dim : squash_dims) { |
| TORCH_CHECK(squash_dim >= 0 && squash_dim < static_cast<int64_t>(static_shape_->size()), |
| "squash_dim ", squash_dim, " must be in [0, ", static_shape_->size(), ")."); |
| (*static_shape_)[squash_dim] = 1; |
| } |
| return *this; |
| } |
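| |
| // Example (a hedged sketch) of declare_static_shape above: a kernel that has |
| // already validated its shapes can skip shape inference entirely and flatten |
| // dim 1 to size 1. The tensor names are placeholders. |
| // |
| //   auto iter = at::TensorIteratorConfig() |
| //       .resize_outputs(false)  // required before declare_static_shape |
| //       .declare_static_shape(out.sizes(), /*squash_dims=*/{1}) |
| //       .add_owned_output(out) |
| //       .add_owned_input(self) |
| //       .build(); |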
| |
| // NOTE: [Computing output strides] |
| // We use the following algorithm to compute output strides |
| // If a correctly sized output is provided, we respect its strides and don't change them |
| // Otherwise, if provided output is of incorrect size or no output is provided, |
| // we try to recover permutation that was applied to the inputs |
| // by sorting the strides of the inputs. Precedence is given to the inputs in the order they were added, |
| // and to permutations involving non-broadcasted dimensions |
| // 1. we loop over inputs starting from the first |
| // 2. for all inputs, strides of broadcasted dimensions are set to 0, and 0 compares equal to anything. If one |
| // of the dimensions being compared has a stride of 0, we move on to the next tensor to determine if |
| // these dimensions need to be swapped. |
| // 3. strides of dimensions equal to 1 participate in sorting |
| // 4. if 2 strides are equal and neither is 0, we try to break the tie by looking at the corresponding dimensions |
| // of the tensor. Dimensions were permuted if, when iterating from the end, dimensions corresponding to the |
| // same strides are increasing. If dimensions are non-increasing, we move on to the next input to break the tie. |
| // |
| // Instead of applying rule 4 for tie breaking, we could move on to the next tensor directly. This would result in possibly |
| // losing the correct permutation of the first tensor if there are permuted trivial dimensions, but could potentially |
| // improve traversal order of the second tensor. We chose the former option to better propagate channels last layout |
| // for example, for a tensor with sizes N1H1. |
| // These rules result in the intuitive behavior that in most cases recovers permutation of either the first argument (if all |
| // arguments are of the same size) or the argument that is not broadcasted, regardless of its position. |
| // As a bonus, it also results in a reasonably well-behaved traversal order of the inputs and outputs - in the kernels |
| // output is traversed linearly, and since it closely follows input layouts, inputs are traversed linearly as well |
| // |
| // Examples: |
| // full size tensor + broadcasted tensor with 0 or 1 non-trivial dimensions => strides of output are same |
| // as strides of full size input regardless of the order |
| // 2 tensors of same size but different strides => output strides are the same as first argument |
| // |
| // We also have fast path for memory-dense inputs with the same strides (or, trivially, single memory-dense input) |
| // that outputs a tensor with the same strides as inputs. The only difference in result with the algorithm described |
| // above is for strides for trivial (1) dimensions, where in ambiguous cases for performance reasons we default to |
| // contiguous strides. |
| // Example: tensor with sizes NC11 and strides C1CC will produce output with strides C111 (note differences are only |
| // in the strides of trivial dimensions, so physical layout is unaffected but permutation information is lost) |
| // We might change this behavior in future once performance considerations are resolved |
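| // |
| // Concrete illustration of the rules above: adding a channels-last tensor of |
| // sizes [N, C, H, W] (strides [H*W*C, 1, W*C, C]) to a bias of sizes [1, C, 1, 1] |
| // recovers the channels-last permutation from the first input; the bias has |
| // stride 0 in every broadcasted dimension and so never breaks a tie, and the |
| // freshly allocated output comes out channels-last as well. |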
| |
| void TensorIteratorBase::reorder_dimensions() { |
| // Sort the dimensions based on strides in ascending order with reduced dims |
| // at the front. NOTE: this inverts the order of C-contiguous tensors. |
| // strides[0] is the fastest moving dimension instead of strides[ndim - 1]. |
| // See NOTE: [Computing output strides] and inline comments for more detailed description |
| |
| perm_.resize(ndim()); |
| if (ndim() == 1) { |
| perm_[0] = 0; |
| return; |
| } |
| |
| // initialize perm with n-1, n-2, ..., 1, 0 |
| std::iota(perm_.rbegin(), perm_.rend(), 0); |
| |
| // Reordering dimensions changes iteration order |
| if (enforce_linear_iteration_) { |
| permute_dimensions(perm_); |
| return; |
| } |
| |
| // returns 1 if dim0 should come after dim1, -1 if dim0 should come |
| // before dim1, and 0 if the comparison is ambiguous. |
| auto should_swap = [&](size_t dim0, size_t dim1) { |
| for (int arg = 0; arg < ntensors(); arg++) { |
| // ignore undefined or incorrectly sized tensors |
| if (operands_[arg].stride_bytes.empty() || operands_[arg].will_resize) { |
| continue; |
| } |
| int64_t stride0 = operands_[arg].stride_bytes[dim0]; |
| int64_t stride1 = operands_[arg].stride_bytes[dim1]; |
| if (is_reduction_ && operands_[arg].is_output) { |
| // move reduced dimensions to the front |
| // strides of reduced dimensions are always set to 0 by review_reduce_result |
| if ((stride0 == 0) != (stride1 == 0)) { |
| return stride1 == 0 ? 1 : -1; |
| } |
| } |
| //move on to the next input if one of the dimensions is broadcasted |
| if (stride0 == 0 || stride1 == 0) { |
| continue; |
| // it is important to return here only with strict comparisons, for equal strides we try to break the tie later |
| // by comparing corresponding dimensions or if that does not work, moving on to the next tensor |
| } else if (stride0 < stride1) { |
| return -1; |
| } else if (stride0 > stride1) { |
| return 1; |
| } else { //equal strides, use dimensions themselves as the tie-breaker. |
| //at this point, with zero strides out of the way, we are guaranteed that operand dimensions are equal to shape_ |
| auto t_dim0 = shape_[dim0]; |
| auto t_dim1 = shape_[dim1]; |
| //return only if dimensions should be swapped, otherwise move on to the next tensor |
| if (t_dim0 > t_dim1) { |
| return 1; |
| } |
| } |
| } |
| return 0; |
| }; |
| |
| // insertion sort with support for ambiguous comparisons |
| for (int i = 1; i < ndim(); i++) { |
| int dim1 = i; |
| for (int dim0 = i - 1; dim0 >= 0; dim0--) { |
| int comparison = should_swap(perm_[dim0], perm_[dim1]); |
| if (comparison > 0) { |
| std::swap(perm_[dim0], perm_[dim1]); |
| dim1 = dim0; |
| } else if (comparison < 0) { |
| break; |
| } |
| } |
| } |
| |
| // perform re-ordering of shape and strides |
| permute_dimensions(perm_); |
| } |
| |
| // Computes a common dtype using type promotion |
| // See the [Common Dtype Computation] note |
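| // For example, an int64 tensor combined with a float32 tensor promotes to |
| // float32; a 0-dim double tensor combined with float32 tensors also yields |
| // float32, because operands with dimensions take precedence over 0-dim |
| // (scalar) operands in the promotion. |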
| ScalarType TensorIteratorBase::compute_common_dtype() { |
| at::native::ResultTypeState state = {}; |
| for (const auto& op : operands_) { |
| if (op.is_output) { |
| continue; |
| } |
| |
| state = at::native::update_result_type_state(*op.tensor, state); |
| } |
| |
| common_dtype_ = at::native::result_type(state); |
| TORCH_INTERNAL_ASSERT(common_dtype_ != ScalarType::Undefined); |
| |
| return common_dtype_; |
| } |
| |
| TensorOptions original_options(const OperandInfo& op) { |
| if (op.original_tensor->defined()) { |
| return op.original_tensor->options(); |
| } else { |
| return op.options(); |
| } |
| } |
| |
| // Implements the behavior of the following flags: |
| // - check_all_same_dtype_ |
| // - check_all_same_device_ |
| // - enforce_safe_casting_to_output_ |
| // - promote_inputs_to_common_dtype_ |
| // - cast_common_dtype_to_outputs_ |
| // |
| // See their descriptions in TensorIterator.h for details. |
| // NOTE: Checks for more specific behaviors (e.g. the first and second |
| // inputs must share a dtype, but the third must have the long dtype) |
| // should be implemented directly and outside of TensorIterator. |
| void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) { |
| // Reviews operands (1/2) |
| // - validates that all input tensors are defined |
| // - computes common device |
| // - determines if there are undefined outputs |
| // - determines if there are different dtypes and attempts |
| // to quickly acquire a common dtype |
| Device common_device = kCPU; |
| common_dtype_ = ScalarType::Undefined; |
| // NB: despite output_dtype's generic sounding name, it only is |
| // used in a nontrivial way if check_all_same_dtype is true |
| ScalarType output_dtype = ScalarType::Undefined; |
| bool has_different_input_dtypes = false; |
| bool has_different_output_dtypes = false; |
| bool has_undefined_outputs = false; |
| |
| for (auto& op : operands_) { |
| // Validates that all inputs have type information, and that |
| // if an output is missing type information that we can infer |
| // the device it should be allocated on. |
| if (!op.is_type_defined()) { |
| TORCH_INTERNAL_ASSERT(op.is_output, "Found type undefined input tensor!"); |
| if (config.static_dtype_and_device_.has_value()) { |
| op.target_dtype = config.static_dtype_and_device_->first; |
| op.device = config.static_dtype_and_device_->second; |
| } else { |
| TORCH_INTERNAL_ASSERT(config.check_all_same_device_); |
| has_undefined_outputs = true; |
| continue; |
| } |
| } |
| |
| // Validates input tensors are defined |
| if (!op.tensor->defined()) { |
| TORCH_INTERNAL_ASSERT(op.is_output, "Found undefined input tensor!"); |
| continue; |
| } |
| |
| TORCH_INTERNAL_ASSERT(op.target_dtype == op.current_dtype); |
| |
| // Acquires the first non-CPU device (if any) as the common device |
| if (common_device == kCPU && !op.tensor->is_cpu()) { |
| common_device = op.tensor->device(); |
| } |
| |
| if (!op.is_output) { |
| // Determines if there are varying input dtypes |
| // NOTE: the common dtype is set to the first defined input dtype observed |
| if (op.target_dtype != common_dtype_) { |
| if (common_dtype_ == ScalarType::Undefined) { |
| common_dtype_ = op.target_dtype; |
| } else { |
| has_different_input_dtypes = true; |
| } |
| } |
| } else { // op.is_output |
| // Determines if there are varying output dtypes |
| // NOTE: the output dtype is set to the first defined output dtype observed |
| if (op.target_dtype != output_dtype) { |
| if (output_dtype == ScalarType::Undefined) { |
| output_dtype = op.target_dtype; |
| } else { |
| has_different_output_dtypes = true; |
| } |
| } |
| } |
| } |
| |
| // Checks that either the computation type is computable or unneeded |
| TORCH_INTERNAL_ASSERT(!(has_different_input_dtypes && !config.promote_inputs_to_common_dtype_ && |
| (has_undefined_outputs || config.enforce_safe_casting_to_output_ || |
| config.cast_common_dtype_to_outputs_))); |
| |
| // Checks that all inputs and defined outputs are the same dtype, if requested |
| if (config.check_all_same_dtype_ && |
| (has_different_input_dtypes || has_different_output_dtypes || |
| (common_dtype_ != output_dtype && output_dtype != ScalarType::Undefined))) { |
| // Throws an informative error message |
| for (auto& op : operands_) { |
| if (!op.tensor->defined()) { |
| continue; |
| } |
| |
| TORCH_CHECK(op.target_dtype == common_dtype_, |
| "Found dtype ", op.target_dtype, " but expected ", common_dtype_); |
| } |
| } |
| |
| // Short-circuits if no additional work required |
| if (!has_undefined_outputs && !config.check_all_same_device_ && |
| !config.promote_inputs_to_common_dtype_ && !config.cast_common_dtype_to_outputs_ && |
| !config.enforce_safe_casting_to_output_) { |
| // Invalidates common_dtype_ if it could not be inferred |
| common_dtype_ = has_different_input_dtypes ? ScalarType::Undefined : common_dtype_; |
| return; |
| } |
| |
| // Computes a common dtype, if needed |
| if (has_different_input_dtypes && config.promote_inputs_to_common_dtype_) { |
| common_dtype_ = compute_common_dtype(); |
| } |
| |
| // Promotes common dtype to the default float scalar type, if needed |
| if (config.promote_integer_inputs_to_float_ && |
| c10::isIntegralType(common_dtype_, /*includeBool=*/true)) { |
| common_dtype_ = c10::typeMetaToScalarType(c10::get_default_dtype()); |
| } |
| |
| // Reviews operands (2/2) |
| // - sets metadata for undefined outputs |
| // - checks that all tensors are on the same device, if requested |
| // - checks that the common dtype can safely cast to each output, if requested |
| // - creates temporaries for CPU operations, if needed and requested |
| int max_cpu_scalars_on_non_cpu = config.allow_cpu_scalars_ ? 1 : 0; |
| int current_cpu_scalars_on_non_cpu = 0; |
| for (auto& op : operands_) { |
| if (!op.is_type_defined()) { |
| op.target_dtype = common_dtype_; |
| op.device = common_device; |
| continue; |
| } |
| |
| // Skips undefined tensors |
| if (!op.tensor->defined()) { |
| continue; |
| } |
| |
| // Checks all tensors are on the same device, if requested |
| if (config.check_all_same_device_) { |
| // Handles CPU scalars on CUDA kernels that support them |
| if (!common_device.is_cpu() && |
| config.allow_cpu_scalars_ && !op.is_output && op.tensor->dim() == 0 && |
| op.tensor->is_cpu()) { |
| TORCH_CHECK(current_cpu_scalars_on_non_cpu < max_cpu_scalars_on_non_cpu, |
| "Trying to pass too many CPU scalars to non-CPU kernel!"); |
| ++current_cpu_scalars_on_non_cpu; |
| } else if (op.device != common_device) { |
| TORCH_CHECK(false, |
| "Expected all tensors to be on the same device, but " |
| "found at least two devices, ", common_device, " and ", op.device, "!"); |
| } |
| } |
| |
| // Checks safe casting, if requested |
| if (config.enforce_safe_casting_to_output_ && op.is_output && op.current_dtype != common_dtype_) { |
| TORCH_CHECK(canCast(common_dtype_, op.current_dtype), |
| "result type ", common_dtype_, " can't be cast to the " |
| "desired output type ", op.current_dtype); |
| } |
| |
| // Creates temporaries for CPU operations, if needed and requested |
| // TODO: reuse temporaries when possible (e.g. for inplace operations) |
| if (common_device == kCPU) { |
| // Casts to outputs by creating temporaries of the correct dtype (if needed) |
| // NB: we skip this on is_meta_, because the temporary allocation here is |
| // unnecessary if we aren't going to actually do the compute |
| if (config.cast_common_dtype_to_outputs_ && op.is_output && op.current_dtype != common_dtype_ && !is_meta_) { |
| TORCH_INTERNAL_ASSERT(op.tensor->defined()); |
| // Marker [Output original_tensor is set] |
| op.original_tensor = op.tensor; |
| // NB: do NOT use set_output here, as the temporary is NOT a true output; |
| // op.tensor is the true output and it was pre-provided for us. |
| // TODO: The logic for cast_outputs will need to be handled by the |
| // structured kernels implementation. What probably should happen |
| // is that we pass in the inferred dtype into the out kernel, and |
| // then after calling the out kernel, do the conversion (which |
| // is cast_outputs here), but integrating this with existing |
| // TensorIterator will take a little doing |
| op.tensor = c10::MaybeOwned<Tensor>::owned( |
| at::empty_like(*op.tensor, |
| op.tensor->options().dtype(common_dtype_), |
| LEGACY_CONTIGUOUS_MEMORY_FORMAT)); |
| if (!names_.empty()) { |
| namedinference::propagate_names(*op.tensor, names_); |
| } |
| op.current_dtype = common_dtype_; |
| op.target_dtype = common_dtype_; |
| } |
| |
| // Promotes inputs by creating temporaries of the correct dtype |
| if (config.promote_inputs_to_common_dtype_ && !op.is_output && op.current_dtype != common_dtype_) { |
| op.original_tensor = op.tensor; |
| op.tensor = c10::MaybeOwned<Tensor>::owned(op.tensor->to(common_dtype_)); |
| op.current_dtype = common_dtype_; |
| op.target_dtype = common_dtype_; |
| } |
| } |
| common_device_ = common_device; |
| } |
| } |
| |
| StrideVector TensorIteratorBase::compatible_stride(int element_size) const { |
| auto stride = StrideVector(); |
| int64_t next_stride = element_size; |
| for (int dim = 0; dim < ndim(); dim++) { |
| stride.push_back(next_stride); |
| next_stride *= shape_[dim]; |
| } |
| return stride; |
| } |
| |
| DimVector TensorIteratorBase::invert_perm(IntArrayRef input) const { |
| // Invert the permutation caused by reorder_dimensions. This is not valid |
| // after coalesce_dimensions is called. |
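| // Worked example: with perm_ = [2, 0, 1] (i.e. original dim 2 became dim 0 |
| // after reorder_dimensions), invert_perm({a, b, c}) writes res[2] = a, |
| // res[0] = b, res[1] = c and therefore returns {b, c, a}. |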
| TORCH_INTERNAL_ASSERT(!has_coalesced_dimensions_); |
| TORCH_INTERNAL_ASSERT(input.size()==perm_.size()); |
| auto res = DimVector(input.size()); //no initialization needed, every value in res should be written to. |
| for (int dim = 0; dim < ndim(); dim++) { |
| res[perm_[dim]] = input[dim]; |
| } |
| return res; |
| } |
| |
| void TensorIteratorBase::allocate_or_resize_outputs() { |
| for (int i = 0; i < num_outputs_; i++) { |
| auto& op = operands_[i]; |
| if (!op.tensor->defined() || op.will_resize) { |
| TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); |
| int element_size = elementSize(op.target_dtype); |
| op.stride_bytes = compatible_stride(element_size); |
| // check if permutation is just an inverted order |
| bool inverted = true; |
| for (int i = 0; i < ndim(); i++) { |
| if (perm_[i] != ndim() - i - 1) { |
| inverted = false; |
| break; |
| } |
| } |
| auto tensor_shape = invert_perm(shape_); |
| if (inverted) { |
| // can just return a contiguous output |
| // it is faster because it avoids allocating a 0-size tensor and |
| // then resizing and restriding it |
| set_output(i, tensor_shape, {}, original_options(op), names_); |
| } else { |
| auto tensor_stride = invert_perm(op.stride_bytes); |
| for (int dim = 0; dim < ndim(); dim++) { |
| tensor_stride[dim] /= element_size; |
| } |
| set_output(i, tensor_shape, tensor_stride, original_options(op), names_); |
| } |
| op.current_dtype = op.target_dtype; |
| } else if (op.tensor->defined()) { |
| // Even if we don't resize, we still need to tell set_output about |
| // the output, so that we properly set guard and propagate names |
| set_output(i, op.tensor->sizes(), {}, original_options(op), names_); |
| } |
| } |
| } |
| |
| void TensorIteratorBase::compute_names(const TensorIteratorConfig& config) { |
| bool should_infer_names = std::any_of( |
| operands_.begin(), |
| operands_.end(), |
| [](const OperandInfo& op) { |
| return op.tensor->defined() && op.tensor->has_names(); |
| }); |
| if (!should_infer_names) { |
| return; |
| } |
| |
| for (auto& op : operands_) { |
| if (!op.tensor->defined()) continue; |
| // Don't include output tensors if we are resizing, since we will |
| // clobber their names in any case. (If the output tensor was |
| // also an input tensor, we'll pick it up when it shows up again |
| // in operands). |
| if (config.resize_outputs_ && op.is_output) continue; |
| // perform name inference |
| if (names_.empty()) { |
| names_ = op.tensor->names(); |
| } else { |
| names_ = NameVector(unify_from_right(names_, op.tensor->names())); |
| } |
| } |
| } |
| |
| void TensorIteratorBase::coalesce_dimensions() { |
| if (ndim() <= 1) { |
| return; |
| } |
| |
| // We can coalesce two adjacent dimensions if either dim has size 1 or if: |
| // shape[n] * stride[n] == stride[n + 1]. |
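| // Worked example: a contiguous float tensor of sizes [2, 3, 4] reaches this |
| // point (after reorder_dimensions) with shape_ = [4, 3, 2] and stride_bytes = |
| // [4, 16, 48]. Since 4 * 4 == 16 and then 12 * 4 == 48, all three dimensions |
| // coalesce into shape_ = [24] with stride_bytes = [4], so kernels see a single |
| // 1-d loop. |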
| auto can_coalesce = [&](int dim0, int dim1) { |
| auto shape0 = shape_[dim0]; |
| auto shape1 = shape_[dim1]; |
| if (shape0 == 1 || shape1 == 1) { |
| return true; |
| } |
| for (int i = 0; i < ntensors(); i++) { |
| auto& stride = operands_[i].stride_bytes; |
| if (shape0 * stride[dim0] != stride[dim1]) { |
| return false; |
| } |
| } |
| return true; |
| }; |
| |
| // replace each operand's stride at dim0 with its stride at dim1 |
| auto replace_stride = [&](int dim0, int dim1) { |
| for (int i = 0; i < ntensors(); i++) { |
| auto& stride = operands_[i].stride_bytes; |
| stride[dim0] = stride[dim1]; |
| } |
| }; |
| |
| int prev_dim = 0; |
| for (int dim = 1; dim < ndim(); dim++) { |
| if (can_coalesce(prev_dim, dim)) { |
| if (shape_[prev_dim] == 1) { |
| replace_stride(prev_dim, dim); |
| } |
| shape_[prev_dim] *= shape_[dim]; |
| } else { |
| prev_dim++; |
| if (prev_dim != dim) { |
| replace_stride(prev_dim, dim); |
| shape_[prev_dim] = shape_[dim]; |
| } |
| } |
| } |
| |
| shape_.resize(prev_dim + 1); |
| for (int i = 0; i < ntensors(); i++) { |
| operands_[i].stride_bytes.resize(ndim()); |
| } |
| has_coalesced_dimensions_ = true; |
| } |
| |
| int64_t TensorIteratorBase::numel() const { |
| int64_t numel = 1; |
| for (int64_t size : shape_) { |
| numel *= size; |
| } |
| return numel; |
| } |
| |
| StrideVector TensorIteratorBase::get_dim_strides(int dim) const { |
| auto dims = ndim(); |
| auto inner_strides = StrideVector(); |
| for (auto& op : operands_) { |
| inner_strides.push_back(dims == 0 ? 0 : op.stride_bytes[dim]); |
| } |
| return inner_strides; |
| } |
| |
| SmallVector<char*, 4> TensorIteratorBase::get_data_ptrs(ArrayRef<char*> base, IntArrayRef counter) const { |
| auto ptrs = SmallVector<char*, 4>(base); |
| for (int dim = 0; dim < ndim(); dim++) { |
| int64_t value = counter[dim]; |
| for (int arg = 0; arg < ntensors(); arg++) { |
| ptrs[arg] += value * operands_[arg].stride_bytes[dim]; |
| } |
| } |
| return ptrs; |
| } |
| |
| SmallVector<char*, 4> TensorIteratorBase::get_base_ptrs() const { |
| auto ptrs = SmallVector<char*, 4>(); |
| for (int i = 0; i < ntensors(); i++) { |
| ptrs.push_back((char*)data_ptr(i)); |
| } |
| return ptrs; |
| } |
| |
| bool TensorIteratorBase::is_dim_reduced(int dim) const { |
| for (auto& op : operands_) { |
| if (op.is_output && op.stride_bytes[dim] == 0 && shape_[dim] > 1) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| void TensorIteratorBase::permute_dimensions(IntArrayRef perm) { |
| TORCH_INTERNAL_ASSERT(perm.size() == static_cast<unsigned>(ndim())); |
| |
| auto reorder = [perm](IntArrayRef data) { |
| auto res = DimVector(data.size(), 0); |
| for (size_t i = 0; i < perm.size(); i++) { |
| res[i] = data[perm[i]]; |
| } |
| return res; |
| }; |
| |
| // Update shape and strides |
| shape_ = reorder(shape_); |
| for (auto& op : operands_) { |
| if (op.stride_bytes.size() > 0) { |
| op.stride_bytes = reorder(op.stride_bytes); |
| } |
| } |
| } |
| |
| int64_t TensorIteratorBase::num_output_elements() const { |
| int64_t elem = 1; |
| for (int dim = 0; dim < ndim(); dim++) { |
| if (operands_[0].stride_bytes[dim] != 0 || shape_[dim] == 0) { |
| elem *= shape_[dim]; |
| } |
| } |
| return elem; |
| } |
| |
| int TensorIteratorBase::num_reduce_dims() const { |
| int count = 0; |
| for (int dim = 0; dim < ndim(); dim++) { |
| if (operands_[0].stride_bytes[dim] == 0) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { |
| int64_t numel = this->numel(); |
| if (numel == 0) { |
| return; |
| } else if (numel < grain_size || at::get_num_threads() == 1) { |
| return serial_for_each(loop, {0, numel}); |
| } else { |
| at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) { |
| serial_for_each(loop, {begin, end}); |
| }); |
| } |
| } |
| |
| StrideVector TensorIteratorBase::get_strides() const { |
| StrideVector strides; |
| for (int dim = 0; dim < ndim(); dim++) { |
| for (int arg = 0; arg < ntensors(); arg++) { |
| strides.emplace_back(operands_[arg].stride_bytes[dim]); |
| } |
| } |
| return strides; |
| } |
| |
| void TensorIteratorBase::serial_for_each(loop2d_t loop, Range range) const { |
| if (range.size() == 0) { |
| return; |
| } |
| auto strides = get_strides(); |
| while (strides.size() < 2U * ntensors()) { |
| strides.push_back(0); |
| } |
| |
| |
| auto base_ptrs = get_base_ptrs(); |
| if (ndim() <= 1) { |
| if (range.begin > 0) { |
| auto ptrs = get_data_ptrs(base_ptrs, {range.begin}); |
| loop(ptrs.data(), strides.data(), range.size(), 1); |
| } else { |
| loop(base_ptrs.data(), strides.data(), range.size(), 1); |
| } |
| } else { |
| auto counter = DimCounter(shape_, range); |
| while (!counter.is_done()) { |
| auto ptrs = get_data_ptrs(base_ptrs, counter.values); |
| auto step = counter.max_2d_step(); |
| loop(ptrs.data(), strides.data(), step[0], step[1]); |
| counter.increment(step); |
| } |
| } |
| } |
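| |
| // Example (a hedged sketch of a caller, not code from this file): a loop2d_t |
| // body for for_each/serial_for_each above that adds two float operands into |
| // the output of a three-operand iterator. strides[arg] holds the |
| // inner-dimension byte stride of operand `arg` and strides[ntensors + arg] its |
| // outer-dimension byte stride, matching the layout passed in by |
| // serial_for_each. |
| // |
| //   auto loop = [](char** data, const int64_t* strides, int64_t size0, int64_t size1) { |
| //     for (int64_t i = 0; i < size1; i++) { |
| //       char* out = data[0] + i * strides[3]; |
| //       char* a   = data[1] + i * strides[4]; |
| //       char* b   = data[2] + i * strides[5]; |
| //       for (int64_t j = 0; j < size0; j++) { |
| //         *reinterpret_cast<float*>(out + j * strides[0]) = |
| //             *reinterpret_cast<float*>(a + j * strides[1]) + |
| //             *reinterpret_cast<float*>(b + j * strides[2]); |
| //       } |
| //     } |
| //   }; |
| //   iter.for_each(loop); |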
| |
| bool TensorIteratorBase::is_trivial_1d() const { |
| // TODO: check for casting once it's supported |
| return ndim() == 1; |
| } |
| |
| bool TensorIteratorBase::is_contiguous() const { |
| if (numel() == 1) { |
| return true; |
| } |
| if (ndim() != 1) { |
| return false; |
| } |
| return has_contiguous_first_dim(); |
| } |
| |
| |
| bool TensorIteratorBase::is_scalar(int arg) const { |
| const auto& stride = operands_[arg].stride_bytes; |
| for (int i = 0; i < ndim(); i++) { |
| if (stride[i] != 0 && shape_[i] != 1) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| bool TensorIteratorBase::is_cpu_scalar(int arg) const { |
| return is_scalar(arg) && device(arg).is_cpu(); |
| } |
| |
| void TensorIteratorBase::cast_outputs() { |
| for (auto& op : operands_) { |
| if (op.is_output && op.original_tensor->defined() && |
| op.original_tensor->scalar_type() != op.current_dtype) { |
| // TODO: Now that set_output resizes both the original_tensor |
| // and tensor, this condition should no longer ever be true |
| if (op.original_tensor->sizes() != op.tensor->sizes()){ |
| op.original_tensor->resize_as_(*op.tensor).as_strided_(op.tensor->sizes(), op.tensor->strides()); |
| } |
| op.original_tensor->copy_(*op.tensor); |
| op.tensor = op.original_tensor; |
| } |
| } |
| } |
| |
| void* TensorIteratorBase::data_ptr(int arg) const { |
| return operands_[arg].data; |
| } |
| |
| void TensorIteratorBase::remove_operand(int arg) { |
| operands_.erase(operands_.begin() + arg); |
| } |
| |
| void TensorIteratorBase::unsafe_replace_operand(int arg, void* data) { |
| operands_[arg].data = data; |
| } |
| |
| void TensorIteratorBase::narrow(int dim, int64_t start, int64_t size) { |
| TORCH_INTERNAL_ASSERT(dim < ndim() && size >= 1); |
| shape_[dim] = size; |
| view_offsets_[dim] += start; |
| for (auto& op : operands_) { |
| op.data = ((char*)op.data) + op.stride_bytes[dim] * start; |
| } |
| if (size == 1 && !is_reduction_) { |
| coalesce_dimensions(); |
| } |
| } |
| |
| void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indices) { |
| TORCH_INTERNAL_ASSERT(start_dim <= ndim()); |
| for (int i = start_dim; i < ndim(); ++i) { |
| for (auto& op : operands_) { |
| op.data = ((char*)op.data) + op.stride_bytes[i] * indices[i - start_dim]; |
| } |
| shape_[i] = 1; |
| } |
| } |
| |
| #define BINARY_FLOAT_OP_CONFIG() \ |
| TensorIteratorConfig() \ |
| .set_check_mem_overlap(true) \ |
| .allow_cpu_scalars(true) \ |
| .promote_inputs_to_common_dtype(true) \ |
| .cast_common_dtype_to_outputs(true) \ |
| .enforce_safe_casting_to_output(true) \ |
| .promote_integer_inputs_to_float(true) |
| |
| // Helper to construct a binary op that promotes integer inputs to float. |
| void TensorIteratorBase::build_binary_float_op(const Tensor& out, const Tensor& a, const Tensor& b) { |
| build(BINARY_FLOAT_OP_CONFIG() |
| .add_owned_output(out) |
| .add_owned_input(a) |
| .add_owned_input(b)); |
| } |
| |
| void TensorIteratorBase::build_borrowing_binary_float_op(const Tensor& out, const Tensor& a, const Tensor& b) { |
| build(BINARY_FLOAT_OP_CONFIG() |
| .add_output(out) |
| .add_input(a) |
| .add_input(b)); |
| } |
| |
| // This cannot be a function because TensorIteratorConfig is not |
| // copyable or movable, so it can't be returned from the function. |
| #define BINARY_OP_CONFIG() \ |
| TensorIteratorConfig() \ |
| .set_check_mem_overlap(true) \ |
| .allow_cpu_scalars(true) \ |
| .promote_inputs_to_common_dtype(true) \ |
| .cast_common_dtype_to_outputs(true) \ |
| .enforce_safe_casting_to_output(true) |
| |
| void TensorIteratorBase::build_binary_op(const Tensor& out, const Tensor& a, const Tensor& b) { |
| build(BINARY_OP_CONFIG() |
| .add_owned_output(out) |
| .add_owned_input(a) |
| .add_owned_input(b)); |
| } |
| |
| void TensorIteratorBase::build_borrowing_binary_op(const Tensor& out, const Tensor& a, const Tensor& b) { |
| build(BINARY_OP_CONFIG() |
| .add_output(out) |
| .add_input(a) |
| .add_input(b)); |
| } |
| |
| void TensorIteratorBase::build_unary_float_op(const Tensor& out, const Tensor& a) { |
| build(TensorIteratorConfig() |
| .set_check_mem_overlap(true) |
| .add_owned_output(out) |
| .add_owned_input(a) |
| .promote_inputs_to_common_dtype(true) |
| .cast_common_dtype_to_outputs(true) |
| .enforce_safe_casting_to_output(true) |
| .promote_integer_inputs_to_float(true)); |
| } |
| |
| void TensorIteratorBase::build_unary_op(const Tensor& out, const Tensor& a) { |
| build(TensorIteratorConfig() |
| .set_check_mem_overlap(true) |
| .add_owned_output(out) |
| .add_owned_input(a) |
| .cast_common_dtype_to_outputs(false) |
| .enforce_safe_casting_to_output(false) |
| .check_all_same_dtype(true)); |
| } |
| |
| TensorIterator TensorIterator::binary_op(Tensor& out, const Tensor& a, const Tensor& b) { |
| TensorIterator iter; |
| iter.build_binary_op(out, a, b); |
| return iter; |
| } |
| |
| TensorIterator TensorIterator::borrowing_binary_op(const Tensor& out, const Tensor& a, const Tensor& b) { |
| TensorIterator iter; |
| iter.build_borrowing_binary_op(out, a, b); |
| return iter; |
| } |
| |
| TensorIterator TensorIterator::binary_float_op(Tensor& out, const Tensor& a, const Tensor& b) { |
| TensorIterator iter; |
| iter.build_binary_float_op(out, a, b); |
| return iter; |
| } |
| |
| TensorIterator TensorIterator::comparison_op(Tensor& out, const Tensor& a, |
| const Tensor& b) { |
| // Note [special-case bool outputs] |
| // We explicitly don't call `cast_common_dtype_to_outputs` when the output tensor |
| // has `bool` dtype. This is a performance optimization: the functional |
| // version of all comparison/logical ops uses a bool output tensor, and we'd like to |
| // avoid creating a temporary copy of the output. |
| // However, note that all kernels using this TensorIterator will need to special-case when |
| // the output tensor has bool dtype, and provide a lambda of type (scalar_t, scalar_t -> bool). |
| if (out.scalar_type() == kBool) { |
| return TensorIteratorConfig() |
| .set_check_mem_overlap(true) |
| .add_owned_output(out) |
| .add_owned_input(a) |
| .add_owned_input(b) |
| .allow_cpu_scalars(true) |
| .promote_inputs_to_common_dtype(true) |
| .build(); |
| } else { |
| return TensorIteratorConfig() |
| .set_check_mem_overlap(true) |
| .add_owned_output(out) |
| .add_owned_input(a) |
| .add_owned_input(b) |
| .allow_cpu_scalars(true) |
| .promote_inputs_to_common_dtype(true) |
| .cast_common_dtype_to_outputs(true) |
| .build(); |
| } |
| } |
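| |
| // Example (a hedged sketch): a kernel built on comparison_op typically |
| // dispatches on iter.common_dtype() and, per Note [special-case bool outputs] |
| // above, returns bool from its lambda so that it also works when the output |
| // tensor has bool dtype. The kernel name string here is hypothetical. |
| // |
| //   AT_DISPATCH_ALL_TYPES(iter.common_dtype(), "lt_cpu", [&]() { |
| //     cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a < b; }); |
| //   }); |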
| |
| TensorIterator TensorIterator::unary_op(Tensor& out, const Tensor& a) { |
| TensorIterator iter; |
| iter.build_unary_op(out, a); |
| return iter; |
| } |
| |
| TensorIterator TensorIterator::unary_float_op(Tensor& out, const Tensor& a) { |
| TensorIterator iter; |
| iter.build_unary_float_op(out, a); |
| return iter; |
| } |
| |
| #define NULLARY_OP_CONFIG() \ |
| TensorIteratorConfig() \ |
| .set_check_mem_overlap(true) \ |
| .check_all_same_dtype(false) \ |
| /* FIXME: workaround for bug: https://github.com/pytorch/pytorch/issues/20342 */ \ |
| .resize_outputs(false) |
| |
| TensorIterator TensorIterator::nullary_op(Tensor& out) { |
| return NULLARY_OP_CONFIG() |
| .add_owned_output(out) |
| .build(); |
| } |
| |
| TensorIterator TensorIterator::borrowing_nullary_op(const Tensor& out) { |
| return NULLARY_OP_CONFIG() |
| .add_output(out) |
| .build(); |
| } |
| |
| TensorIterator TensorIterator::reduce_op(Tensor& out, const Tensor& a) { |
| TORCH_INTERNAL_ASSERT(out.defined()); |
| return TensorIteratorConfig() |
| .set_check_mem_overlap(false) |
| .add_owned_output(out) |
| .add_owned_input(a) |
| .resize_outputs(false) |
| .is_reduction(true) |
| // TODO: not supporting casting to outputs is only really necessary for arg{min,max} |
| .promote_inputs_to_common_dtype(true) |
| .build(); |
| } |
| |
| TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tensor& a) { |
| TORCH_INTERNAL_ASSERT(out1.defined()); |
| TORCH_INTERNAL_ASSERT(out2.defined()); |
| TORCH_CHECK(a.device() == out1.device() && out1.device() == out2.device(), |
| "reduce_op(): expected input and both outputs to be on same device, but input is on ", a.device(), |
| ", output1 is on ", out1.device(), " and output2 is on", out2.device()); |
| TORCH_CHECK(out1.dim() == out2.dim(), "reduce_op(): expected both outputs to have same number of dims, but output1 has ", out1.dim(), |
| " and output2 has ", out2.dim()); |
| TORCH_CHECK(out1.sizes() == out2.sizes(), "reduce_op(): expected both outputs to have same sizes, but output1 has ", out1.sizes(), |
| " and output2 has ", out2.sizes()); |
| TORCH_CHECK(out1.strides() == out2.strides(), "reduce_op(): expected both outputs to have same strides, but output1 has ", out1.strides(), |
| " and output2 has ", out2.strides()); |
| return TensorIteratorConfig() |
| .set_check_mem_overlap(false) |
| .add_owned_output(out1) |
| .add_owned_output(out2) |
| .add_owned_input(a) |
| .resize_outputs(false) |
| .is_reduction(true) |
| .check_all_same_dtype(false) |
| .build(); |
| } |
| |
| void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { |
| for (auto& tensor: config.tensors_) { |
| // If *any* of the arguments is a meta tensor, the overall |
| // computation is a meta computation (don't do any work, |
| // just compute output information). This aligns with |
| // our multiple dispatch semantics. |
| if (tensor->is_meta()) { |
| is_meta_ = true; |
| } |
| operands_.emplace_back(std::move(tensor)); |
| } |
| num_outputs_ = config.num_outputs_; |
| } |
| |
| void TensorIteratorBase::mark_outputs() { |
| // TODO: merge this into populate_operands |
| for (int i = 0; i < num_outputs_; i++) { |
| operands_[i].is_output = true; |
| const auto& output = operands_[i].tensor; |
| if (!output->defined()) continue; |
| |
| // check if output is also an input |
| for (int arg = num_outputs_; arg < ntensors(); arg++) { |
| const auto& input = operands_[arg].tensor; |
| if (output->is_same(*input)) { |
| operands_[i].is_read_write = true; |
| } |
| } |
| } |
| } |
| |
| void TensorIteratorBase::mark_resize_outputs(const TensorIteratorConfig& config) { |
| // Outputs cannot be broadcasted. Check that the shape of the outputs matches |
| // the inferred shape. There's an exception for write-only tensors to support |
| // our legacy behavior that functions with `out=` arguments resize their |
| // outputs. |
| if (config.static_shape_.has_value()) { |
| return; |
| } |
| for (int i = 0; i < num_outputs_; i++) { |
| const auto& output = operands_[i].tensor; |
| if (output->defined() && !output->sizes().equals(shape_)) { |
| if (config.resize_outputs_ && !operands_[i].is_read_write) { |
| operands_[i].will_resize = true; |
| continue; |
| } |
| // for reduction, output size does not match shape_, as output is reduced size, and shape_ is size of the input |
| TORCH_CHECK(is_reduction_, "output with shape ", output->sizes(), " doesn't match the broadcast shape ", |
| shape_); |
| } |
| } |
| } |
| |
| void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config) { |
| if (!config.check_mem_overlap_) { |
| return; |
| } |
| for (int i = 0; i < num_outputs_; i++) { |
| const auto& output = operands_[i].tensor; |
| if (!output->defined()) continue; |
| assert_no_internal_overlap(*output); |
| for (int j = num_outputs_; j < ntensors(); j++) { |
| const auto& input = operands_[j].tensor; |
| if (input->unsafeGetTensorImpl()!=output->unsafeGetTensorImpl()) { |
| assert_no_partial_overlap(*output, *input); |
| } |
| } |
| } |
| } |
| |
| void TensorIteratorBase::compute_shape(const TensorIteratorConfig& config) { |
| if (config.static_shape_.has_value()) { |
| shape_ = *config.static_shape_; |
| return; |
| } |
| |
| all_ops_same_shape_ = true; |
| bool has_scalars = false; |
| bool has_tensors = false; |
| for (auto& op : operands_) { |
| if (!op.tensor->defined()) continue; |
| |
| // For now, don't include output tensors when we're resizing outputs. |
| // These shapes don't participate in shape computation. |
| // This preserves the legacy behavior where torch.add(..., out=dst) resizes |
| // the destination tensor. If the output tensor is also an input, we'll |
| // pick it up later in the operands. |
| if (config.resize_outputs_ && op.is_output) continue; |
| auto shape = op.tensor->sizes(); |
| if (shape.size() == 0) { |
| has_scalars = true; |
| } else { |
| has_tensors = true; |
| } |
| if (has_scalars && has_tensors) { |
| all_ops_same_shape_ = false; |
| } |
| if (shape_.empty()) { |
| shape_ = shape; |
| } else if (!shape.equals(shape_)) { |
| all_ops_same_shape_ = false; |
| shape_ = infer_size_dimvector(shape_, shape); |
| } |
| } |
| } |
| |
| void TensorIteratorBase::compute_strides(const TensorIteratorConfig& config) { |
| for (auto& op : operands_) { |
| if (op.tensor->defined()) { |
| IntArrayRef original_shape = config.static_shape_ ? shape_ : op.tensor->sizes(); |
| auto original_stride = op.tensor->strides(); |
| auto element_size_in_bytes = op.tensor->element_size(); |
| auto offset = ndim() - original_shape.size(); |
| if (offset > 0) |
| op.stride_bytes.resize(ndim(), 0); |
| else |
| op.stride_bytes.resize(ndim()); |
| for (size_t i = 0; i < original_shape.size(); i++) { |
| // see NOTE: [Computing output strides] |
| if (original_shape[i] == 1 && shape_[offset + i] != 1) { |
| op.stride_bytes[offset + i] = 0; |
| } else { |
| op.stride_bytes[offset + i] = original_stride[i] * element_size_in_bytes; |
| } |
| } |
| } |
| } |
| } |
| |
| bool TensorIteratorBase::can_use_32bit_indexing() const { |
| int64_t max_value = std::numeric_limits<int32_t>::max(); |
| if (numel() > max_value) { |
| return false; |
| } |
| for (auto& op : operands_) { |
| int64_t max_offset = 1; |
| for (int dim = 0; dim < ndim(); dim++) { |
| max_offset += (shape_[dim] - 1) * op.stride_bytes[dim]; |
| } |
| if (max_offset > max_value) { |
| return false; |
| } |
| } |
| return true; |
| } |
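| |
| // Example (a hedged sketch): GPU kernels typically guard on |
| // can_use_32bit_indexing() and fall back to with_32bit_indexing(), which |
| // splits the iterator until every piece fits into 32-bit index arithmetic. |
| // `launch_kernel` is a placeholder for whatever consumes the sub-iterators. |
| // |
| //   if (!iter.can_use_32bit_indexing()) { |
| //     for (auto& sub_iter : iter.with_32bit_indexing()) { |
| //       launch_kernel(sub_iter); |
| //     } |
| //     return; |
| //   } |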
| |
| std::unique_ptr<TensorIterator> TensorIteratorBase::split(int dim) { |
| TORCH_INTERNAL_ASSERT(dim >= 0 && dim < ndim() && shape()[dim] >= 2); |
| std::unique_ptr<TensorIterator> copy(new TensorIterator(*this)); |
| |
| bool overlaps = is_dim_reduced(dim); |
| auto copy_size = shape_[dim] / 2; |
| auto this_size = shape_[dim] - copy_size; |
| copy->narrow(dim, 0, copy_size); |
| copy->final_output_ &= !overlaps; |
| this->narrow(dim, copy_size, this_size); |
| this->accumulate_ |= overlaps; |
| |
| return copy; |
| } |
| |
| |
| int TensorIteratorBase::get_dim_to_split() const { |
| TORCH_INTERNAL_ASSERT(ndim() >= 1); |
| int64_t max_extent = -1; |
| int dim_to_split = -1; |
| for (int dim = ndim() - 1; dim >= 0; dim--) { |
| if (shape_[dim] == 0) { |
| continue; |
| } |
| int64_t size = shape_[dim]; |
| for (auto& op : operands_) { |
| int64_t extent = (size - 1) * op.stride_bytes[dim]; |
| if (extent > max_extent) { |
| max_extent = extent; |
| dim_to_split = dim; |
| } |
| } |
| } |
| TORCH_INTERNAL_ASSERT(max_extent >= 0); |
| return dim_to_split; |
| } |
| |
| bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { |
| // This function tries to do a fast setup to avoid needless reordering of dimensions and tracking output strides. |
| // Returns true if it can do a fast setup, or false otherwise. |
| // TODO enable fast handling for reductions |
| FastSetupType setup_type = compute_fast_setup_type(config); |
| if (setup_type == FastSetupType::NONE) { |
| return false; |
| } |
| |
| // allocate memory for output, memory format depends on setup_type |
| switch (setup_type) { |
| case FastSetupType::CONTIGUOUS: |
| { |
| for (int i = 0; i < num_outputs_; i++){ |
| auto& op = operands_[i]; |
| if (!op.tensor->defined()) { |
| TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); |
| } |
| set_output(i, shape_, {}, original_options(op).memory_format(MemoryFormat::Contiguous), names_); |
| } |
| break; |
| } |
| case FastSetupType::CHANNELS_LAST: |
| { |
| for (int i = 0; i < num_outputs_; i++){ |
| auto& op = operands_[i]; |
| if (!op.tensor->defined()) { |
| TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); |
| } |
| set_output(i, shape_, {}, original_options(op).memory_format(MemoryFormat::ChannelsLast), names_); |
| } |
| break; |
| } |
| case FastSetupType::NON_OVERLAPPING_DENSE: |
| { |
| // find the index of a defined tensor in operands_, starting from the input tensors |
| int i_defined; // NOLINT(cppcoreguidelines-init-variables) |
| for (i_defined = ntensors() - 1; i_defined >= 0; --i_defined) { |
| if (operands_[i_defined].tensor->defined()) break; |
| } |
| TORCH_CHECK(i_defined >= 0, "Can not find a defined tensor when fast allocating memory to outputs"); |
| for (int i = 0; i < num_outputs_; i++){ |
| auto& op = operands_[i]; |
| if (!op.tensor->defined()) { |
| TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); |
| } |
| set_output(i, shape_, operands_[i_defined].tensor->strides(), original_options(op), names_); |
| } |
| break; |
| } |
| default: |
| TORCH_INTERNAL_ASSERT(false, "Unsupported fast setup type", c10::to_string((int)setup_type)); |
| } |
| // coalescing dimensions consists of collapsing dimensions to 1 (we are limited to contiguous no-broadcast cases here) |
| if (ndim() > 1){ |
| has_coalesced_dimensions_ = true; |
| } |
| if (ndim() >= 1) { |
| shape_[0] = numel(); |
| shape_.resize(1); |
| } |
| for (auto& op : operands_ ) { |
| auto element_size_in_bytes = op.tensor->element_size(); |
| op.stride_bytes.resize(ndim()); |
| if (ndim()>0) { |
| op.stride_bytes[0] = element_size_in_bytes; |
| } |
| } |
| return true; |
| } |
| |
| FastSetupType TensorIteratorBase::compute_fast_setup_type(const TensorIteratorConfig& config) { |
| if (is_reduction_ || !all_ops_same_shape_) { |
| return FastSetupType::NONE; |
| } |
| |
| // For linear iteration, only contiguous tensors can be coalesced |
| // Fast setup of any other format requires changing iteration order |
| if (enforce_linear_iteration_) { |
| for (const auto& op : operands_) { |
| if (op.tensor->defined() && !op.will_resize) { |
| auto is_contiguous = op.tensor->is_contiguous(at::MemoryFormat::Contiguous); |
| if (!is_contiguous) { |
| return FastSetupType::NONE; |
| } |
| } |
| } |
| return FastSetupType::CONTIGUOUS; |
| } |
| |
| bool is_contiguous = true; |
| bool is_channels_last = true; |
| bool is_non_overlapping_and_dense = true; |
| for (const auto& op : operands_) { |
| if (op.tensor->defined() && !op.will_resize) { |
| is_contiguous &= op.tensor->is_contiguous(at::MemoryFormat::Contiguous); |
| is_channels_last &= op.tensor->is_contiguous(at::MemoryFormat::ChannelsLast); |
| is_non_overlapping_and_dense &= op.tensor->is_non_overlapping_and_dense(); |
| } |
| } |
| // TODO this causes ambiguous cases (NC11) to always be treated as contiguous |
| if (is_contiguous) { |
| return FastSetupType::CONTIGUOUS; |
| } |
| if (is_channels_last) { |
| return FastSetupType::CHANNELS_LAST; |
| } |
| if (is_non_overlapping_and_dense) { |
| int64_t prev = -1; |
| // Fast setup is allowed only when all the defined tensors have the same shape and strides. |
| // Iterate from the back to check input tensors' strides first, then output tensors'. |
| for (int64_t i = ntensors() - 1; i >= 0; --i) { |
| const auto& op = operands_[i]; |
| if (op.tensor->defined() && !op.will_resize) { |
| if (prev < 0) { |
| prev = i; |
| continue; |
| } |
| if (!operands_[prev].tensor->strides().equals(op.tensor->strides())) { |
| // [Note: stride check for non contiguous tensors in fast setup] |
| // We prevent 3 cases doing fast setup here: |
| // 1. input tensors have different strides. |
| // 2. output tensors won't be resized and have different strides. |
| // 3. input tensors have the same strides, but output tensors have different strides with input tensors. |
| // We don't allow re-stride output tensors in this case since it is not compatible with |
| // numpy. The behavior in numpy is that if the output tensor has same shape as the input |
| // tensor but different strides, the strides of output tensor will be preserved, so we do |
| // the same in tensor iterator. |
| return FastSetupType::NONE; |
| } |
| } |
| } |
| return FastSetupType::NON_OVERLAPPING_DENSE; |
| } |
| return FastSetupType::NONE; |
| } |
| |
| TensorIteratorBase::TensorIteratorBase() = default; |
| |
| void TensorIteratorBase::build(TensorIteratorConfig& config) { |
| // populate some persistent configuration fields |
| is_reduction_ = config.is_reduction_; |
| enforce_linear_iteration_ = config.enforce_linear_iteration_; |
| |
| // fill in operands_ based on configuration |
| populate_operands(config); |
| // set is_output and is_read_write flags on appropriate tensors |
| mark_outputs(); |
| // Check that the outputs have no internal overlap |
| // and do not share memory with inputs. |
| compute_mem_overlaps(config); |
| // Check that input dimensions are aligned correctly & compute outnames. |
| compute_names(config); |
| // compute the broadcasted shape |
| compute_shape(config); |
| // mark outputs for resizing if necessary |
| mark_resize_outputs(config); |
| // compute the result dtype and device |
| compute_types(config); |
| // try fast setup output tensor, if failed, fallback to normal setup |
| if (!fast_set_up(config)) { |
| // compute each tensor's stride after broadcasting |
| compute_strides(config); |
| // re-order dimensions to improve coalescing |
| reorder_dimensions(); |
| // allocate the output tensor if it's not provided |
| allocate_or_resize_outputs(); |
| // coalesce adjacent dimensions when possible |
| if (!is_meta_) coalesce_dimensions(); |
| } |
| |
| if (is_meta_) return; |
| |
| // XLA tensors don't have storage, so they don't have an underlying data pointer. |
| // Nothing beyond this point is important for meta functions, so it's fine to exit early here. |
| if (common_device_.type() == DeviceType::XLA) return; |
| |
| for (auto& op : operands_) { |
| TORCH_INTERNAL_ASSERT(op.tensor->defined()); |
| op.data = op.tensor->data_ptr(); |
| } |
| |
| // zero out offsets |
| // If the tensor is a scalar, we leave room for it |
| // So index translations in reduction can access |
| // a valid value for the offset |
| int64_t ndim_offsets = (ndim() ? ndim() : 1); |
| view_offsets_ = DimVector(ndim_offsets, 0); |
| } |
| |
| // This is the structured kernels implementation of set_output. It is |
| // NEVER actually called directly; instead, a subclass of TensorIteratorBase |
| // will override set_output to actually do the operation, and then call |
| // set_output on the TensorIteratorBase to setup TI's metadata. |
| // The precondition for this function is that maybe_get_output() now |
| // unconditionally returns a real Tensor (prior to output setting, |
| // this function may return an undefined tensor.) |
| void TensorIteratorBase::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { |
| auto& op = operands_[output_idx]; |
| TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); |
| const auto& t = maybe_get_output(output_idx); |
| TORCH_INTERNAL_ASSERT(t.defined()); |
| if (!op.tensor->defined()) { |
| op.tensor = c10::MaybeOwned<Tensor>::borrowed(t); |
| TORCH_INTERNAL_ASSERT_DEBUG_ONLY(op.target_dtype == t.scalar_type()); |
| } else if (op.will_resize) { |
| if (op.original_tensor->defined()) { |
| // OK, so this is pretty weird. To understand how we can end up in |
| // this situation, first look at Marker [Output original_tensor is set]. |
| // That is the sole site where original_tensor may be set on an |
| // output operand. Essentially, when we are given an explicit output |
| // tensor whose dtype doesn't match the computed common dtype from |
| // the input operands, we do a switcheroo: we replace the (incorrectly |
| // typed) output tensor with a correctly typed, *temporary* tensor, |
| // and remember the original tensor in original_tensor (which will |
| // then get written back to when we cast_outputs). |
| // |
| // Now, what if the given output tensor also happened to be zero |
| // size (meaning that we will_resize it)? Well, at the call site |
| // above, we don't necessarily(*) know what the correct shape should |
| // be, so we give the temporary tensor the same shape as the original. |
| // By the time set_output is called we DO know what the correct size |
| // is, and the subclass's implementation of set_output in the structured class |
| // is responsible for resizing original_tensor. But we still have this |
| // incorrectly sized temporary output which the structured subclass |
| // knows nothing about, so we are obligated to also resize it here. |
| // |
| // This is a slight memory pessimization, because previously |
| // original_tensor only got resized at the end of the computation, rather |
| // than at the beginning (as happens here). However, the peak memory |
| // usage is the same, since you need to materialize both original tensor |
| // and temporary tensor to do the copy. |
| // |
| // (*) Actually, technically, we probably do know what the shape |
| // should be, since we do shape computation before dtype computation. |
| // So hypothetically we could figure out what the correct shape is |
| // at that point in time and directly allocate the temporary at |
| // the right size. |
| // |
| // But a better solution is to delay allocation of temporaries until |
| // after TensorIterator builder, waiting until we actually want |
| // to do the computation. That would also remove the necessity |
| // for the is_meta_ test. |
| TORCH_INTERNAL_ASSERT(op.original_tensor->is_same(t)); |
| TORCH_INTERNAL_ASSERT(!op.tensor->is_same(t)); |
| at::native::resize_output(*op.tensor, sizes); |
| if (!strides.empty()) { |
| TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); |
| op.tensor->as_strided_(sizes, strides); |
| } else if (options.memory_format_opt().has_value()) { |
| op.tensor->unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); |
| } |
| } |
| } |
| TORCH_INTERNAL_ASSERT_DEBUG_ONLY( |
| op.tensor->is_same(t) || op.current_dtype == op.tensor->scalar_type()); |
| // For simplicity, just always update the cached current_type. |
| op.current_dtype = op.tensor->scalar_type(); |
| } |
| |
| // This is the "traditional" implementation of set_output. On TensorIterator |
| // instances, it is invoked directly from various call sites in this file. No |
| // funny business. |
| void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { |
| // NB: intentionally no superclass call |
| auto& op = operands_[output_idx]; |
| TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); |
| if (!op.tensor->defined()) { |
| if (strides.empty()) { |
| op.tensor = c10::MaybeOwned<Tensor>::owned(at::empty(sizes, options)); |
| } else { |
| op.tensor = c10::MaybeOwned<Tensor>::owned(at::empty_strided(sizes, strides, options)); |
| } |
| op.current_dtype = op.target_dtype; |
| } else if (op.will_resize) { |
| at::native::resize_output(*op.tensor, sizes); |
| if (!strides.empty()) { |
| TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); |
| op.tensor->as_strided_(sizes, strides); |
| } else if (options.memory_format_opt().has_value()) { |
| op.tensor->unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); |
| } |
| } |
| if (!names.empty()) { |
| TORCH_INTERNAL_ASSERT(op.tensor->defined()); |
| namedinference::propagate_names(*op.tensor, names); |
| } |
| } |
| |
| // Not actually used by anything (TensorIterator subclass calls |
| // its own implementation of set_output which knows exactly where |
| // all the outputs are), but we have to provide all pure virtual methods |
| // for MetaBase |
| const Tensor& TensorIterator::maybe_get_output(int64_t output_idx) { |
| return *operands_[output_idx].tensor; |
| } |
| |
| SplitUntil32Bit TensorIteratorBase::with_32bit_indexing() const { |
| return SplitUntil32Bit(*this); |
| } |
| |
| /// SplitUntil32Bit. Recursively splits an iterator into sub-iterators that |
| /// can use 32-bit indexing. |
| |
| SplitUntil32Bit::iterator::iterator(const TensorIteratorBase& iter) { |
| vec.emplace_back(new TensorIterator(iter)); |
| vec.emplace_back(nullptr); // ++ first pops the last element |
| ++(*this); |
| } |
| |
| SplitUntil32Bit::iterator& SplitUntil32Bit::iterator::operator++() { |
| vec.pop_back(); |
| while (!vec.empty() && !vec.back()->can_use_32bit_indexing()) { |
| auto& iter = *vec.back(); |
| int64_t split_dim = iter.get_dim_to_split(); |
| vec.emplace_back(iter.split(split_dim)); |
| } |
| return *this; |
| } |
| |
| TensorIterator& SplitUntil32Bit::iterator::operator*() const { |
| return *vec.back(); |
| } |
| |
| SplitUntil32Bit::iterator SplitUntil32Bit::begin() const { |
| return SplitUntil32Bit::iterator(iter); |
| } |
| |
| SplitUntil32Bit::iterator SplitUntil32Bit::end() const { |
| return SplitUntil32Bit::iterator(); |
| } |
| |
| DimCounter::DimCounter(IntArrayRef shape, Range range) |
| : shape(shape) |
| , range(range) |
| , values(shape.size(), 0) |
| , offset(range.begin) { |
| int64_t linear_offset = range.begin; |
| int64_t ndim = values.size(); |
| for (const auto dim : c10::irange(ndim)) { |
| int64_t size = shape[dim]; |
| if (size > 0) { |
| values[dim] = linear_offset % size; |
| linear_offset /= size; |
| } |
| } |
| TORCH_INTERNAL_ASSERT(linear_offset == 0); |
| } |
| |
| bool DimCounter::is_done() const { |
| return offset >= range.end; |
| } |
| |
| void DimCounter::increment(const std::array<int64_t, 2>& step) { |
| offset += step[0] * step[1]; |
| int64_t ndim = values.size(); |
| int64_t overflow = step[0]; |
| int i = 0; |
| if (step[1] != 1) { |
| TORCH_INTERNAL_ASSERT(step[0] == shape[0] && values[0] == 0); |
| i = 1; |
| overflow = step[1]; |
| } |
| for (; i < ndim && overflow > 0; i++) { |
| auto size = shape[i]; |
| auto prev = values[i]; |
| auto value = prev + overflow; |
| if (value >= size) { |
| overflow = 1; |
| value -= size; |
| TORCH_INTERNAL_ASSERT(value < size); |
| } else { |
| overflow = 0; |
| } |
| values[i] = value; |
| } |
| TORCH_INTERNAL_ASSERT(overflow == 0 || overflow == 1); |
| } |
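| |
| // Worked example: with shape = [2, 3] and range = {1, 6}, the counter starts |
| // at values = [1, 0], offset = 1. max_2d_step() returns {1, 1} (only one |
| // element is left in the innermost dimension), and increment({1, 1}) wraps |
| // that dimension, giving values = [0, 1] and offset = 2; from there |
| // max_2d_step() can hand out full inner-dimension steps again. |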
| |
| std::array<int64_t, 2> DimCounter::max_2d_step() const { |
| int64_t step0 = std::min(shape[0] - values[0], range.end - offset); |
| int64_t step1 = 1; |
| if (step0 == shape[0] && shape.size() >= 2) { |
| step1 = std::min(shape[1] - values[1], (range.end - offset) / shape[0]); |
| } |
| return {step0, step1}; |
| } |
| |
| } // namespace at |