/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
#include <cmath>
#include <cstddef>
#include <cstdint>

namespace torch {
namespace executor {
namespace native {

using Tensor = exec_aten::Tensor;
using ScalarType = exec_aten::ScalarType;

namespace {
/**
 * Computes the cumulative sum of the elements of `self` along dimension `dim`
 * and writes the result to `out`.
 *
 * Given a `self` tensor of size (d1, d2, ..., d_dim, ..., dm), we first copy
 * every value self[d1, d2, ..., 0, ..., dm] to out[d1, d2, ..., 0, ..., dm],
 * since no summation is needed for the first element along `dim`. We then
 * compute each out[d1, d2, ..., i, ..., dm] as the sum of
 * out[d1, d2, ..., i-1, ..., dm] and self[d1, d2, ..., i, ..., dm].
 * Iterating this way keeps memory accesses sequential rather than strided,
 * which improves memory I/O throughput and reduces cache misses.
 */
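// For example, for a contiguous tensor of shape (2, 3, 4) with dim = 1, the
// bounds computed below are leading_dims = 2, dim_size = 3 and
// trailing_dims = 4, so element (i, j, k) lives at flat index
// i * (3 * 4) + j * 4 + k and each inner loop walks 4 contiguous elements at
// a time.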
template <typename CTYPE_IN, typename CTYPE_OUT>
void cumsum_tensors(const Tensor& self, int64_t dim, Tensor& out) {
  if (self.numel() == 0) {
    return;
  }

  const CTYPE_IN* input_data_base = self.const_data_ptr<CTYPE_IN>();
  CTYPE_OUT* output_data_base = out.mutable_data_ptr<CTYPE_OUT>();

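  // A 0-dimensional tensor holds a single element, so its cumulative sum is
  // simply that element copied to the output.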
  if (self.dim() == 0) {
    output_data_base[0] = input_data_base[0];
    return;
  }

  const size_t dim_size = static_cast<size_t>(self.size(dim));
  const size_t leading_dims = getLeadingDims(self, dim);
  const size_t trailing_dims = getTrailingDims(self, dim);

  for (size_t i = 0; i < leading_dims; i++) {
    size_t start_loc = i * (trailing_dims * dim_size);

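    // Copy the first slice along `dim` unchanged:
    // out[..., 0, ...] = self[..., 0, ...].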
    for (size_t idx = 0; idx < trailing_dims; idx++) {
      output_data_base[start_loc + idx] =
          static_cast<CTYPE_OUT>(input_data_base[start_loc + idx]);
    }

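    // Accumulate the remaining slices:
    // out[..., j, ...] = self[..., j, ...] + out[..., j-1, ...].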
    for (size_t j = 1; j < dim_size; j++) {
      size_t cur_round_base = start_loc + j * trailing_dims;
      size_t prev_round_base = start_loc + (j - 1) * trailing_dims;
      for (size_t idx = 0; idx < trailing_dims; idx++) {
        output_data_base[cur_round_base + idx] =
            static_cast<CTYPE_OUT>(input_data_base[cur_round_base + idx]) +
            output_data_base[prev_round_base + idx];
      }
    }
  }
}

} // namespace

/**
 * Returns the cumulative sum of the elements of `self` along dimension `dim`.
 * If `enforced_dtype` is specified, the input tensor is cast to that dtype
 * before the operation is performed, which is useful for preventing data type
 * overflows.
 */
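// For example, the cumulative sum of [1, 2, 3] along dim 0 is [1, 3, 6], and
// accumulating a Bool input into an Int output counts the true elements seen
// so far without the running sum wrapping around.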
Tensor& cumsum_out(
    RuntimeContext& ctx,
    const Tensor& self,
    int64_t dim,
    optional<ScalarType> enforced_dtype,
    Tensor& out) {
  (void)ctx;

  ET_KERNEL_CHECK(
      ctx,
      check_cumsum_args(self, dim, enforced_dtype, out),
      InvalidArgument,
      out);

  ET_KERNEL_CHECK(
      ctx, resize_tensor(out, self.sizes()) == Error::Ok, InvalidArgument, out);

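  // Normalize dim: a 0-d tensor is treated as having dim 0, and negative dims
  // wrap around (e.g. -1 refers to the last dimension).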
  dim = (self.dim() == 0) ? 0 : dim < 0 ? dim + self.dim() : dim;

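  // Dispatch on the (input, output) dtype pair; cumsum_tensors casts each
  // input element to the output type before accumulating.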
  ET_SWITCH_REAL_TYPES_AND(
      Bool, self.scalar_type(), ctx, "cumsum", CTYPE_SELF, [&] {
        ET_SWITCH_REAL_TYPES_AND(
            Bool, out.scalar_type(), ctx, "cumsum", CTYPE_OUT, [&] {
              cumsum_tensors<CTYPE_SELF, CTYPE_OUT>(self, dim, out);
            });
      });

  return out;
}

} // namespace native
} // namespace executor
} // namespace torch