#include <numeric>
#include <iterator>
#include <algorithm>

#include <ATen/Dispatch.h>
#include <ATen/cpu/vec/vec.h>
#include <ATen/native/ReduceOps.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/cpu/Reduce.h>

#include <c10/util/Optional.h>
#include <c10/util/irange.h>
#include <ATen/AccumulateType.h>

namespace at { namespace native { namespace {

using namespace vec;

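// Shared driver for the cumulative kernels below (cumsum, cumprod,
// logcumsumexp). The TensorIterator is built with `dim` squashed, so each
// iteration of `loop` hands `f` the start of one 1-D slice along `dim`;
// `f` then scans that slice using the dim strides captured outside the loop,
// seeding its accumulator with `init_val`.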
template <typename scalar_t, typename func_t>
static inline void cpu_cum_base_kernel(const Tensor& result,
    const Tensor& self,
    int64_t dim,
    const func_t& f,
    scalar_t init_val) {
  if (result.sizes() != self.sizes()) {
    at::native::resize_output(result, self.sizes());
  }
  if (self.numel() == 0) {
    return;
  }
  const auto input_ndim = self.dim();
  if (input_ndim == 0) {
    result.fill_(self);
    return;
  }

  // TODO This probably should be using at::native::make_reduction
  auto iter = TensorIteratorConfig()
    .check_all_same_dtype(false)
    .resize_outputs(false)
    // NOLINTNEXTLINE(bugprone-argument-comment)
    .declare_static_shape(self.sizes(), /*squash_dim=*/dim)
    .add_output(result)
    .add_input(self)
    .build();

  auto result_dim_stride = ensure_nonempty_stride(result, dim);
  auto self_dim_stride = ensure_nonempty_stride(self, dim);

  auto loop = [&](char** data, const int64_t* strides, int64_t n) {
    auto* result_data_bytes = data[0];
    const auto* self_data_bytes = data[1];

    for (const auto i : c10::irange(n)) {
      (void)i; // Suppress unused variable warning
      f(
        (scalar_t*)result_data_bytes, result_dim_stride,
        (scalar_t*)self_data_bytes, self_dim_stride, init_val
      );
      result_data_bytes += strides[0];
      self_data_bytes += strides[1];
    }
  };

  iter.for_each(loop);
}

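// cumsum/cumprod below accumulate in at::acc_type<scalar_t, /*is_cuda=*/false>,
// a wider type where one exists (e.g. double for float, int64_t for the
// narrower integer types), and cast back to scalar_t when writing each partial
// result to the output.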
static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) {
  auto wrap_dim = maybe_wrap_dim(dim, self.dim());
  int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim);

  AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "cumsum_out_cpu", [&] {
    cpu_cum_base_kernel<scalar_t>(result, self, wrap_dim, [&] (
      scalar_t* result_data, auto result_dim_stride,
      const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) {
        // NOLINTNEXTLINE(bugprone-signed-char-misuse)
        auto cum_number = (at::acc_type<scalar_t, false>)init_val;
        for (const auto i : c10::irange(self_dim_size)) {
          cum_number += self_data[i * self_dim_stride];
          result_data[i * result_dim_stride] = (scalar_t)cum_number;
        }
      }, /*init_val=*/ 0
    );
  });
}

static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) {
  auto wrap_dim = maybe_wrap_dim(dim, self.dim());
  int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim);

  AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "cumprod_out_cpu", [&] {
    cpu_cum_base_kernel<scalar_t>(result, self, wrap_dim, [&] (
      scalar_t* result_data, auto result_dim_stride,
      const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) {
        // NOLINTNEXTLINE(bugprone-signed-char-misuse)
        auto cum_number = (at::acc_type<scalar_t, false>)init_val;
        for (const auto i : c10::irange(self_dim_size)) {
          cum_number *= self_data[i * self_dim_stride];
          result_data[i * result_dim_stride] = (scalar_t)cum_number;
        }
      }, /*init_val=*/ 1
    );
  });
}

static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) {
  auto wrap_dim = maybe_wrap_dim(dim, self.dim());
  int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim);

  AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "logcumsumexp_out_cpu", [&] {
    cpu_cum_base_kernel<scalar_t>(result, self, wrap_dim, [&] (
      scalar_t* result_data, auto result_dim_stride,
      const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) {
        scalar_t cum_number = (at::acc_type<scalar_t, false>)init_val;
        for (const auto i : c10::irange(self_dim_size)) {
          scalar_t x = self_data[i * self_dim_stride];

          // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
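          // log_add_exp(x, y) evaluates log(exp(x) + exp(y)) in a numerically
          // stable way:
          //   log(exp(x) + exp(y)) = max(x, y) + log1p(exp(min(x, y) - max(x, y)))
          // so exp() is only ever applied to a non-positive argument and cannot
          // overflow.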
          auto log_add_exp = [](scalar_t x, scalar_t y) -> scalar_t {
            // std::min/std::max return their first argument when one of the
            // arguments is nan; checking y explicitly keeps nan propagating
            // through the running total.
            scalar_t min = std::isnan(y) ? y : std::min(x, y);
            scalar_t max = std::isnan(y) ? y : std::max(x, y);
            if (min != max || std::isfinite(min)) {
              // nan will be propagated here
              return std::log1p(std::exp(min - max)) + max;
            } else {
              // special case to correctly handle infinite cases
              return x;
            }
          };
          cum_number = log_add_exp(x, cum_number);
          result_data[i * result_dim_stride] = static_cast<scalar_t>(cum_number);
        }
      }, /*init_val=*/ -std::numeric_limits<scalar_t>::infinity()
    );
  });
}

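// mean is computed as a scaled sum: `factor` is
// num_output_elements / numel, i.e. the reciprocal of the number of input
// elements reduced into each output element, and MeanOps applies it to the
// accumulated sum when projecting the final result.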
static void mean_kernel_impl(TensorIterator& iter) {
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "mean_cpu", [&] {
    scalar_t factor = scalar_t(iter.num_output_elements()) / scalar_t(iter.numel());
    binary_kernel_reduce(
      iter,
      MeanOps<scalar_t, scalar_t> {factor},
      scalar_t(0)
    );
  });
}

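// std/var use Welford's online algorithm (WelfordOps), updating the running
// mean and M2 in double precision in a single pass. `correction` is the
// degrees-of-freedom adjustment (0 for the biased estimator, 1 for Bessel's
// correction) and `take_sqrt` selects std over var.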
static void std_var_kernel_impl(TensorIterator& iter, int64_t correction, bool take_sqrt) {
  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "std_cpu", [&] {
    binary_kernel_reduce(
        iter,
        WelfordOps<
            scalar_t,
            double,
            int64_t,
            double,
            std::tuple<scalar_t, scalar_t>>{correction, take_sqrt},
        WelfordData<double, int64_t, double>());
  });
}

static void prod_kernel_impl(TensorIterator& iter) {
  // Workaround for the error: '*' in boolean context, suggest '&&' instead
  // [-Werror=int-in-bool-context]
  if (iter.dtype() == ScalarType::Bool) {
    using scalar_t = bool;
    binary_kernel_reduce_vec(
        iter,
        [=](scalar_t a, scalar_t b)
            __ubsan_ignore_undefined__ -> scalar_t { return a && b; },
        [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
            __ubsan_ignore_undefined__ { return a && b; },
        // NOLINTNEXTLINE(bugprone-argument-comment)
        /*identity=*/1);
  } else {
    AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] {
      binary_kernel_reduce_vec(
          iter,
          [=](scalar_t a, scalar_t b)
              __ubsan_ignore_undefined__ -> scalar_t { return a * b; },
          [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
              __ubsan_ignore_undefined__ { return a * b; },
          // NOLINTNEXTLINE(bugprone-argument-comment)
          /*identity=*/1);
    });
  }
}

static void norm_kernel_tensor_iterator_impl(
    TensorIterator& iter,
    const Scalar& p) {
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  float val;
  if (p.isIntegral(false)) {
    val = p.to<int64_t>();
  } else if (p.isFloatingPoint()) {
    // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
    val = p.to<double>();
  } else {
    AT_ERROR("norm_kernel_tensor_iterator_impl expects norm to be integer or float");
  }

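  // Each branch below selects a specialized reduction functor for the value
  // of p:
  //   p == 0    -> count of non-zero elements   (NormZeroOps)
  //   p == 1    -> sum of |x_i|                 (NormOneOps)
  //   p == 2    -> sqrt(sum of |x_i|^2)         (NormTwoOps)
  //   p == inf  -> max |x_i|                    (AbsMaxOps)
  //   p == -inf -> min |x_i|                    (AbsMinOps)
  //   otherwise -> (sum of |x_i|^p)^(1/p)       (NormOps)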
  // In the dispatch blocks below, the reduction kernels accumulate results as
  // type `acc_t`. When `scalar_t` is complex, `acc_t` is the corresponding
  // real value type (e.g. float for c10::complex<float>); otherwise `acc_t`
  // and `scalar_t` are the same type.
  if (val == 0) {
    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
      using acc_t = typename scalar_value_type<scalar_t>::type;
      binary_kernel_reduce(
        iter,
        NormZeroOps<scalar_t, acc_t>(),
        acc_t(0)
      );
    });
  } else if (val == 1) {
    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
      using acc_t = typename scalar_value_type<scalar_t>::type;
      binary_kernel_reduce(
        iter,
        NormOneOps<scalar_t, acc_t>(),
        acc_t(0)
      );
    });
  } else if (val == 2) {
    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
      using acc_t = typename scalar_value_type<scalar_t>::type;
      binary_kernel_reduce(
        iter,
        NormTwoOps<scalar_t, acc_t>(),
        acc_t(0)
      );
    });
  } else if (val == INFINITY) {
    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
      using acc_t = typename scalar_value_type<scalar_t>::type;
      binary_kernel_reduce(
        iter,
        AbsMaxOps<scalar_t, acc_t>(),
        acc_t(0)
      );
    });
  } else if (val == -INFINITY) {
    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
      using acc_t = typename scalar_value_type<scalar_t>::type;
      binary_kernel_reduce(
        iter,
        AbsMinOps<scalar_t, acc_t>(),
        std::numeric_limits<acc_t>::max()
      );
    });
  } else {
    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
      using acc_t = typename scalar_value_type<scalar_t>::type;
      binary_kernel_reduce(
        iter,
        NormOps<scalar_t, acc_t> { acc_t(val) },
        acc_t(0)
      );
    });
  }

  // For complex outputs, the above kernels do not touch the imaginary values,
  // so we must zero them out
  if (isComplexType(iter.output().scalar_type())) {
    at::imag(iter.output()).zero_();
  }
}

static void and_kernel_impl(TensorIterator& iter) {
  if (iter.dtype() == ScalarType::Byte) {
    // Refer [all, any : uint8 compatibility]
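    // (all()/any() historically also accept uint8 inputs and produce a uint8
    // 0/1 result rather than a bool, which is why this branch reduces with
    // uint8 lambdas.)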
    binary_kernel_reduce_vec(
        iter,
        [=](uint8_t a, uint8_t b) -> uint8_t { return (a && b) ? 1 : 0; },
        [=](Vectorized<uint8_t> a, Vectorized<uint8_t> b) {
          Vectorized<uint8_t> c = Vectorized<uint8_t>();

          for (decltype(c.size()) i = 0; i != Vectorized<uint8_t>::size(); i++) {
            c[i] = (a[i] && b[i]) ? 1 : 0;
          }
          return c;
        },
        /*ident=*/true);
  } else {
    binary_kernel_reduce_vec(
        iter,
        [=](bool a, bool b) -> bool { return a && b; },
        [=](Vectorized<bool> a, Vectorized<bool> b) {
          // Adding the implementation here instead of in vec256_base to avoid
          // return value inconsistency. Other comparison operators in
          // vec256_base return -1/0 (all bits 1 / all bits 0) as true/false to
          // follow the AVX2 convention. This would be convenient when combined
          // with other vectorized operations. For example, one can use the
          // logical operation results as a mask for a bit operation to
          // retrieve/reset multiple elements in a vector.
          //
          // In this method, users would expect, e.g., all(), to return 1/0 as
          // true/false.
          Vectorized<bool> c = Vectorized<bool>();

          for (decltype(c.size()) i = 0; i != Vectorized<bool>::size(); i++) {
            c[i] = a[i] && b[i];
          }
          return c;
        },
        /*ident=*/true);
  }
}

static void or_kernel_impl(TensorIterator& iter) {
  if (iter.dtype() == ScalarType::Byte) {
    // Refer [all, any : uint8 compatibility]
    binary_kernel_reduce_vec(
        iter,
        [=](uint8_t a, uint8_t b) -> uint8_t { return (a || b) ? 1 : 0; },
        [=](Vectorized<uint8_t> a, Vectorized<uint8_t> b) {
          Vectorized<uint8_t> c = Vectorized<uint8_t>();

          for (decltype(c.size()) i = 0; i != Vectorized<uint8_t>::size(); i++) {
            c[i] = (a[i] || b[i]) ? 1 : 0;
          }
          return c;
        },
        /*ident=*/false);
  } else {
    binary_kernel_reduce_vec(
        iter,
        [=](bool a, bool b) -> bool { return a || b; },
        [=](Vectorized<bool> a, Vectorized<bool> b) {
          Vectorized<bool> c = Vectorized<bool>();

          for (decltype(c.size()) i = 0; i != Vectorized<bool>::size(); i++) {
            c[i] = a[i] || b[i];
          }
          return c;
        },
        /*ident=*/false);
  }
}

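// MinOps accumulates a (value, index) pair (it also backs the index-returning
// min reduction); min_values only needs the value, so MinValuesOps reuses the
// same reduce step and projects out just `arg.first`.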
template<typename scalar_t>
struct MinValuesOps: public at::native::MinOps<scalar_t> {
  using arg_t = typename MinOps<scalar_t>::arg_t;
  static scalar_t project(arg_t arg) {
    return arg.first;
  }
};

static void min_values_kernel_impl(TensorIterator& iter) {
  if (iter.dtype() == kLong) {
    // This case is special because Vectorized<int64_t> does not
    // handle upper_bound<int64_t>().
    // See: https://github.com/pytorch/pytorch/issues/43254
    using scalar_t = int64_t;
    binary_kernel_reduce(
      iter,
      MinValuesOps<scalar_t>{},
      std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
    return;
  }
  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cpu", [&iter] {
    binary_kernel_reduce_vec(
      iter,
      [](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
      static_cast<double>(upper_bound<scalar_t>()));
  });
}

static void max_values_kernel_impl(TensorIterator& iter) {
  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] {
    binary_kernel_reduce_vec(
      iter,
      [](scalar_t a, scalar_t b) -> scalar_t { return max_impl(a, b); },
      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return maximum(a, b); },
      lower_bound<scalar_t>());
  });
}

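// argmax/argmin accumulate (value, index) pairs, starting from the most
// extreme representable value (lower_bound for argmax, upper_bound for argmin)
// paired with index 0. The dispatch keys off iter.dtype(1), the input dtype,
// because the output dtype of these reductions is always int64.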
static void argmax_kernel_impl(TensorIterator &iter) {
  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(1), "argmax_cpu", [&] {
    binary_kernel_reduce(
      iter,
      ArgMaxOps<scalar_t>{},
      std::pair<scalar_t, int64_t>(lower_bound<scalar_t>(), 0));
  });
}

static void argmin_kernel_impl(TensorIterator &iter) {
  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(1), "argmin_cpu", [&] {
    binary_kernel_reduce(
      iter,
      ArgMinOps<scalar_t>{},
      std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), 0));
  });
}

} // anonymous namespace

REGISTER_DISPATCH(std_var_stub, &std_var_kernel_impl);
REGISTER_DISPATCH(prod_stub, &prod_kernel_impl);
REGISTER_DISPATCH(mean_stub, &mean_kernel_impl);
REGISTER_DISPATCH(norm_stub, &norm_kernel_tensor_iterator_impl);
REGISTER_DISPATCH(and_stub, &and_kernel_impl);
REGISTER_DISPATCH(or_stub, &or_kernel_impl);
REGISTER_DISPATCH(min_values_stub, &min_values_kernel_impl);
REGISTER_DISPATCH(max_values_stub, &max_values_kernel_impl);
REGISTER_DISPATCH(argmax_stub, &argmax_kernel_impl);
REGISTER_DISPATCH(argmin_stub, &argmin_kernel_impl);
REGISTER_DISPATCH(cumprod_stub, &cumprod_cpu_kernel);
REGISTER_DISPATCH(cumsum_stub, &cumsum_cpu_kernel);
REGISTER_DISPATCH(logcumsumexp_stub, &logcumsumexp_cpu_kernel);

}} // namespace at::native