#include <ATen/NativeFunctions.h>
#include <ATen/AccumulateType.h>
#include <ATen/Parallel.h>
#include <ATen/Dispatch.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <cmath>
#include <limits>

namespace at { namespace native {

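// The element-filling kernels live behind these dispatch stubs; the dispatch
// tables are defined at the bottom of this file and the vectorized CPU
// kernels register themselves against them via REGISTER_DISPATCH.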
DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&, const Scalar&, const Scalar&), arange_stub);
DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&, const Scalar&, int64_t), linspace_stub);

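// Out-of-place CPU path behind torch.linspace: writes `steps` evenly spaced
// values from `start` to `end` into `result`, e.g. linspace(0, 1, 5) ->
// {0, 0.25, 0.5, 0.75, 1}.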
Tensor& linspace_cpu_out(const Scalar& start, const Scalar& end, c10::optional<int64_t> optional_steps, Tensor& result) {
  const auto steps = optional_steps.value_or(100);
  TORCH_CHECK(steps >= 0, "number of steps must be non-negative");

  if (!optional_steps.has_value()) {
    TORCH_WARN_ONCE(
      "Not providing a value for linspace's steps is deprecated and will "
      "throw a runtime error in a future release. This warning will appear "
      "only once per process.");
  }

  if (result.numel() != steps) {
    result.resize_({steps});
  }

  if (steps == 0) {
    // skip
  } else if (steps == 1) {
    result.fill_(start);
  } else {
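    // The stub writes straight into dense memory, so fill a contiguous
    // buffer and copy back if `result` itself is non-contiguous.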
    Tensor r = result.is_contiguous() ? result : result.contiguous();
    auto iter = TensorIterator::nullary_op(r);
    linspace_stub(iter.device_type(), iter, start, end, steps);
    if (!result.is_contiguous()) {
      result.copy_(r);
    }
  }

  return result;
}

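// Out-of-place CPU path behind torch.logspace: fills `result` with
// base ** linspace(start, end, steps), e.g. with base 10,
// logspace(0, 3, 4) -> {1, 10, 100, 1000}.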
Tensor& logspace_cpu_out(const Scalar& start, const Scalar& end, c10::optional<int64_t> optional_steps, double base, Tensor& result) {
  const auto steps = optional_steps.value_or(100);
  TORCH_CHECK(steps >= 0, "number of steps must be non-negative");

  if (!optional_steps.has_value()) {
    TORCH_WARN_ONCE(
      "Not providing a value for logspace's steps is deprecated and will "
      "throw a runtime error in a future release. This warning will appear "
      "only once per process.");
  }

  if (result.numel() != steps) {
    result.resize_({steps});
  }
  Tensor r = result.is_contiguous() ? result : result.contiguous();

  if (steps == 0) {
    // skip
  } else if (steps == 1) {
    if (isComplexType(r.scalar_type())) {
      r.fill_(std::pow(base, start.to<c10::complex<double>>()));
    } else {
      r.fill_(std::pow(base, start.to<double>()));
    }
  } else if (isComplexType(r.scalar_type())) {
    AT_DISPATCH_COMPLEX_TYPES(r.scalar_type(), "logspace_cpu", [&]() {
      scalar_t scalar_base = static_cast<scalar_t>(base);
      scalar_t scalar_start = start.to<scalar_t>();
      scalar_t scalar_end = end.to<scalar_t>();
      scalar_t *data_ptr = r.data_ptr<scalar_t>();
      scalar_t step = (scalar_end - scalar_start) / static_cast<scalar_t>(steps - 1);
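      // Compute the first half forward from `start` and the second half
      // backward from `end`, so the last element is exactly pow(base, end).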
      const int64_t halfway = steps / 2;
      at::parallel_for(0, steps, internal::GRAIN_SIZE, [&](int64_t p_begin, int64_t p_end) {
        scalar_t is = static_cast<scalar_t>(p_begin);
        for (int64_t i = p_begin; i < p_end; ++i, is += 1) { // std::complex has no operator++
          if (i < halfway) {
            data_ptr[i] = std::pow(scalar_base, scalar_start + step * is);
          } else {
            data_ptr[i] = std::pow(scalar_base, scalar_end - (step * static_cast<scalar_t>(steps - i - 1)));
          }
        }
      });
    });
  } else {
    AT_DISPATCH_ALL_TYPES_AND(kBFloat16, r.scalar_type(), "logspace_cpu", [&]() {
      double scalar_base = static_cast<double>(base); // std::pow would promote the base to double anyway
      scalar_t scalar_start = start.to<scalar_t>();
      scalar_t scalar_end = end.to<scalar_t>();
      scalar_t *data_ptr = r.data_ptr<scalar_t>();
      double step = static_cast<double>(scalar_end - scalar_start) / (steps - 1);
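      // Same halfway split as the complex branch above: the second half is
      // computed backward from `end` so the final element is exact.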
      const int64_t halfway = steps / 2;
      at::parallel_for(0, steps, internal::GRAIN_SIZE, [&](int64_t p_begin, int64_t p_end) {
        for (int64_t i = p_begin; i < p_end; i++) {
          if (i < halfway) {
            data_ptr[i] = std::pow(scalar_base, scalar_start + step * i);
          } else {
            data_ptr[i] = std::pow(scalar_base, scalar_end - step * (steps - i - 1));
          }
        }
      });
    });
  }

  if (!result.is_contiguous()) {
    result.copy_(r);
  }
  return result;
}

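// CPU path behind torch.range. Unlike arange, range includes the endpoint:
// the output has trunc((end - start) / step) + 1 elements, e.g.
// range(0, 9, 3) -> {0, 3, 6, 9} whereas arange(0, 9, 3) -> {0, 3, 6}.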
Tensor& range_cpu_out(const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result) {
  AT_DISPATCH_ALL_TYPES(result.scalar_type(), "range_cpu", [&]() {
    using accscalar_t = at::acc_type<scalar_t, false>;
    auto xstart = start.to<accscalar_t>();
    auto xend = end.to<accscalar_t>();
    auto xstep = step.to<accscalar_t>();

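    // Note: (xstep > 0 || xstep < 0) also rejects a NaN step, which
    // (xstep != 0) would not, since every comparison against NaN is false.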
    TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
    TORCH_CHECK(std::isfinite(static_cast<double>(xstart)) &&
                std::isfinite(static_cast<double>(xend)),
                "unsupported range: ", xstart, " -> ", xend);
    TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
                "upper bound and lower bound inconsistent with step sign");
    int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
    if (result.numel() != size) {
      result.resize_({size});
    }
    Tensor r = result.is_contiguous() ? result : result.contiguous();
    scalar_t *data_ptr = r.data_ptr<scalar_t>();

    at::parallel_for(0, size, internal::GRAIN_SIZE, [&](int64_t p_begin, int64_t p_end) {
      scalar_t is = p_begin;
      for (int64_t i = p_begin; i < p_end; ++i, ++is) {
        data_ptr[i] = xstart + is * xstep;
      }
    });
    if (!result.is_contiguous()) {
      result.copy_(r);
    }
  });

  return result;
}

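// CPU path behind torch.arange: excludes the endpoint, producing
// ceil((end - start) / step) elements, e.g. arange(0, 10, 3) ->
// ceil(10 / 3) = 4 elements {0, 3, 6, 9}.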
Tensor& arange_cpu_out(const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result) {
  AT_DISPATCH_ALL_TYPES(result.scalar_type(), "arange_cpu", [&]() {
    using accscalar_t = at::acc_type<scalar_t, false>;
    auto xstart = start.to<accscalar_t>();
    auto xend = end.to<accscalar_t>();
    auto xstep = step.to<accscalar_t>();

    // Use double precision for (end - start) / step when computing size_d so
    // that the result is consistent across devices. With accscalar_t, the
    // accumulation type can be float32 on the GPU but double on the CPU for
    // the same float32 scalar_t, and the computed output size could then
    // differ between CPU and GPU purely because of precision, which we don't
    // want. The one corner case handled separately is int64_t, which has more
    // precision than double.
    double size_d;
    if (std::is_same<scalar_t, int64_t>::value) {
      size_d = std::ceil(static_cast<double>(end.to<accscalar_t>() - start.to<accscalar_t>())
                         / step.to<accscalar_t>());
    } else {
      size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
                         / step.to<double>());
    }

    TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
    TORCH_CHECK(std::isfinite(static_cast<double>(xstart)) &&
                std::isfinite(static_cast<double>(xend)),
                "unsupported range: ", xstart, " -> ", xend);
    TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
                "upper bound and lower bound inconsistent with step sign");

    TORCH_CHECK(size_d >= 0 && size_d <= static_cast<double>(std::numeric_limits<int64_t>::max()),
                "invalid size, possible overflow?");

    int64_t size = static_cast<int64_t>(size_d);
    int64_t numel = result.numel();

    if (numel != size) {
      if (numel > 0) {
        TORCH_WARN("The number of elements in the out tensor of shape ", result.sizes(),
                   " is ", numel, " which does not match the computed number of elements ", size,
                   ". Note that this may occur as a result of rounding error. "
                   "The out tensor will be resized to a tensor of shape (", size, ",).");
      }
      result.resize_({size});
    }

    Tensor r = result.is_contiguous() ? result : result.contiguous();
    auto iter = TensorIterator::nullary_op(r);
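    // The stub fills element i with start + i * step for each of the `size`
    // positions computed above.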
    arange_stub(iter.device_type(), iter, start, size, step);
    if (!result.is_contiguous()) {
      result.copy_(r);
    }
  });

  return result;
}

DEFINE_DISPATCH(arange_stub);
DEFINE_DISPATCH(linspace_stub);

}} // namespace at::native