Roll back float16 (half) support in Range and Linspace.

PiperOrigin-RevId: 275572836
Change-Id: I7987a3bdfdefe0e29f50bd50e044cdf4815de6d5
diff --git a/tensorflow/core/framework/ops_util.h b/tensorflow/core/framework/ops_util.h
index 7e4f169..feaab10 100644
--- a/tensorflow/core/framework/ops_util.h
+++ b/tensorflow/core/framework/ops_util.h
@@ -111,44 +111,6 @@
   return strides;
 }
 
-namespace internal {
-// Overloads to circumvent the problem that the C++ standard library does not
-// support half and Eigen does not support bfloat16.
-template <typename T>
-inline T tceil(const T& x) {
-  return std::ceil(x);
-}
-
-template <>
-inline Eigen::half tceil(const Eigen::half& x) {
-  return Eigen::numext::ceil(x);
-}
-}  // namespace internal
-
-// Returns the number of elements generated by RangeOp with the given
-// start, limit, and delta arguments and type T.
-template <typename T>
-Status RangeSize(T start, T limit, T delta, int64* size) {
-  if (start > limit && delta > T(0)) {
-    return errors::InvalidArgument(
-        "Requires start <= limit when delta > 0: ", start, "/", limit);
-  }
-  if (start < limit && delta < T(0)) {
-    return errors::InvalidArgument(
-        "Requires start >= limit when delta < 0: ", start, "/", limit);
-  }
-  if (delta == T(0)) {
-    return errors::InvalidArgument("Requires delta != 0");
-  }
-  const T abs_size = delta > T(0) ? limit - start : start - limit;
-  const T abs_delta = delta > T(0) ? delta : -delta;
-  *size = std::is_integral<T>::value
-              ? static_cast<int64>((abs_size + abs_delta - T(1)) / abs_delta)
-              : static_cast<int64>(internal::tceil<T>(abs_size / abs_delta));
-
-  return Status::OK();
-}
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_FRAMEWORK_OPS_UTIL_H_
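
For reference, the size rule that the deleted RangeSize helper encoded, and that the Range kernel below reinstates inline, is ceil(|limit - start| / |delta|): exact ceiling division for integral types, std::ceil of the quotient for floating-point types. A minimal Python sketch of that rule follows; range_size here is an illustrative stand-in, not a TensorFlow API:

    import math

    def range_size(start, limit, delta, is_integral):
        # Validation mirrors the helper: delta must be nonzero and must
        # point from start toward limit.
        if delta == 0:
            raise ValueError("Requires delta != 0")
        if start > limit and delta > 0:
            raise ValueError("Requires start <= limit when delta > 0")
        if start < limit and delta < 0:
            raise ValueError("Requires start >= limit when delta < 0")
        abs_size = limit - start if delta > 0 else start - limit
        abs_delta = abs(delta)
        if is_integral:
            # Ceiling division without going through floating point.
            return (abs_size + abs_delta - 1) // abs_delta
        return int(math.ceil(abs_size / abs_delta))

    assert range_size(0, 6, 2, True) == 3     # [0, 2, 4]
    assert range_size(0, 3, 0.9, False) == 4  # [0, 0.9, 1.8, 2.7]
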
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index 4d54c9c..02dcc1e 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -18,7 +18,6 @@
 #include <cmath>
 
 #include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/ops_util.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -49,8 +48,23 @@
     const T start = start_in.scalar<T>()();
     const T limit = limit_in.scalar<T>()();
     const T delta = delta_in.scalar<T>()();
-    int64 size;
-    OP_REQUIRES_OK(context, RangeSize<T>(start, limit, delta, &size));
+    OP_REQUIRES(context, delta != 0,
+                errors::InvalidArgument("Requires delta != 0: ", delta));
+    if (delta > 0) {
+      OP_REQUIRES(
+          context, start <= limit,
+          errors::InvalidArgument(
+              "Requires start <= limit when delta > 0: ", start, "/", limit));
+    } else {
+      OP_REQUIRES(
+          context, start >= limit,
+          errors::InvalidArgument(
+              "Requires start >= limit when delta < 0: ", start, "/", limit));
+    }
+    int64 size = (std::is_integral<T>::value
+                      ? ((std::abs(limit - start) + std::abs(delta) - 1) /
+                         std::abs(delta))
+                      : std::ceil(std::abs((limit - start) / delta)));
     Tensor* out = nullptr;
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, TensorShape({size}), &out));
@@ -84,7 +98,6 @@
 #undef REGISTER_SYCL_KERNEL
 #endif  // TENSORFLOW_USE_SYCL
 
-TF_CALL_half(REGISTER_CPU_KERNEL);
 TF_CALL_float(REGISTER_CPU_KERNEL);
 TF_CALL_double(REGISTER_CPU_KERNEL);
 TF_CALL_int32(REGISTER_CPU_KERNEL);
@@ -92,7 +105,6 @@
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
-TF_CALL_half(REGISTER_GPU_KERNEL);
 TF_CALL_float(REGISTER_GPU_KERNEL);
 TF_CALL_double(REGISTER_GPU_KERNEL);
 TF_CALL_int32(REGISTER_GPU_KERNEL);
@@ -134,8 +146,7 @@
     flat(0) = start;
     if (num > 1) {
       const T step = (stop - start) / (num - 1);
-      for (Tnum i = 1; i < num - 1; ++i)
-        flat(i) = start + step * static_cast<T>(i);
+      for (Tnum i = 1; i < num - 1; ++i) flat(i) = start + step * i;
       // Ensure final value == stop; float arithmetic won't guarantee this.
       flat(num - 1) = stop;
     }
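
The loop reconstructs interior points as start + step * i, but the final element is pinned to stop because recomputing the endpoint from the rounded step drifts in float arithmetic. A NumPy sketch of the drift, using the (.1, .2, 4) case exercised by testEndpointsAreExact below:

    import numpy as np

    # Reconstructing the endpoint from the step misses "stop" by one ulp
    # in float32, which is why the kernel assigns flat(num - 1) = stop.
    start, stop, num = np.float32(0.1), np.float32(0.2), 4
    step = (stop - start) / np.float32(num - 1)
    reconstructed = start + step * np.float32(num - 1)
    print(reconstructed == stop)  # False
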
@@ -158,14 +169,12 @@
   REGISTER_KERNEL(dev, T, int64)
 
 #define REGISTER_CPU_KERNEL(T) REGISTER_KERNEL_ALL_NUMS(DEVICE_CPU, T)
-TF_CALL_half(REGISTER_CPU_KERNEL);
 TF_CALL_float(REGISTER_CPU_KERNEL);
 TF_CALL_double(REGISTER_CPU_KERNEL);
 
 // NOTE(touts): We register the op on GPU but it still runs on CPU
 // because its inputs and outputs are tagged as HostMemory.
 #define REGISTER_GPU_KERNEL(T) REGISTER_KERNEL_ALL_NUMS(DEVICE_GPU, T)
-TF_CALL_half(REGISTER_GPU_KERNEL);
 TF_CALL_float(REGISTER_GPU_KERNEL);
 TF_CALL_double(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 0654f15..7d6aedb 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -16,7 +16,6 @@
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/ops_util.h"
 #include "tensorflow/core/framework/shape_inference.h"
 
 namespace tensorflow {
@@ -1390,14 +1389,27 @@
 namespace {
 
 template <typename T>
-Status SetRangeSizeFromTensors(const Tensor& start_t, const Tensor& limit_t,
-                               const Tensor& delta_t,
-                               InferenceContext* const c) {
-  const T start = start_t.scalar<T>()();
-  const T limit = limit_t.scalar<T>()();
-  const T delta = delta_t.scalar<T>()();
-  int64 size;
-  TF_RETURN_IF_ERROR(RangeSize(start, limit, delta, &size));
+Status RangeSize(const Tensor* start_t, const Tensor* limit_t,
+                 const Tensor* delta_t, InferenceContext* const c) {
+  T start = start_t->scalar<T>()();
+  T limit = limit_t->scalar<T>()();
+  T delta = delta_t->scalar<T>()();
+  if (start > limit && delta > T(0)) {
+    return errors::InvalidArgument(
+        "Requires start <= limit when delta > 0: ", start, "/", limit);
+  }
+  if (start < limit && delta < T(0)) {
+    return errors::InvalidArgument(
+        "Requires start >= limit when delta < 0: ", start, "/", limit);
+  }
+  if (delta == T(0)) {
+    return errors::InvalidArgument("Requires delta != 0");
+  }
+
+  auto size = (std::is_integral<T>::value
+                   ? ((std::abs(limit - start) + std::abs(delta) - T(1)) /
+                      std::abs(delta))
+                   : (std::ceil(std::abs((limit - start) / delta))));
   c->set_output(0, c->Vector(static_cast<int64>(size)));
   return Status::OK();
 }
@@ -1409,7 +1421,7 @@
     .Input("limit: Tidx")
     .Input("delta: Tidx")
     .Output("output: Tidx")
-    .Attr("Tidx: {bfloat16, half, float, double, int32, int64} = DT_INT32")
+    .Attr("Tidx: {bfloat16, float, double, int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_WITH_CONTEXT_IF_ERROR(c->WithRank(c->input(0), 0, &unused),
@@ -1428,19 +1440,15 @@
         return Status::OK();
       }
       if (dtype == DT_INT32) {
-        return SetRangeSizeFromTensors<int32>(*start_t, *limit_t, *delta_t, c);
+        return RangeSize<int32>(start_t, limit_t, delta_t, c);
       } else if (dtype == DT_INT64) {
-        return SetRangeSizeFromTensors<int64>(*start_t, *limit_t, *delta_t, c);
+        return RangeSize<int64>(start_t, limit_t, delta_t, c);
       } else if (dtype == DT_FLOAT) {
-        return SetRangeSizeFromTensors<float>(*start_t, *limit_t, *delta_t, c);
+        return RangeSize<float>(start_t, limit_t, delta_t, c);
       } else if (dtype == DT_DOUBLE) {
-        return SetRangeSizeFromTensors<double>(*start_t, *limit_t, *delta_t, c);
+        return RangeSize<double>(start_t, limit_t, delta_t, c);
       } else if (dtype == DT_BFLOAT16) {
-        return SetRangeSizeFromTensors<bfloat16>(*start_t, *limit_t, *delta_t,
-                                                 c);
-      } else if (dtype == DT_HALF) {
-        return SetRangeSizeFromTensors<Eigen::half>(*start_t, *limit_t,
-                                                    *delta_t, c);
+        return RangeSize<bfloat16>(start_t, limit_t, delta_t, c);
       } else {
         return errors::InvalidArgument("Unsupported dtype", dtype);
       }
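
Because the shape function evaluates start, limit, and delta whenever they are graph constants, the output shape is fully known statically. A quick check against NumPy, in the spirit of the _Range test helper below (values taken from testNonInteger):

    import numpy as np
    import tensorflow as tf

    # With constant inputs, Range's shape function computes the exact
    # element count, so the static shape matches np.arange's length.
    out = tf.range(0.0, 5.0, 2.5)         # float32; half is no longer accepted
    print(out.shape)                      # (2,), i.e. [0.0, 2.5]
    print(len(np.arange(0.0, 5.0, 2.5)))  # 2
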
@@ -1452,7 +1460,7 @@
     .Input("stop: T")
     .Input("num: Tidx")
     .Output("output: T")
-    .Attr("T: {bfloat16, half, float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index c2d1ce7..3822b4b 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -457,61 +457,63 @@
 # TODO(vrv): move to sequence_ops_test?
 class RangeTest(test.TestCase):
 
-  def _Range(self, start, limit=None, delta=1, expected=None):
-    expected = expected or []
-    for dtype in [np.int32, np.int64, np.float32, np.float64, np.float16]:
-      with self.session(use_gpu=True):
-        cast_start = math_ops.cast(start, dtype=dtype)
-        if limit is None:
-          tf_ans = math_ops.range(cast_start, dtype=dtype, name="range")
-        else:
-          if dtype in [np.int32, np.int64] and np.floor(delta) != delta:
-            continue
-          cast_limit = math_ops.cast(limit, dtype=dtype)
-          cast_delta = math_ops.cast(delta, dtype=dtype)
-          tf_ans = math_ops.range(
-              cast_start, cast_limit, cast_delta, name="range")
-        tf_val = self.evaluate(tf_ans)
-        self.assertAllClose(tf_val, np.array(expected, dtype=dtype))
+  def _Range(self, start, limit, delta):
+    with self.cached_session(use_gpu=True):
+      tf_ans = math_ops.range(start, limit, delta, name="range")
+      self.assertEqual([len(np.arange(start, limit, delta))],
+                       tf_ans.get_shape())
+      return self.evaluate(tf_ans)
 
   def testBasic(self):
-    self._Range(0, 5, 1, [0, 1, 2, 3, 4])
-    self._Range(0, 5, 2, [0, 2, 4])
-    self._Range(0, 6, 2, [0, 2, 4])
-    self._Range(13, 32, 7, [13, 20, 27])
-    self._Range(100, 500, 100, [100, 200, 300, 400])
+    self.assertTrue(
+        np.array_equal(self._Range(0, 5, 1), np.array([0, 1, 2, 3, 4])))
+    self.assertTrue(np.array_equal(self._Range(0, 5, 2), np.array([0, 2, 4])))
+    self.assertTrue(np.array_equal(self._Range(0, 6, 2), np.array([0, 2, 4])))
+    self.assertTrue(
+        np.array_equal(self._Range(13, 32, 7), np.array([13, 20, 27])))
+    self.assertTrue(
+        np.array_equal(
+            self._Range(100, 500, 100), np.array([100, 200, 300, 400])))
+    self.assertEqual(math_ops.range(0, 5, 1).dtype, dtypes.int32)
 
   @test_util.run_deprecated_v1
   def testLimitOnly(self):
-    self._Range(5, expected=[0, 1, 2, 3, 4])
+    with self.session(use_gpu=True):
+      self.assertAllEqual(np.arange(5), math_ops.range(5).eval())
 
   def testEmpty(self):
     for start in 0, 5:
-      self._Range(start, start, 1, [])
+      self.assertTrue(np.array_equal(self._Range(start, start, 1), []))
 
-  def testNonIntegerDelta(self):
-    self._Range(0, 2, 0.5, [0, 0.5, 1, 1.5])
-    self._Range(0, 5, 2.5, [0, 2.5])
-    self._Range(0, 3, 0.9, [0, 0.9, 1.8, 2.7])
-    self._Range(100., 500., 100., [100, 200, 300, 400])
+  def testNonInteger(self):
+    self.assertTrue(
+        np.allclose(self._Range(0, 2, 0.5), np.array([0, 0.5, 1, 1.5])))
+    self.assertTrue(np.allclose(self._Range(0, 5, 2.5), np.array([0, 2.5])))
+    self.assertTrue(
+        np.allclose(self._Range(0, 3, 0.9), np.array([0, 0.9, 1.8, 2.7])))
+    self.assertTrue(
+        np.allclose(
+            self._Range(100., 500., 100.), np.array([100, 200, 300, 400])))
+    self.assertEqual(math_ops.range(0., 5., 1.).dtype, dtypes.float32)
 
   def testNegativeDelta(self):
-    self._Range(5, -1, -1, [5, 4, 3, 2, 1, 0])
-    self._Range(2.5, 0, -0.5, [2.5, 2, 1.5, 1, 0.5])
-    self._Range(-5, -10, -3, [-5, -8])
+    self.assertTrue(
+        np.array_equal(self._Range(5, -1, -1), np.array([5, 4, 3, 2, 1, 0])))
+    self.assertTrue(
+        np.allclose(self._Range(2.5, 0, -0.5), np.array([2.5, 2, 1.5, 1, 0.5])))
+    self.assertTrue(
+        np.array_equal(self._Range(-5, -10, -3), np.array([-5, -8])))
 
   def testDType(self):
     zero_int32 = math_ops.cast(0, dtypes.int32)
     zero_int64 = math_ops.cast(0, dtypes.int64)
     zero_float32 = math_ops.cast(0, dtypes.float32)
     zero_float64 = math_ops.cast(0, dtypes.float64)
-    zero_half = math_ops.cast(0, dtypes.half)
 
     self.assertEqual(math_ops.range(zero_int32, 0, 1).dtype, dtypes.int32)
     self.assertEqual(math_ops.range(zero_int64, 0, 1).dtype, dtypes.int64)
     self.assertEqual(math_ops.range(zero_float32, 0, 1).dtype, dtypes.float32)
     self.assertEqual(math_ops.range(zero_float64, 0, 1).dtype, dtypes.float64)
-    self.assertEqual(math_ops.range(zero_half, 0, 1).dtype, dtypes.half)
 
     self.assertEqual(
         math_ops.range(zero_int32, zero_int64, 1).dtype, dtypes.int64)
@@ -545,52 +547,66 @@
     else:
       return [False]
 
-  def _test_linspace(self, start, stop, num, expected):
-    for idx_type in [np.int32, np.int64]:
-      for dtype in [np.float32, np.float64, np.float16]:
-        with ops.Graph().as_default() as graph:
-          with self.session(graph=graph, force_gpu=self.force_gpu):
-            cast_start = math_ops.cast(start, dtype=dtype)
-            cast_stop = math_ops.cast(stop, dtype=dtype)
-            cast_num = math_ops.cast(num, dtype=idx_type)
-            tf_ans = math_ops.linspace(
-                cast_start, cast_stop, cast_num, name="linspace")
-            self.assertEqual([num], tf_ans.get_shape())
-            tf_val = self.evaluate(tf_ans).astype(dtype)
-            cast_expected = np.array(expected, dtype=dtype)
-            tol = 1e-3 if dtype == np.float16 else 1e-6
-            self.assertAllClose(tf_val, cast_expected, rtol=tol)
-            # Endpoints should be exact.
-            self.assertEqual(tf_val[0], cast_expected[0])
-            self.assertEqual(tf_val[-1], cast_expected[-1])
+  def _LinSpace(self, start, stop, num):
+    with ops.Graph().as_default() as graph:
+      with self.session(graph=graph, force_gpu=self.force_gpu):
+        tf_ans = math_ops.linspace(start, stop, num, name="linspace")
+        self.assertEqual([num], tf_ans.get_shape())
+        return self.evaluate(tf_ans)
 
   def testPositive(self):
     for self.force_gpu in self._gpu_modes():
-      self._test_linspace(1., 5., 1, [1.])
-      self._test_linspace(1., 5., 2, [1., 5.])
-      self._test_linspace(1., 5., 3, [1., 3., 5.])
-      self._test_linspace(1., 5., 4, [1., 7. / 3., 11. / 3., 5.])
+      self.assertArrayNear(self._LinSpace(1., 5., 1), np.array([1.]), 1e-5)
+      self.assertArrayNear(self._LinSpace(1., 5., 2), np.array([1., 5.]), 1e-5)
+      self.assertArrayNear(
+          self._LinSpace(1., 5., 3), np.array([1., 3., 5.]), 1e-5)
+      self.assertArrayNear(
+          self._LinSpace(1., 5., 4), np.array([1., 7. / 3., 11. / 3., 5.]),
+          1e-5)
 
   def testNegative(self):
     for self.force_gpu in self._gpu_modes():
-      self._test_linspace(-1., -5., 1, [-1.])
-      self._test_linspace(-1., -5., 2, [-1., -5.])
-      self._test_linspace(-1., -5., 3, [-1., -3., -5.])
-      self._test_linspace(-1., -5., 4, [-1., -7. / 3., -11. / 3., -5.])
+      self.assertArrayNear(self._LinSpace(-1., -5., 1), np.array([-1.]), 1e-5)
+      self.assertArrayNear(
+          self._LinSpace(-1., -5., 2), np.array([-1., -5.]), 1e-5)
+      self.assertArrayNear(
+          self._LinSpace(-1., -5., 3), np.array([-1., -3., -5.]), 1e-5)
+      self.assertArrayNear(
+          self._LinSpace(-1., -5., 4),
+          np.array([-1., -7. / 3., -11. / 3., -5.]), 1e-5)
 
   def testNegativeToPositive(self):
     for self.force_gpu in self._gpu_modes():
-      self._test_linspace(-1., 5., 1, [-1.])
-      self._test_linspace(-1., 5., 2, [-1., 5.])
-      self._test_linspace(-1., 5., 3, [-1., 2., 5.])
-      self._test_linspace(-1., 5., 4, [-1., 1., 3., 5.])
+      self.assertArrayNear(self._LinSpace(-1., 5., 1), np.array([-1.]), 1e-5)
+      self.assertArrayNear(
+          self._LinSpace(-1., 5., 2), np.array([-1., 5.]), 1e-5)
+      self.assertArrayNear(
+          self._LinSpace(-1., 5., 3), np.array([-1., 2., 5.]), 1e-5)
+      self.assertArrayNear(
+          self._LinSpace(-1., 5., 4), np.array([-1., 1., 3., 5.]), 1e-5)
 
   def testPoint(self):
     for self.force_gpu in self._gpu_modes():
-      self._test_linspace(5., 5., 1, [5.])
-      self._test_linspace(5., 5., 2, [5.] * 2)
-      self._test_linspace(5., 5., 3, [5.] * 3)
-      self._test_linspace(5., 5., 4, [5.] * 4)
+      self.assertArrayNear(self._LinSpace(5., 5., 1), np.array([5.]), 1e-5)
+      self.assertArrayNear(self._LinSpace(5., 5., 2), np.array([5.] * 2), 1e-5)
+      self.assertArrayNear(self._LinSpace(5., 5., 3), np.array([5.] * 3), 1e-5)
+      self.assertArrayNear(self._LinSpace(5., 5., 4), np.array([5.] * 4), 1e-5)
+
+  def testEndpointsAreExact(self):
+    for self.force_gpu in self._gpu_modes():
+      # Test some cases that produce last values not equal to "stop" when
+      # computed via start + (num - 1) * ((stop - start) / (num - 1)), since
+      # float arithmetic will introduce error through precision loss.
+      self.assertAllEqual(
+          self._LinSpace(0., 1., 42)[[0, -1]], np.array([0., 1.], np.float32))
+      self.assertAllEqual(
+          self._LinSpace(-1., 0., 42)[[0, -1]], np.array([-1., 0.], np.float32))
+      self.assertAllEqual(
+          self._LinSpace(.1, .2, 4)[[0, -1]], np.array([.1, .2], np.float32))
+      # Check a case for float64 error too.
+      self.assertAllEqual(
+          self._LinSpace(np.array(0., np.float64), .1, 12)[[0, -1]],
+          np.array([0., .1], np.float64))
 
 
 class DeviceTest(test.TestCase):
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 547e8ba..e80514a 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1432,8 +1432,7 @@
     # infer dtype if not explicitly provided
     if dtype is None:
       dtype_hierarchy = [
-          dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64,
-          dtypes.half
+          dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64
       ]
       assert all(arg.dtype in dtype_hierarchy for arg in [start, limit, delta])
       inferred_dtype = max([arg.dtype for arg in [start, limit, delta]],
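
With half dropped from dtype_hierarchy, range() now promotes only along int32 < int64 < float32 < float64, and a float16 argument trips the assert above. A minimal sketch of the promotion rule, reusing the module's own dtypes:

    from tensorflow.python.framework import dtypes

    # The inferred dtype is the argument dtype that sits furthest along
    # the hierarchy; mixing int32, float32, and int64 yields float32.
    dtype_hierarchy = [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]
    args = [dtypes.int32, dtypes.float32, dtypes.int64]  # hypothetical inputs
    inferred = max(args, key=dtype_hierarchy.index)
    print(inferred)  # <dtype: 'float32'>
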