// tensorflow/compiler/xla/tests/exhaustive_unary_test.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #include "tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h"

 #ifdef __FAST_MATH__
 #error "Can't be compiled with fast math on"
 #endif

 namespace xla {

 using Eigen::half;

 // Evaluates a polynomial at `x` using Horner's scheme.
 //
 // `coeffs` is ordered from the highest-degree coefficient down to the constant
 // term, i.e. for N coefficients the result is
 //   coeffs[0] * x^(N-1) + coeffs[1] * x^(N-2) + ... + coeffs[N-1].
 // An empty coefficient array yields 0.
 template <typename T, size_t N>
 T EvaluatePolynomial(T x, const std::array<T, N>& coeffs) {
   T accumulator = 0;
   for (size_t i = 0; i < N; ++i) {
     accumulator = accumulator * x + coeffs[i];
   }
   return accumulator;
 }

 // Host-side reference implementation of the inverse error function; the
 // standard library has no std::erfinv.  This follows Wichura 1998,
 // https://www.jstor.org/stable/2347330 which, notably, is a different
 // implementation from that in math.cc.
 //
 // Returns NaN for NaN inputs and for |x| > 1, and +/-infinity for x == +/-1.
 // Otherwise the result is a ratio of two degree-7 polynomials, chosen by
 // region, with the sign of `x` applied at the end.
 float HostErfInv(float x) {
   // Coefficient tables for EvaluatePolynomial.  A/B are the numerator and
   // denominator for |x| <= 0.85; C/D and E/F are used for the two tail regions.
   const std::array<double, 8> kPolyA = {
       8.8709406962545514830200e2, 1.1819493347062294404278e4,
       2.3782041382114385731252e4, 1.6235862515167575384252e4,
       4.8548868893843886794648e3, 6.9706266534389598238465e2,
       4.7072688112383978012285e1, 1.1975323115670912564578e0,
   };
   const std::array<double, 8> kPolyB = {
       5.2264952788528545610e3, 2.8729085735721942674e4, 3.9307895800092710610e4,
       2.1213794301586595867e4, 5.3941960214247511077e3, 6.8718700749205790830e2,
       4.2313330701600911252e1, 1.0000000000000000000e0,
   };
   const std::array<double, 8> kPolyC = {
       7.74545014278341407640e-4, 2.27238449892691845833e-2,
       2.41780725177450611770e-1, 1.27045825245236838258e0,
       3.64784832476320460504e0,  5.76949722146069140550e0,
       4.63033784615654529590e0,  1.42343711074968357734e0,
   };
   const std::array<double, 8> kPolyD = {
       1.4859850019840355905497876e-9, 7.7441459065157709165577218e-4,
       2.1494160384252876777097297e-2, 2.0945065210512749128288442e-1,
       9.7547832001787427186894837e-1, 2.3707661626024532365971225e0,
       2.9036514445419946173133295e0,  1.4142135623730950488016887e0,
   };
   const std::array<double, 8> kPolyE = {
       2.01033439929228813265e-7, 2.71155556874348757815e-5,
       1.24266094738807843860e-3, 2.65321895265761230930e-2,
       2.96560571828504891230e-1, 1.78482653991729133580e0,
       5.46378491116411436990e0,  6.65790464350110377720e0,
   };
   const std::array<double, 8> kPolyF = {
       2.891024605872965461538222e-15, 2.010321207683943062279931e-7,
       2.611088405080593625138020e-5,  1.112800997078859844711555e-3,
       2.103693768272068968719679e-2,  1.936480946950659106176712e-1,
       8.482908416595164588112026e-1,  1.414213562373095048801689e0,
   };

   // Deliberately a float: the tail branch below calls std::log1p on it.
   const float magnitude = std::abs(x);

   // Out-of-domain and boundary inputs.
   if (magnitude > 1 || std::isnan(x)) {
     return std::numeric_limits<float>::quiet_NaN();
   }
   if (magnitude == 1) {
     return std::copysign(std::numeric_limits<float>::infinity(), x);
   }

   double numerator;
   double denominator;
   if (magnitude <= 0.85) {
     // Central region.
     const double r = 0.180625 - 0.25 * magnitude * magnitude;
     numerator = magnitude * EvaluatePolynomial(r, kPolyA);
     denominator = EvaluatePolynomial(r, kPolyB);
   } else {
     // Tail regions, selected by r before it is re-centred.
     double r = std::sqrt(std::log(2.0) - std::log1p(-magnitude));
     const bool near_tail = r <= 5.0;
     r -= near_tail ? 1.6 : 5;
     numerator = EvaluatePolynomial(r, near_tail ? kPolyC : kPolyE);
     denominator = EvaluatePolynomial(r, near_tail ? kPolyD : kPolyF);
   }

   // Narrow to float first, then restore the sign of the input.
   const float unsigned_result = numerator / denominator;
   return std::copysign(unsigned_result, x);
 }

 // Host-side reference implementation of digamma using a polynomial from
 // Cephes.  Notably this is a different implementation from the one in math.cc.
 //
 // Returns NaN when `x` is zero or a negative integer.  Other non-positive
 // inputs are evaluated at 1 - x and corrected with the reflection term
 // pi * cot(pi * x).
 float HostDigamma(float x) {
   // Euler-Mascheroni constant
   const float kEulerMascheroni = 0.57721566490153286061;
   const float kPi = M_PI;

   // Coefficients handed to EvaluatePolynomial for the large-argument series.
   const std::array<float, 4> kPoly = {
       -4.16666666666666666667E-3,
       3.96825396825396825397E-3,
       -8.33333333333333333333E-3,
       8.33333333333333333333E-2,
   };

   float reflection = 0;
   if (x <= 0) {
     const float x_floor = std::floor(x);
     if (x == x_floor) {
       return std::numeric_limits<float>::quiet_NaN();
     }
     // Compute reflection term, pi * cot(pi * x).  A fractional part of exactly
     // 0.5 leaves the term at zero.
     float fraction = x - x_floor;
     if (fraction != 0.5) {
       if (fraction > 0.5) {
         fraction = x - (x_floor + 1.0f);
       }
       reflection = kPi / std::tan(kPi * fraction);
     }
     x = 1 - x;
   }

   // Special case for integers <= 10: a partial harmonic sum minus the
   // Euler-Mascheroni constant.
   if (x <= 10 && x == std::floor(x)) {
     float harmonic = 0;
     for (int k = 1; k < x; ++k) {
       harmonic += 1.0f / k;
     }
     const float integer_result = harmonic - kEulerMascheroni;
     return integer_result - reflection;
   }

   // Step x upward until it is at least 10, accumulating the 1/x terms.
   float stepped_sum = 0;
   while (x < 10) {
     stepped_sum += 1.0f / x;
     ++x;
   }

   // The polynomial correction is only applied below 1e8.
   float series = 0;
   if (x < 1e8) {
     const float z = 1.0f / (x * x);
     series = z * EvaluatePolynomial(z, kPoly);
   }
   const float result = std::log(x) - 0.5f / x - series - stepped_sum;

   // Compute the final, reflected value.
   return result - reflection;
 }

 // Alias for ExhaustiveOpTestBase instantiated with 1 as its second template
 // argument.  NOTE(review): that argument is presumably the operand count (the
 // fixtures in this file fill a std::array<Literal, 1>) -- confirm against
 // exhaustive_op_test_utils.h.
 template <PrimitiveType T>
 using ExhaustiveUnaryTest = ExhaustiveOpTestBase<T, 1>;

 // Exhaustive test for unary operations for <= 32bit floating point types.
 //
 // The primitive type under test is the template parameter T.  The test
 // parameter is a (begin, end) pair of zero-extended int64s giving the
 // half-open range of bit patterns to test; each one is bitcast to the
 // primitive type under test.
 template <PrimitiveType T>
 class Exhaustive32BitOrLessUnaryTest
     : public ExhaustiveUnaryTest<T>,
       public ::testing::WithParamInterface<std::pair<int64, int64>> {
  public:
   // Sets error parameters appropriately for testing sin/cos/tan.
   void SetParamsForSinCosTan();

  protected:
   using typename ExhaustiveUnaryTest<T>::NativeT;

  private:
   // Returns the number of bit patterns in the range given by the test
   // parameter.
   int64 GetInputSize() override {
     const std::pair<int64, int64> range = GetParam();
     VLOG(2) << "Checking range [" << range.first << ", " << range.second
             << ")";
     return range.second - range.first;
   }

   // Generates all the input values for the test.  The range of bit
   // representations is described by the test parameter as a pair of int64:
   // the starting bit pattern and the ending pattern.  Each bit representation
   // is first truncated to the integral type of the same width as the type
   // being tested, if needed, and then bitcast to the type being tested.
   void FillInput(std::array<Literal, 1>* input_literal) override {
     using IntegralT =
         typename ExhaustiveOpTestBase<T, 1>::ComponentIntegralNativeT;
     const int64 input_size = (*input_literal)[0].element_count();
     const std::pair<int64, int64> range = GetParam();
     const int64 begin = range.first;
     const int64 end = range.second;
     VLOG(2) << "Checking range [" << begin << ", " << end << ")";
     CHECK_EQ(input_size, end - begin);

     absl::Span<NativeT> input_arr = (*input_literal)[0].data<NativeT>();
     for (int64 offset = 0; offset < input_size; ++offset) {
       IntegralT bit_pattern = begin + offset;
       input_arr[offset] =
           this->ConvertAndReplaceKnownIncorrectValueWith(bit_pattern, 0);
     }
   }
 };

 // Concrete fixtures for the three floating point types of 32 bits or fewer.
 using ExhaustiveF32UnaryTest = Exhaustive32BitOrLessUnaryTest<F32>;
 using ExhaustiveF16UnaryTest = Exhaustive32BitOrLessUnaryTest<F16>;
 using ExhaustiveBF16UnaryTest = Exhaustive32BitOrLessUnaryTest<BF16>;

 // Defines the test `test_name` once for each of the F32, F16 and BF16
 // fixtures.  The variadic argument is the braced test body; it is pasted
 // verbatim after each of the three XLA_TEST_P headers, so the same body is
 // compiled against every fixture.
 #define XLA_TEST_FLOAT_32_BITS_OR_LESS(test_name, ...) \
   XLA_TEST_P(ExhaustiveF32UnaryTest, test_name)        \
   __VA_ARGS__                                          \
   XLA_TEST_P(ExhaustiveF16UnaryTest, test_name)        \
   __VA_ARGS__                                          \
   XLA_TEST_P(ExhaustiveBF16UnaryTest, test_name)       \
   __VA_ARGS__

 XLA_TEST_FLOAT_32_BITS_OR_LESS(Log, {
   // F32 on platforms other than "Host" and "CUDA" is checked against
   // ErrorSpec{0.001, 0.001}; every other combination uses the default spec.
   const bool is_host_or_cuda = platform_ == "Host" || platform_ == "CUDA";
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (!is_host_or_cuda && ty_ == F32) {
     error_spec_gen = +[](NativeT) { return ErrorSpec{0.001, 0.001}; };
   }
   Run(Log, std::log, error_spec_gen);
 })

 XLA_TEST_FLOAT_32_BITS_OR_LESS(Log1p, {
   // F32 on platforms other than "Host" and "CUDA" is checked against
   // ErrorSpec{0.001, 0.001}; every other combination uses the default spec.
   const bool is_host_or_cuda = platform_ == "Host" || platform_ == "CUDA";
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (!is_host_or_cuda && ty_ == F32) {
     error_spec_gen = +[](NativeT) { return ErrorSpec{0.001, 0.001}; };
   }
   Run(Log1p, std::log1p, error_spec_gen);
 })

 XLA_TEST_FLOAT_32_BITS_OR_LESS(Exp, {
   // When x < -105, the true value of exp(x) is smaller than the smallest F32,
   // so exp(x) should return exactly 0.  We require our implementation to
   // return exactly 0 too: anything else means it either does not follow the
   // asymptotic behavior of exp(x) approaching 0 as x approaches -inf, or does
   // not approach 0 fast enough.
   ErrorSpecGen error_spec_gen = +[](NativeT x) {
     return x < static_cast<NativeT>(-105) ? ErrorSpec{0, 0}
                                            : GetDefaultSpecGenerator()(x);
   };

   if (platform_ != "Host") {
     Run(Exp, std::exp, error_spec_gen);
   } else {
     // Our CPU implementation of exp returns one incorrect value: it says
     // exp(88.7228394) = max-float, but the correct answer is inf.  We deem
     // this acceptable and check for it explicitly so that we can be aware if
     // anything changes.
     auto host_exp_with_overflow = +[](float f) {
       return f == 88.7228394f ? 3.40282347e+38f : std::exp(f);
     };
     Run(Exp, host_exp_with_overflow, error_spec_gen);
   }
 })

 XLA_TEST_FLOAT_32_BITS_OR_LESS(Expm1, {
   // F32 uses a custom ErrorSpec{0, 0.00015}; F16 and BF16 use the default.
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (ty_ == F32) {
     error_spec_gen = +[](NativeT) { return ErrorSpec{0, 0.00015}; };
   }

   if (platform_ != "Host") {
     Run(Expm1, std::expm1, error_spec_gen);
   } else {
     // Our CPU implementation of expm1 returns one incorrect value: it says
     // expm1(88.7228394) = max-float, but the correct answer is inf.  We deem
     // this acceptable and check for it explicitly so that we can be aware if
     // anything changes.
     auto host_expm1_with_overflow = +[](float f) {
       return f == 88.7228394f ? 3.40282347e+38f : std::expm1(f);
     };
     Run(Expm1, host_expm1_with_overflow, error_spec_gen);
   }
 })

 // Exhaustively testing both sqrt and pow(x, 0.5) may look like overkill, but
 // it *did* find a bug: some backends were assuming sqrt(x) == pow(x, 0.5),
 // which is not true for x == -inf.
 XLA_TEST_FLOAT_32_BITS_OR_LESS(PowOneHalf, {
   const bool is_host_or_cuda = platform_ == "Host" || platform_ == "CUDA";
   EvaluateOp fn = +[](float x) { return std::pow(x, 0.5f); };
   if (!is_host_or_cuda) {
     // TODO(b/123837116): Enable the test for all values after fixing the bug.
     // Until then the reference yields NaN for -inf on these platforms.
     fn = +[](float x) {
       return x == -std::numeric_limits<float>::infinity()
                  ? std::nanf("")
                  : std::pow(x, 0.5f);
     };
   }
   Run([](XlaOp x) { return Pow(x, ScalarLike(x, 0.5)); }, fn);
 })

 // Checks Rsqrt against the host reference 1 / std::sqrt(x), using the default
 // error spec.
 XLA_TEST_FLOAT_32_BITS_OR_LESS(Rsqrt, {
   Run(
       Rsqrt, +[](float x) { return 1 / std::sqrt(x); });
 })

 XLA_TEST_FLOAT_32_BITS_OR_LESS(Sqrt, {
   // On "Host" and "CUDA" the default spec is tightened by setting
   // strict_signed_zeros; other platforms use the default spec unchanged.
   const bool is_host_or_cuda = platform_ == "Host" || platform_ == "CUDA";
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (is_host_or_cuda) {
     error_spec_gen = +[](NativeT x) {
       auto strict_spec = GetDefaultSpecGenerator()(x);
       strict_spec.strict_signed_zeros = true;
       return strict_spec;
     };
   }

   Run(Sqrt, std::sqrt, error_spec_gen);
 })

 // TODO(jlebar): Test trig functions over complex inputs.
 XLA_TEST_P(ExhaustiveF32UnaryTest, Acosh) {
   const bool is_host_or_cuda = platform_ == "Host" || platform_ == "CUDA";
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (!is_host_or_cuda) {
     // Error inherited from Log, which our implementation of Acosh uses.
     error_spec_gen = +[](float) { return ErrorSpec{0.001, 0.001}; };
   }

   Run(Acosh, std::acosh, error_spec_gen);
 }
 // F16 and BF16 use the default error spec on every platform.
 XLA_TEST_P(ExhaustiveF16UnaryTest, Acosh) {
   Run(Acosh, std::acosh);
 }
 XLA_TEST_P(ExhaustiveBF16UnaryTest, Acosh) {
   Run(Acosh, std::acosh);
 }

 // Tests for Asinh.  F32 on platforms other than "Host" and "CUDA" is checked
 // against ErrorSpec{0.001, 0.001}; everything else uses the default spec.
 XLA_TEST_P(ExhaustiveF32UnaryTest, Asinh) {
   const bool is_host_or_cuda = platform_ == "Host" || platform_ == "CUDA";
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (!is_host_or_cuda) {
     error_spec_gen = +[](float) { return ErrorSpec{0.001, 0.001}; };
   }

   Run(Asinh, std::asinh, error_spec_gen);
 }
 XLA_TEST_P(ExhaustiveF16UnaryTest, Asinh) {
   Run(Asinh, std::asinh);
 }
 XLA_TEST_P(ExhaustiveBF16UnaryTest, Asinh) {
   Run(Asinh, std::asinh);
 }

 // Atanh, Acos and Asin are compared directly against their <cmath>
 // counterparts with the default error spec.
 XLA_TEST_FLOAT_32_BITS_OR_LESS(Atanh, { Run(Atanh, std::atanh); })
 XLA_TEST_FLOAT_32_BITS_OR_LESS(Acos, { Run(Acos, std::acos); })
 XLA_TEST_FLOAT_32_BITS_OR_LESS(Asin, { Run(Asin, std::asin); })

 XLA_TEST_FLOAT_32_BITS_OR_LESS(Cosh, {
   // The reference is plain std::cosh on "Host".
   float (*host_cosh)(float) = &std::cosh;
   if (platform_ != "Host") {
     // Our cosh implementation incorrectly overflows to inf for
     // +/-89.4159851.  The correct answer of 3.40281961e+38 (0x7f7fffec) is
     // very close to max-float, so we deem this acceptable.
     //
     // This does not occur on CPU because we have an offsetting error in our
     // implementation of exp.
     host_cosh = +[](float x) {
       return std::abs(x) == 89.4159851f
                  ? std::numeric_limits<float>::infinity()
                  : std::cosh(x);
     };
   }
   Run(Cosh, host_cosh);
 })

 XLA_TEST_FLOAT_32_BITS_OR_LESS(Sinh, {
   // The reference is plain std::sinh on "Host".
   float (*host_sinh)(float) = &std::sinh;
   if (platform_ != "Host") {
     // Our sinh implementation incorrectly overflows to +/-inf for
     // +/-89.4159851.  The correct answer of 3.40281961e+38 (0x7f7fffec) is
     // very close to max-float, so we deem this acceptable.
     //
     // This does not occur on CPU because we have an offsetting error in our
     // implementation of exp.
     host_sinh = +[](float x) {
       return std::abs(x) == 89.4159851f
                  ? std::copysign(std::numeric_limits<float>::infinity(), x)
                  : std::sinh(x);
     };
   }
   Run(Sinh, host_sinh);
 })

 XLA_TEST_FLOAT_32_BITS_OR_LESS(Tanh, {
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (platform_ == "CUDA") {
     // On CUDA, inputs with x <= -20 or x >= 20 must match the reference
     // exactly (ErrorSpec{0, 0}); other inputs use the default spec.
     error_spec_gen = +[](NativeT x) {
       const bool is_far_from_zero = x <= static_cast<NativeT>(-20.0) ||
                                     x >= static_cast<NativeT>(20.0);
       if (is_far_from_zero) {
         return ErrorSpec{0, 0};
       }
       return GetDefaultSpecGenerator()(x);
     };
   }
   Run(Tanh, std::tanh, error_spec_gen);
 })

 // Marks inputs that the sin/cos/tan tests should treat as known-incorrect.
 // Nothing is changed on "Host" or "CUDA", or for types other than F32 and
 // BF16.
 template <PrimitiveType T>
 void Exhaustive32BitOrLessUnaryTest<T>::SetParamsForSinCosTan() {
   const bool is_host_or_cuda =
       this->platform_ == "Host" || this->platform_ == "CUDA";
   if (is_host_or_cuda) {
     return;
   }

   // Non CPU/GPU targets may have used the Cody-Waite range reduction technique
   // and will not provide meaningful results for sin/cos/tan if magnitudes
   // exceed 2**13.
   if (T == F32) {
     this->known_incorrect_fn_ = [](int64 bits) {
       const float value = BitCast<float>(static_cast<uint32>(bits));
       return std::abs(value) > (1 << 13);
     };
   } else if (T == BF16) {
     this->known_incorrect_fn_ = [](int64 bits) {
       const float value =
           static_cast<float>(BitCast<bfloat16>(static_cast<uint16>(bits)));
       return std::abs(value) > (1 << 13);
     };
   }
 }

 // Cos: F32 is checked against a fixed ErrorSpec{0.001, 0.001}; F16 and BF16
 // use the default spec.  All three first apply SetParamsForSinCosTan().
 XLA_TEST_P(ExhaustiveF32UnaryTest, Cos) {
   SetParamsForSinCosTan();
   ErrorSpecGen error_spec_gen = +[](NativeT) {
     return ErrorSpec{0.001, 0.001};
   };
   Run(Cos, std::cos, error_spec_gen);
 }
 XLA_TEST_P(ExhaustiveF16UnaryTest, Cos) {
   SetParamsForSinCosTan();
   Run(Cos, std::cos);
 }
 XLA_TEST_P(ExhaustiveBF16UnaryTest, Cos) {
   SetParamsForSinCosTan();
   Run(Cos, std::cos);
 }

 // Sin: F32 is checked against a fixed ErrorSpec{0.001, 0.001}; F16 and BF16
 // use the default spec.  All three first apply SetParamsForSinCosTan().
 XLA_TEST_P(ExhaustiveF32UnaryTest, Sin) {
   SetParamsForSinCosTan();
   ErrorSpecGen error_spec_gen = +[](NativeT) {
     return ErrorSpec{0.001, 0.001};
   };
   Run(Sin, std::sin, error_spec_gen);
 }
 XLA_TEST_P(ExhaustiveF16UnaryTest, Sin) {
   SetParamsForSinCosTan();
   Run(Sin, std::sin);
 }
 XLA_TEST_P(ExhaustiveBF16UnaryTest, Sin) {
   SetParamsForSinCosTan();
   Run(Sin, std::sin);
 }

 // Tan: F32 is checked against a fixed ErrorSpec{0.001, 0.001}; F16 and BF16
 // use the default spec.  All three first apply SetParamsForSinCosTan().
 XLA_TEST_P(ExhaustiveF32UnaryTest, Tan) {
   SetParamsForSinCosTan();
   ErrorSpecGen error_spec_gen = +[](NativeT) {
     return ErrorSpec{0.001, 0.001};
   };
   Run(Tan, std::tan, error_spec_gen);
 }
 XLA_TEST_P(ExhaustiveF16UnaryTest, Tan) {
   SetParamsForSinCosTan();
   Run(Tan, std::tan);
 }
 XLA_TEST_P(ExhaustiveBF16UnaryTest, Tan) {
   SetParamsForSinCosTan();
   Run(Tan, std::tan);
 }

 // TODO(jlebar): Enable these.
 // XLA_TEST_FLOAT_32_BITS_OR_LESS(Atan) { Run(Atan, std::atan); }
 // XLA_TEST_FLOAT_32_BITS_OR_LESS(Atan2) { Run(Atan2, std::atan2); }

 // Erf and Erfc are compared against <cmath>; ErfInv has no standard library
 // counterpart and is compared against HostErfInv defined in this file.  All
 // three use the default error spec.
 XLA_TEST_FLOAT_32_BITS_OR_LESS(Erf, { Run(Erf, std::erf); })
 XLA_TEST_FLOAT_32_BITS_OR_LESS(Erfc, { Run(Erfc, std::erfc); })
 XLA_TEST_FLOAT_32_BITS_OR_LESS(ErfInv, { Run(ErfInv, HostErfInv); })
 XLA_TEST_FLOAT_32_BITS_OR_LESS(Digamma, {
   const bool is_host_or_cuda = platform_ == "Host" || platform_ == "CUDA";
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (!is_host_or_cuda) {
     // TODO(b/123956399): This is a fairly high error, significantly higher than
     // we see on CPU/GPU.
     error_spec_gen = +[](NativeT) { return ErrorSpec{0.01, 0.01}; };
   }

   if (platform_ != "CUDA") {
     Run(Digamma, HostDigamma, error_spec_gen);
   } else {
     // On GPU we get a wrong answer for the denormal inputs +/-2.93873588e-39
     // (0x00200000 and 0x80200000).  These should return -/+inf (at least
     // according to our reference implementation!) but XLA:GPU returns
     // -/+3.40282326e+38 (0xff7ffffe and 0x7f7ffffe).
     //
     // I deem this an acceptable result, as XLA:GPU flushes denormals, and as
     // the results we get here are very close to MAX_FLOAT.  We just hardcode
     // these results, as this is better than ignoring these inputs altogether.
     auto host_digamma_with_gpu_ftz_errors = +[](float x) {
       const uint32 bits = BitCast<uint32>(x);
       return (bits == 0x00200000 || bits == 0x80200000)
                  ? std::copysign(std::numeric_limits<float>::max(), -x)
                  : HostDigamma(x);
     };
     Run(Digamma, host_digamma_with_gpu_ftz_errors, error_spec_gen);
   }
 })

 XLA_TEST_FLOAT_32_BITS_OR_LESS(Lgamma, {
   float (*host_lgamma)(float) = std::lgamma;
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();

   if (platform_ == "CUDA") {
     // Our implementation gets within 0.0001 rel error except for ~20 denormal
     // inputs on GPU.  Anyway 0.001 rel error should be good enough for lgamma.
     if (ty_ == F32 || ty_ == F16) {
       error_spec_gen = +[](NativeT x) {
         auto relaxed_spec = GetDefaultSpecGenerator()(x);
         relaxed_spec.rel_err = 0.001;
         return relaxed_spec;
       };
     }
   } else if (platform_ != "Host") {
     // TODO(b/123956399): This is a fairly high error, significantly higher than
     // we see on CPU/GPU.
     error_spec_gen = +[](NativeT) { return ErrorSpec{0.01, 0.01}; };

     // Overflows to inf for input 4.08500343e+36 (0x7c44af8e).
     if (ty_ == F32) {
       host_lgamma = +[](float v) {
         return BitCast<uint32>(v) == 0x7c44af8e
                    ? std::numeric_limits<float>::infinity()
                    : std::lgamma(v);
       };
     }
   }
   Run(Lgamma, host_lgamma, error_spec_gen);
 })

 // Round is compared against std::round with the default error spec.
 XLA_TEST_FLOAT_32_BITS_OR_LESS(Round, { Run(Round, std::round); })

 // The F32/F16/BF16 suites are only instantiated when this file is compiled
 // with UNARY_TEST_TARGET_F32_OR_SMALLER defined.
 #if defined(UNARY_TEST_TARGET_F32_OR_SMALLER)

 // F32: one test instance per range returned by CreateExhaustiveF32Ranges().
 INSTANTIATE_TEST_SUITE_P(F32, ExhaustiveF32UnaryTest,
                          ::testing::ValuesIn(CreateExhaustiveF32Ranges()));

 // F16: a single instance covering the bit patterns [0, 1 << 16), unless the
 // backend declares it does not support float16.
 #if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16)
 INSTANTIATE_TEST_SUITE_P(F16, ExhaustiveF16UnaryTest,
                          ::testing::Values(std::make_pair(0, 1 << 16)));
 #endif

 // BF16: a single instance covering the bit patterns [0, 1 << 16), only when
 // the backend declares bfloat16 support.
 #if defined(XLA_BACKEND_SUPPORTS_BFLOAT16)
 INSTANTIATE_TEST_SUITE_P(BF16, ExhaustiveBF16UnaryTest,
                          ::testing::Values(std::make_pair(0, 1 << 16)));
 #endif

 #endif

 // Exhaustive test for unary operations for double.
 //
 // The primitive type under test is fixed to F64.  The test parameter is an
 // FpValues representing a set of double values.
 class ExhaustiveF64UnaryTest : public ExhaustiveUnaryTest<F64>,
                                public ::testing::WithParamInterface<FpValues> {
  private:
   // Returns the number of values described by the test parameter.
   int64 GetInputSize() override {
     FpValues fp_values = GetParam();
     return fp_values.GetTotalNumValues();
   }

   // Fills the input literal with every value from the test parameter, in
   // iteration order, and checks that the literal was filled exactly.
   void FillInput(std::array<Literal, 1>* input_literal) override {
     FpValues fp_values = GetParam();
     int64 input_size = (*input_literal)[0].element_count();
     LOG(INFO) << "Checking fp values " << fp_values.ToString() << ", "
               << input_size;
     absl::Span<double> input_arr = (*input_literal)[0].data<double>();

     uint64 num_filled = 0;
     for (auto bits : fp_values) {
       input_arr[num_filled++] =
           this->ConvertAndReplaceKnownIncorrectValueWith(bits, 1);
     }
     CHECK_EQ(num_filled, input_size);
   }
 };

 // F64 logarithm and exponential tests: each op is compared against its
 // <cmath> counterpart with the default error spec.
 XLA_TEST_P(ExhaustiveF64UnaryTest, Log) { Run(Log, std::log); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Log1p) { Run(Log1p, std::log1p); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Exp) { Run(Exp, std::exp); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Expm1) { Run(Expm1, std::expm1); }

 // Pow(x, 0.5) is compared against std::pow(x, 0.5).
 // TODO(b/138385863): Turn on the test for GPU after fixing the bug.
 XLA_TEST_P(ExhaustiveF64UnaryTest, DISABLED_ON_GPU(PowOneHalf)) {
   Run([](XlaOp x) { return Pow(x, ScalarLike(x, 0.5)); },
       +[](double x) { return std::pow(x, 0.5); });
 }

 // Rsqrt is compared against the host reference 1 / std::sqrt(x).
 XLA_TEST_P(ExhaustiveF64UnaryTest, Rsqrt) {
   Run(
       Rsqrt, +[](double x) { return 1 / std::sqrt(x); });
 }

 // F64 sqrt, inverse trigonometric and hyperbolic tests: each op is compared
 // against its <cmath> counterpart with the default error spec.
 XLA_TEST_P(ExhaustiveF64UnaryTest, Sqrt) { Run(Sqrt, std::sqrt); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Acosh) { Run(Acosh, std::acosh); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Asinh) { Run(Asinh, std::asinh); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Atanh) { Run(Atanh, std::atanh); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Acos) { Run(Acos, std::acos); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Asin) { Run(Asin, std::asin); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Cosh) { Run(Cosh, std::cosh); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Sinh) { Run(Sinh, std::sinh); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Tanh) {
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (platform_ == "CUDA") {
     // On CUDA, inputs with x <= -20 or x >= 20 must match the reference
     // exactly (ErrorSpec{0, 0}); other inputs use the default spec.
     error_spec_gen = +[](NativeT x) {
       const bool is_far_from_zero = x <= static_cast<NativeT>(-20.0) ||
                                     x >= static_cast<NativeT>(20.0);
       if (is_far_from_zero) {
         return ErrorSpec{0, 0};
       }
       return GetDefaultSpecGenerator()(x);
     };
   }
   Run(Tanh, std::tanh, error_spec_gen);
 }

 // F64 trigonometric and rounding tests: each op is compared against its
 // <cmath> counterpart with the default error spec.
 XLA_TEST_P(ExhaustiveF64UnaryTest, Cos) { Run(Cos, std::cos); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Sin) { Run(Sin, std::sin); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Tan) { Run(Tan, std::tan); }

 XLA_TEST_P(ExhaustiveF64UnaryTest, Round) { Run(Round, std::round); }

 // The F64 suites are only instantiated when this file is compiled with
 // UNARY_TEST_TARGET_F64 defined and the backend does not opt out of float64.
 #if defined(UNARY_TEST_TARGET_F64)
 #if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64)
 // One instance per FpValues returned by CreateFpValuesForBoundaryTest.
 INSTANTIATE_TEST_SUITE_P(
     SpecialValues, ExhaustiveF64UnaryTest,
     ::testing::ValuesIn(CreateFpValuesForBoundaryTest<double>()));

 // A single instance built from GetNormals<double>(1000).
 INSTANTIATE_TEST_SUITE_P(NormalValues, ExhaustiveF64UnaryTest,
                          ::testing::Values(GetNormals<double>(1000)));

 // Tests a total of 4000000000 inputs, with 16000000 inputs in each sub-test, to
 // keep the peak memory usage low.
 INSTANTIATE_TEST_SUITE_P(
     LargeAndSmallMagnituedNormalValues, ExhaustiveF64UnaryTest,
     ::testing::ValuesIn(GetFpValuesForMagnitudeExtremeNormals<double>(
         4000000000ull, 16000000)));
 #endif
 #endif

 // Exhaustive test fixture for unary operations on complex types.
 //
 // T is the primitive type of the complex number.  The test parameter is a
 // tuple of two FpValues representing the values for the real and imaginary
 // components.  The complex numbers used as test input are the cartesian
 // product of the values represented by the two FpValues.
 template <PrimitiveType T>
 class ExhaustiveComplexUnaryTestBase
     : public ExhaustiveUnaryTest<T>,
       public ::testing::WithParamInterface<std::tuple<FpValues, FpValues>> {
  protected:
   using typename ExhaustiveUnaryTest<T>::NativeT;

   // Marks component values for which the host reference tanh is known to be
   // wrong, so that they are replaced when the input is generated.
   void SetParamsForTanh() {
     // TODO(b/138126045): Current libc++ implementation of the complex tanh
     //                    function returns (NaN, NaN) when the imaginary
     //                    component is more than half of the max value.
     // TODO(b/138750327): Current libc++ implementation of the complex tanh
     //                    function returns (1, 0) when the real component is
     //                    negative infinity, when it should return (-1, 0).
     // We only need to set the former as incorrect values for C128 because when
     // testing with C64, we first cast our input to a C128 value.
     this->known_incorrect_fn_ = [&](int64 v) {
       const double f = this->ConvertValue(v);
       const bool exceeds_half_of_max =
           T == C128 && std::abs(f) > std::numeric_limits<double>::max() / 2;
       const bool is_negative_infinity =
           f == -std::numeric_limits<double>::infinity();
       return exceeds_half_of_max || is_negative_infinity;
     };
   }

  private:
   // Generates the input complex literal given the FpValues representation for
   // the real and imaginary components.  The imaginary component varies
   // fastest.
   void FillInput(std::array<Literal, 1>* input_literal) override {
     FpValues real_values = std::get<0>(GetParam());
     FpValues imag_values = std::get<1>(GetParam());

     VLOG(2) << " testing input total "
             << real_values.GetTotalNumValues() * imag_values.GetTotalNumValues()
             << ", range " << real_values.ToString() << " "
             << imag_values.ToString();

     absl::Span<NativeT> input_arr = (*input_literal)[0].data<NativeT>();

     uint64 next_index = 0;
     for (auto real : real_values) {
       for (auto imag : imag_values) {
         input_arr[next_index++] =
             NativeT(this->ConvertAndReplaceKnownIncorrectValueWith(real, 1),
                     this->ConvertAndReplaceKnownIncorrectValueWith(imag, 1));
       }
     }
   }

   // Returns the size of the cartesian product of the real and imaginary value
   // sets.
   int64 GetInputSize() override {
     FpValues real_values = std::get<0>(GetParam());
     FpValues imag_values = std::get<1>(GetParam());
     const auto num_real = real_values.GetTotalNumValues();
     const auto num_imag = imag_values.GetTotalNumValues();
     return num_real * num_imag;
   }
 };

 // Concrete fixtures for the two complex primitive types.
 using ExhaustiveC64UnaryTest = ExhaustiveComplexUnaryTestBase<C64>;
 using ExhaustiveC128UnaryTest = ExhaustiveComplexUnaryTestBase<C128>;

 // Complex64 Log is compared against std::log<float>.
 // TODO(b/138578594): Enable the test for the CPU backend after fixing the bug.
 XLA_TEST_P(ExhaustiveC64UnaryTest, DISABLED_ON_CPU(Log)) {
   Run(Log, [](complex64 x) { return std::log<float>(x); });
 }

 // The Sqrt reference widens the input to complex128, takes the square root in
 // double precision, and narrows the result back to complex64.
 XLA_TEST_P(ExhaustiveC64UnaryTest, Sqrt) {
   Run(Sqrt, [](complex64 x) {
     return static_cast<complex64>(
         std::sqrt<double>(static_cast<complex128>(x)));
   });
 }

 // The Rsqrt reference computes 1 / sqrt(x) in complex128 and narrows the
 // result back to complex64.
 XLA_TEST_P(ExhaustiveC64UnaryTest, Rsqrt) {
   Run(Rsqrt, [](complex64 x) {
     return static_cast<complex64>(
         complex128(1, 0) / std::sqrt<double>(static_cast<complex128>(x)));
   });
 }

 // The current libc++ implementation of the complex tanh function provides
 // less accurate results when the denominator of a complex tanh is small, due
 // to floating point precision loss.  To avoid this issue for complex64
 // numbers, we cast to and from a complex128 when computing the reference
 // tanh.
 XLA_TEST_P(ExhaustiveC64UnaryTest, Tanh) {
   SetParamsForTanh();
   ErrorSpecGen error_spec_gen = +[](complex64 x) {
     // This implementation of Tanh becomes less accurate when the denominator
     // is small, so such inputs get a looser spec.
     const auto denominator = std::cosh(2 * x.real()) + std::cos(2 * x.imag());
     if (denominator < 1e-4) {
       return ErrorSpec{5e-2, 5e-2};
     }

     return GetDefaultSpecGenerator()(x);
   };
   auto host_tanh = +[](complex64 x) {
     return static_cast<complex64>(std::tanh(static_cast<complex128>(x)));
   };
   Run(Tanh, host_tanh, error_spec_gen);
 }

 // The C64 suites are only instantiated when this file is compiled with
 // UNARY_TEST_TARGET_COMPLEX defined.  In each suite the first generator
 // supplies the real components and the second the imaginary components.
 #if defined(UNARY_TEST_TARGET_COMPLEX)
 // Boundary values for both components.
 INSTANTIATE_TEST_SUITE_P(
     F32SpecialValues, ExhaustiveC64UnaryTest,
     ::testing::Combine(
         ::testing::ValuesIn(CreateFpValuesForBoundaryTest<float>()),
         ::testing::ValuesIn(CreateFpValuesForBoundaryTest<float>())));

 // Boundary values for the real component, normals for the imaginary one.
 INSTANTIATE_TEST_SUITE_P(
     F32SpecialAndNormalValues, ExhaustiveC64UnaryTest,
     ::testing::Combine(
         ::testing::ValuesIn(CreateFpValuesForBoundaryTest<float>()),
         ::testing::Values(GetNormals<float>(10000))));

 // Normals for the real component, boundary values for the imaginary one.
 INSTANTIATE_TEST_SUITE_P(
     F32NormalAndSpecialValues, ExhaustiveC64UnaryTest,
     ::testing::Combine(
         ::testing::Values(GetNormals<float>(10000)),
         ::testing::ValuesIn(CreateFpValuesForBoundaryTest<float>())));

 // Normals for both components.
 INSTANTIATE_TEST_SUITE_P(
     F32NormalAndNormalValues, ExhaustiveC64UnaryTest,
     ::testing::Combine(::testing::Values(GetNormals<float>(10000)),
                        ::testing::Values(GetNormals<float>(10000))));

 // Tests a total of 40000 ^ 2 inputs, with 4000 ^ 2 inputs in each sub-test, to
 // keep the peak memory usage low.
 INSTANTIATE_TEST_SUITE_P(
     F32LargeAndSmallMagnituedNormalValues, ExhaustiveC64UnaryTest,
     ::testing::Combine(
         ::testing::ValuesIn(GetFpValuesForMagnitudeExtremeNormals<float>(40000,
                                                                          4000)),
         ::testing::ValuesIn(
             GetFpValuesForMagnitudeExtremeNormals<float>(40000, 4000))));
 #endif


 XLA_TEST_P(ExhaustiveC128UnaryTest, Log) {
   // TODO(b/138578313): Enable the test for all values after fixing the bug.
   // Until then, NaN components and components with magnitude above 1.0e+300
   // or below 1.0e-300 are treated as known-incorrect.
   known_incorrect_fn_ = [&](int64 v) {
     const double f = this->ConvertValue(v);
     const double magnitude = std::abs(f);
     return std::fpclassify(f) == FP_NAN || magnitude > 1.0e+300 ||
            magnitude < 1.0e-300;
   };
   Run(Log, [](complex128 x) { return std::log<double>(x); });
 }

 XLA_TEST_P(ExhaustiveC128UnaryTest, Sqrt) {
   // Similar to the Tanh bug: components whose magnitude is more than half of
   // the max double are treated as known-incorrect.
   known_incorrect_fn_ = [&](int64 v) {
     const double f = this->ConvertValue(v);
     const double half_of_max = std::numeric_limits<double>::max() / 2;
     return std::abs(f) > half_of_max;
   };
   Run(Sqrt, [](complex128 x) { return std::sqrt<double>(x); });
 }

 XLA_TEST_P(ExhaustiveC128UnaryTest, Rsqrt) {
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (platform_ == "CUDA") {
     // Edge case on CUDA backend where the Log of a complex number made up of
     // the smallest denormals is more accurate than the interpreter backend.
     // Such inputs get ErrorSpec(0.5, 0.5) instead of the default spec.
     error_spec_gen = [](complex128 x) {
       constexpr double kDenormMin = std::numeric_limits<double>::denorm_min();
       const bool both_are_smallest_denormal =
           std::abs(x.real()) == kDenormMin && std::abs(x.imag()) == kDenormMin;
       if (both_are_smallest_denormal) {
         return ErrorSpec(0.5, 0.5);
       }
       return GetDefaultSpecGenerator()(x);
     };
   }
   // The reference computes 1 / sqrt(x) in complex128.
   Run(
       Rsqrt,
       [](complex128 x) { return complex128(1, 0) / std::sqrt<double>(x); },
       error_spec_gen);
 }

 // Complex128 Tanh is compared against std::tanh, after SetParamsForTanh() has
 // marked the component values for which that reference is known to be wrong.
 XLA_TEST_P(ExhaustiveC128UnaryTest, Tanh) {
   SetParamsForTanh();
   Run(
       Tanh, +[](complex128 x) { return std::tanh(x); });
 }

 // The C128 suites are only instantiated when this file is compiled with
 // UNARY_TEST_TARGET_COMPLEX defined and the backend does not opt out of
 // float64.  In each suite the first generator supplies the real components
 // and the second the imaginary components.
 #if defined(UNARY_TEST_TARGET_COMPLEX)
 #if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64)
 // Boundary values for both components.
 INSTANTIATE_TEST_SUITE_P(
     SpecialValues, ExhaustiveC128UnaryTest,
     ::testing::Combine(
         ::testing::ValuesIn(CreateFpValuesForBoundaryTest<double>()),
         ::testing::ValuesIn(CreateFpValuesForBoundaryTest<double>())));

 // Boundary values for the real component, normals for the imaginary one.
 INSTANTIATE_TEST_SUITE_P(
     SpecialAndNormalValues, ExhaustiveC128UnaryTest,
     ::testing::Combine(
         ::testing::ValuesIn(CreateFpValuesForBoundaryTest<double>()),
         ::testing::Values(GetNormals<double>(10000))));

 // Normals for the real component, boundary values for the imaginary one.
 INSTANTIATE_TEST_SUITE_P(
     NormalAndSpecialValues, ExhaustiveC128UnaryTest,
     ::testing::Combine(
         ::testing::Values(GetNormals<double>(10000)),
         ::testing::ValuesIn(CreateFpValuesForBoundaryTest<double>())));

 // Normals for both components.
 // NOTE(review): the "F32" prefix in this suite name looks like a leftover
 // from the C64 suite of the same name; the values here are doubles.
 INSTANTIATE_TEST_SUITE_P(
     F32NormalAndNormalValues, ExhaustiveC128UnaryTest,
     ::testing::Combine(::testing::Values(GetNormals<double>(10000)),
                        ::testing::Values(GetNormals<double>(10000))));

 // Tests a total of 40000 ^ 2 inputs, with 2000 ^ 2 inputs in each sub-test, to
 // keep the peak memory usage low.
 INSTANTIATE_TEST_SUITE_P(
     LargeAndSmallMagnituedNormalValues, ExhaustiveC128UnaryTest,
     ::testing::Combine(
         ::testing::ValuesIn(
             GetFpValuesForMagnitudeExtremeNormals<double>(40000, 2000)),
         ::testing::ValuesIn(
             GetFpValuesForMagnitudeExtremeNormals<double>(40000, 2000))));
 #endif
 #endif

 }  // namespace xla