// tensorflow/compiler/xla/util.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #include "tensorflow/compiler/xla/util.h"

 #include <stdarg.h>

 #include <cmath>
 #include <limits>
 #include <numeric>

 #include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/numbers.h"
 #include "tensorflow/core/platform/stacktrace.h"

 namespace xla {

 // Logs a non-OK status and hands it back unchanged.
 //
 // CHECK-fails if `status` is OK. The status text is streamed to VLOG(1) and
 // the current stack trace to VLOG(2).
 Status WithLogBacktrace(const Status& status) {
   CHECK(!status.ok());
   VLOG(1) << status.ToString();
   VLOG(2) << tensorflow::CurrentStackTrace();
   return status;
 }

 // Creates a timer that reports under `label`, attributed to `file`:`line`, and
 // accumulates into `timer_stats`. The start timestamp is captured only when
 // the timer is enabled.
 ScopedLoggingTimer::ScopedLoggingTimer(const std::string& label, bool enabled,
                                        const char* file, int line,
                                        TimerStats* timer_stats)
     : enabled(enabled),
       file(file),
       line(line),
       label(label),
       timer_stats(timer_stats) {
   if (!enabled) {
     return;
   }
   start_micros = tensorflow::Env::Default()->NowMicros();
 }

 void ScopedLoggingTimer::StopAndLog() {
   if (enabled) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
     double secs = (end_micros - start_micros) / 1000000.0;

     TimerStats& stats = *timer_stats;
     tensorflow::mutex_lock lock(stats.stats_mutex);
     stats.cumulative_secs += secs;
     if (secs > stats.max_secs) {
       stats.max_secs = secs;
     }
     stats.times_called++;

     LOG(INFO).AtLocation(file, line)
         << label
         << " time: " << tensorflow::strings::HumanReadableElapsedTime(secs)
         << " (cumulative: "
         << tensorflow::strings::HumanReadableElapsedTime(stats.cumulative_secs)
         << ", max: "
         << tensorflow::strings::HumanReadableElapsedTime(stats.max_secs)
         << ", #called: " << stats.times_called << ")";
     enabled = false;
   }
 }

 // Reports the timing on destruction. StopAndLog() is a no-op if the timer was
 // never enabled or has already been stopped.
 ScopedLoggingTimer::~ScopedLoggingTimer() { StopAndLog(); }

 // Returns a status with the same code as `prior` whose message is
 // "<context>: <prior message>". CHECK-fails if `prior` is OK.
 Status AddStatus(Status prior, absl::string_view context) {
   CHECK(!prior.ok());
   const std::string prefixed_message =
       absl::StrCat(context, ": ", prior.error_message());
   return Status{prior.code(), prefixed_message};
 }

 // Returns a status with the same code as `prior` whose message is
 // "<prior message>: <context>". CHECK-fails if `prior` is OK.
 Status AppendStatus(Status prior, absl::string_view context) {
   CHECK(!prior.ok());
   const std::string suffixed_message =
       absl::StrCat(prior.error_message(), ": ", context);
   return Status{prior.code(), suffixed_message};
 }

 // Re-indents every line of `original`: each '\n'-separated line has its
 // leading and trailing ASCII whitespace stripped and `indentation` prepended.
 // The lines are joined back with '\n'; empty lines still receive the
 // indentation.
 string Reindent(absl::string_view original,
                 const absl::string_view indentation) {
   // Split into views of `original` instead of std::string copies; the views
   // are only used within this call, while `original` is still alive.
   std::vector<absl::string_view> lines = absl::StrSplit(original, '\n');
   return absl::StrJoin(
       lines, "\n", [indentation](string* out, absl::string_view line) {
         absl::StrAppend(out, indentation, absl::StripAsciiWhitespace(line));
       });
 }

 // Returns true iff `permutation` has exactly `rank` elements and contains each
 // of the values 0, 1, ..., rank - 1 exactly once (in any order).
 bool IsPermutation(absl::Span<const int64> permutation, int64 rank) {
   if (rank < 0 || static_cast<size_t>(rank) != permutation.size()) {
     return false;
   }
   // Single pass with a "seen" table rather than absl::c_is_permutation, which
   // is quadratic in the worst case.
   absl::InlinedVector<bool, 8> seen(permutation.size(), false);
   for (const int64 value : permutation) {
     if (value < 0 || value >= rank || seen[value]) {
       return false;
     }
     seen[value] = true;
   }
   return true;
 }

 // Returns the inverse `q` of `input_permutation` `p`, i.e. q[p[i]] == i for
 // every index i. The input is DCHECKed to be a valid permutation; element
 // accesses go through the bounds-checked at().
 std::vector<int64> InversePermutation(
     absl::Span<const int64> input_permutation) {
   DCHECK(IsPermutation(input_permutation, input_permutation.size()));
   const size_t rank = input_permutation.size();
   std::vector<int64> inverse(rank, -1);
   size_t source = 0;
   while (source < rank) {
     inverse.at(input_permutation.at(source)) = source;
     ++source;
   }
   return inverse;
 }

 // Returns the composition of two equally sized permutations:
 // output[i] = p1[p2[i]]. CHECK-fails if the sizes differ; lookups into `p1`
 // use the bounds-checked at().
 std::vector<int64> ComposePermutations(absl::Span<const int64> p1,
                                        absl::Span<const int64> p2) {
   CHECK_EQ(p1.size(), p2.size());
   std::vector<int64> output;
   // The result size is known up front, so allocate once.
   output.reserve(p2.size());
   for (const int64 index : p2) {
     output.push_back(p1.at(index));
   }
   return output;
 }

 // Returns true iff permutation[i] == i for every index i (trivially true for
 // an empty span).
 bool IsIdentityPermutation(absl::Span<const int64> permutation) {
   int64 expected = 0;
   for (const int64 actual : permutation) {
     if (actual != expected) {
       return false;
     }
     ++expected;
   }
   return true;
 }

 // Formats a bfloat16 by widening it to float and printing with "%.4g".
 string RoundTripFpToString(tensorflow::bfloat16 value) {
   const float widened = static_cast<float>(value);
   return absl::StrFormat("%.4g", widened);
 }

 // Formats a half by widening it to float and printing with "%.5g".
 string RoundTripFpToString(Eigen::half value) {
   const float widened = static_cast<float>(value);
   return absl::StrFormat("%.5g", widened);
 }

 // Formats a float via tensorflow::strings::FloatToBuffer.
 string RoundTripFpToString(float value) {
   char text[tensorflow::strings::kFastToBufferSize];
   tensorflow::strings::FloatToBuffer(value, text);
   return string(text);
 }

 // Formats a double via tensorflow::strings::DoubleToBuffer.
 string RoundTripFpToString(double value) {
   char text[tensorflow::strings::kFastToBufferSize];
   tensorflow::strings::DoubleToBuffer(value, text);
   return string(text);
 }

 // Builds a PaddingConfig with `rank` dimensions in which the low edge, high
 // edge and interior padding are all zero.
 PaddingConfig MakeNoPaddingConfig(int64 rank) {
   PaddingConfig result;
   for (int64 dim_index = 0; dim_index < rank; ++dim_index) {
     auto* dim = result.add_dimensions();
     dim->set_interior_padding(0);
     dim->set_edge_padding_high(0);
     dim->set_edge_padding_low(0);
   }
   return result;
 }

 // Builds a PaddingConfig with one dimension per entry of `padding`. Each entry
 // is a (low, high) edge-padding pair; interior padding is set to zero.
 PaddingConfig MakeEdgePaddingConfig(
     absl::Span<const std::pair<int64, int64>> padding) {
   PaddingConfig result;
   for (const auto& low_high : padding) {
     auto* dim = result.add_dimensions();
     dim->set_interior_padding(0);
     dim->set_edge_padding_high(low_high.second);
     dim->set_edge_padding_low(low_high.first);
   }
   return result;
 }

 bool HasInteriorPadding(const PaddingConfig& config) {
   for (const auto& dim : config.dimensions()) {
     if (dim.interior_padding() != 0) {
       return true;
     }
   }
   return false;
 }

 namespace {
 // Formats the rate `flops / nanoseconds` (scaled to operations per second) as
 // "<number><op_prefix>OP/s"; with op_prefix "FL" the unit reads "FLOP/s".
 // A zero duration yields "NaN <op_prefix>OP/s".
 string HumanReadableNumOps(double flops, double nanoseconds,
                            absl::string_view op_prefix) {
   if (nanoseconds == 0) {
     return absl::StrCat("NaN ", op_prefix, "OP/s");
   }
   const double ops_per_nanosecond = flops / nanoseconds;
   string rate = tensorflow::strings::HumanReadableNum(
       static_cast<int64>(ops_per_nanosecond * 1e9));
   // Use the more common "G(FLOPS)", rather than "B(FLOPS)": swap a trailing
   // 'B' or 'b' for 'G'.
   const bool ends_in_b =
       !rate.empty() && (rate.back() == 'B' || rate.back() == 'b');
   if (ends_in_b) {
     rate.back() = 'G';
   }
   rate += absl::StrCat(op_prefix, "OP/s");
   return rate;
 }
 }  // namespace

 // Formats `flops` operations over `nanoseconds` as a rate string with the
 // "FLOP/s" unit (prefix "FL" passed to HumanReadableNumOps).
 string HumanReadableNumFlops(double flops, double nanoseconds) {
   return HumanReadableNumOps(flops, nanoseconds, "FL");
 }

 // Formats `trops` operations over `nanoseconds` as a rate string with the
 // "TROP/s" unit (prefix "TR" passed to HumanReadableNumOps).
 string HumanReadableNumTranscendentalOps(double trops, double nanoseconds) {
   return HumanReadableNumOps(trops, nanoseconds, "TR");
 }

 void LogLines(int sev, absl::string_view text, const char* fname, int lineno) {
   const int orig_sev = sev;
   if (sev == tensorflow::FATAL) {
     sev = tensorflow::ERROR;
   }

   // Protect calls with a mutex so we don't interleave calls to LogLines from
   // multiple threads.
   static tensorflow::mutex log_lines_mu(tensorflow::LINKER_INITIALIZED);
   tensorflow::mutex_lock lock(log_lines_mu);

   size_t cur = 0;
   while (cur < text.size()) {
     size_t eol = text.find('\n', cur);
     if (eol == absl::string_view::npos) {
       eol = text.size();
     }
     auto msg = text.substr(cur, eol - cur);
     tensorflow::internal::LogString(fname, lineno, sev,
                                     string(msg.data(), msg.size()));
     cur = eol + 1;
   }

   if (orig_sev == tensorflow::FATAL) {
     tensorflow::internal::LogString(fname, lineno, orig_sev,
                                     "Aborting due to errors.");
   }
 }

 // Returns the product of all elements of `xs`; an empty span yields 1.
 int64 Product(absl::Span<const int64> xs) {
   int64 product = 1;
   for (const int64 factor : xs) {
     product *= factor;
   }
   return product;
 }

 // Returns the index pairs (i, j) at which the product of a[0, i) equals the
 // product of b[0, j), in the order they are encountered, starting with (0, 0).
 // CHECK-fails unless Product(a) == Product(b). When that product is zero, only
 // the two end points {(0, 0), (a.size(), b.size())} are returned.
 absl::InlinedVector<std::pair<int64, int64>, 8> CommonFactors(
     absl::Span<const int64> a, absl::Span<const int64> b) {
   CHECK_EQ(Product(a), Product(b));
   if (0 == Product(a)) {
     return {std::make_pair(0, 0), std::make_pair(a.size(), b.size())};
   }

   absl::InlinedVector<std::pair<int64, int64>, 8> bounds;
   // i / j: next unconsumed index into a / b.
   // prior_i / prior_j: the most recently recorded pair.
   // partial_size_a / partial_size_b: products of a[0, i) / b[0, j).
   // The loop has no condition or increment clause; it exits via `break`.
   for (int64 i = 0, j = 0, prior_i = -1, prior_j = -1, partial_size_a = 1,
              partial_size_b = 1;
        ;) {
     // Record a bound whenever the partial products agree and at least one
     // index has moved since the last recorded pair.
     if (partial_size_a == partial_size_b && (i > prior_i || j > prior_j)) {
       std::tie(prior_i, prior_j) = std::make_pair(i, j);
       bounds.emplace_back(i, j);
       continue;
     }
     bool in_bounds_i = i < a.size();
     bool in_bounds_j = j < b.size();
     // Both inputs are exhausted: done.
     if (!(in_bounds_i || in_bounds_j)) {
       break;
     }
     // Advance the side whose partial product is smaller. When the products
     // are tied, advance the side with the smaller next dimension (both sides
     // if those are equal); if only one side has elements left, advance it.
     bool next_a =
         partial_size_a < partial_size_b ||
         (in_bounds_i &&
          (!in_bounds_j || (partial_size_a == partial_size_b && a[i] <= b[j])));
     bool next_b =
         partial_size_b < partial_size_a ||
         (in_bounds_j &&
          (!in_bounds_i || (partial_size_b == partial_size_a && b[j] <= a[i])));
     if (next_a) {
       partial_size_a *= a[i];
       ++i;
     }
     if (next_b) {
       partial_size_b *= b[j];
       ++j;
     }
   }
   return bounds;
 }

 // Returns `file_name` with every '/', '\\', '[', ']' and space character
 // replaced by '_'. All other characters are left untouched.
 string SanitizeFileName(string file_name) {
   for (char& ch : file_name) {
     switch (ch) {
       case '/':
       case '\\':
       case '[':
       case ']':
       case ' ':
         ch = '_';
         break;
       default:
         break;
     }
   }
   return file_name;
 }

 // Utility function to split a double-precision float (F64) into a pair of F32s.
 // For a p-bit number, and a splitting point (p/2) <= s <= (p - 1), the
 // algorithm produces a (p - s)-bit value 'hi' and a non-overlapping (s - 1)-bit
 // value 'lo'. See Theorem 4 in [1] (attributed to Dekker) or [2] for the
 // original theorem by Dekker.
 //
 // For double-precision F64s, which contain a 53 bit mantissa (52 of them
 // explicit), we can represent the most significant 49 digits as the unevaluated
 // sum of two single-precision floats 'hi' and 'lo'. The 'hi' float stores the
 // most significant 24 bits and the sign bit of 'lo' together with its mantissa
 // store the remaining 25 bits. The exponent of the resulting representation is
 // still restricted to 8 bits of F32.
 //
 // Returns the pair (hi, lo). Infinite inputs return (+/-inf, 0); any other
 // input whose float conversion is not finite (including NaN) fails the CHECK
 // below.
 //
 // References:
 // [1] A. Thall, Extended-Precision Floating-Point Numbers for GPU Computation,
 //     SIGGRAPH Research Posters, 2006.
 //     (http://andrewthall.org/papers/df64_qf128.pdf)
 // [2] T. J. Dekker, A floating point technique for extending the available
 //     precision, Numerische Mathematik, vol. 18, pp. 224–242, 1971.
 std::pair<float, float> SplitF64ToF32(double x) {
   // Early return if x is equal to infinity or -infinity.
   if (std::isinf(x)) {
     return std::make_pair(static_cast<float>(x), 0.0f);
   }

   // Following [1], the splitter is chosen as 2^{s} + 1, so that the most
   // significant (p - s) bits comprise the mantissa of 'hi'. Here
   // s = digits(double) - digits(float), i.e. the splitter is 2^{29} + 1.
   static_assert(std::numeric_limits<double>::radix == 2,
                 "Double is not Binary FP");
   constexpr double kSplitter = (1 << (std::numeric_limits<double>::digits -
                                       std::numeric_limits<float>::digits)) +
                                1;

   // Only values within the range of F32 are supported, unless it is infinity.
   // Small values with large negative exponents would be rounded to zero.
   CHECK(std::isfinite(static_cast<float>(x))) << x;

   // The value of '(shifted - x)' should algebraically be exactly 2^{29} * x,
   // but it can be a bit smaller because of rounding to 53 bits in the
   // computation of '(2^29 + 1) * x'. This overestimates the value of 'hi' by a
   // multiple of 2^{-29} (assuming exponent was 0), and makes 'lo' negative. An
   // extra bit is squeezed into the 'sign' bit of 'lo' to represent 25 bits of
   // significand.
   const double shifted = kSplitter * x;
   // TODO(anudhyan): Write a test to ensure that compiler is not optimizing away
   // the following computation to 'hi = x;'.
   const float hi = shifted - (shifted - x);
   const float lo = x - hi;
   return std::make_pair(hi, lo);
 }

 }  // namespace xla
	/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/

	#include "tensorflow/compiler/xla/util.h"

	#include <stdarg.h>

	#include <cmath>
	#include <limits>
	#include <numeric>

	#include "absl/container/flat_hash_map.h"
	#include "absl/container/inlined_vector.h"
	#include "absl/strings/match.h"
	#include "absl/strings/str_cat.h"
	#include "absl/strings/str_format.h"
	#include "absl/strings/str_join.h"
	#include "absl/strings/str_split.h"
	#include "tensorflow/compiler/xla/types.h"
	#include "tensorflow/core/lib/bfloat16/bfloat16.h"
	#include "tensorflow/core/lib/core/errors.h"
	#include "tensorflow/core/lib/math/math_util.h"
	#include "tensorflow/core/lib/strings/numbers.h"
	#include "tensorflow/core/platform/env.h"
	#include "tensorflow/core/platform/mutex.h"
	#include "tensorflow/core/platform/numbers.h"
	#include "tensorflow/core/platform/stacktrace.h"

	namespace xla {

	// Logs a non-OK status and hands it back unchanged.
	//
	// CHECK-fails if `status` is OK. The status text is streamed to VLOG(1) and
	// the current stack trace to VLOG(2).
	Status WithLogBacktrace(const Status& status) {
	CHECK(!status.ok());
	VLOG(1) << status.ToString();
	VLOG(2) << tensorflow::CurrentStackTrace();
	return status;
	}

	// Creates a timer that reports under `label`, attributed to `file`:`line`, and
	// accumulates into `timer_stats`. The start timestamp is captured only when
	// the timer is enabled.
	ScopedLoggingTimer::ScopedLoggingTimer(const std::string& label, bool enabled,
	                                       const char* file, int line,
	                                       TimerStats* timer_stats)
	    : enabled(enabled),
	      file(file),
	      line(line),
	      label(label),
	      timer_stats(timer_stats) {
	  if (!enabled) {
	    return;
	  }
	  start_micros = tensorflow::Env::Default()->NowMicros();
	}

	void ScopedLoggingTimer::StopAndLog() {
	if (enabled) {
	uint64 end_micros = tensorflow::Env::Default()->NowMicros();
	double secs = (end_micros - start_micros) / 1000000.0;

	TimerStats& stats = *timer_stats;
	tensorflow::mutex_lock lock(stats.stats_mutex);
	stats.cumulative_secs += secs;
	if (secs > stats.max_secs) {
	stats.max_secs = secs;
	}
	stats.times_called++;

	LOG(INFO).AtLocation(file, line)
	<< label
	<< " time: " << tensorflow::strings::HumanReadableElapsedTime(secs)
	<< " (cumulative: "
	<< tensorflow::strings::HumanReadableElapsedTime(stats.cumulative_secs)
	<< ", max: "
	<< tensorflow::strings::HumanReadableElapsedTime(stats.max_secs)
	<< ", #called: " << stats.times_called << ")";
	enabled = false;
	}
	}

	// Reports the timing on destruction. StopAndLog() is a no-op if the timer was
	// never enabled or has already been stopped.
	ScopedLoggingTimer::~ScopedLoggingTimer() { StopAndLog(); }

	// Returns a status with the same code as `prior` whose message is
	// "<context>: <prior message>". CHECK-fails if `prior` is OK.
	Status AddStatus(Status prior, absl::string_view context) {
	  CHECK(!prior.ok());
	  const std::string prefixed_message =
	      absl::StrCat(context, ": ", prior.error_message());
	  return Status{prior.code(), prefixed_message};
	}

	// Returns a status with the same code as `prior` whose message is
	// "<prior message>: <context>". CHECK-fails if `prior` is OK.
	Status AppendStatus(Status prior, absl::string_view context) {
	  CHECK(!prior.ok());
	  const std::string suffixed_message =
	      absl::StrCat(prior.error_message(), ": ", context);
	  return Status{prior.code(), suffixed_message};
	}

	// Re-indents every line of `original`: each '\n'-separated line has its
	// leading and trailing ASCII whitespace stripped and `indentation` prepended.
	// The lines are joined back with '\n'; empty lines still receive the
	// indentation.
	string Reindent(absl::string_view original,
	                const absl::string_view indentation) {
	  // Split into views of `original` instead of std::string copies; the views
	  // are only used within this call, while `original` is still alive.
	  std::vector<absl::string_view> lines = absl::StrSplit(original, '\n');
	  return absl::StrJoin(
	      lines, "\n", [indentation](string* out, absl::string_view line) {
	        absl::StrAppend(out, indentation, absl::StripAsciiWhitespace(line));
	      });
	}

	// Returns true iff `permutation` has exactly `rank` elements and contains each
	// of the values 0, 1, ..., rank - 1 exactly once (in any order).
	bool IsPermutation(absl::Span<const int64> permutation, int64 rank) {
	  if (rank < 0 || static_cast<size_t>(rank) != permutation.size()) {
	    return false;
	  }
	  // Single pass with a "seen" table rather than absl::c_is_permutation, which
	  // is quadratic in the worst case.
	  absl::InlinedVector<bool, 8> seen(permutation.size(), false);
	  for (const int64 value : permutation) {
	    if (value < 0 || value >= rank || seen[value]) {
	      return false;
	    }
	    seen[value] = true;
	  }
	  return true;
	}

	// Returns the inverse `q` of `input_permutation` `p`, i.e. q[p[i]] == i for
	// every index i. The input is DCHECKed to be a valid permutation; element
	// accesses go through the bounds-checked at().
	std::vector<int64> InversePermutation(
	    absl::Span<const int64> input_permutation) {
	  DCHECK(IsPermutation(input_permutation, input_permutation.size()));
	  const size_t rank = input_permutation.size();
	  std::vector<int64> inverse(rank, -1);
	  size_t source = 0;
	  while (source < rank) {
	    inverse.at(input_permutation.at(source)) = source;
	    ++source;
	  }
	  return inverse;
	}

	// Returns the composition of two equally sized permutations:
	// output[i] = p1[p2[i]]. CHECK-fails if the sizes differ; lookups into `p1`
	// use the bounds-checked at().
	std::vector<int64> ComposePermutations(absl::Span<const int64> p1,
	                                       absl::Span<const int64> p2) {
	  CHECK_EQ(p1.size(), p2.size());
	  std::vector<int64> output;
	  // The result size is known up front, so allocate once.
	  output.reserve(p2.size());
	  for (const int64 index : p2) {
	    output.push_back(p1.at(index));
	  }
	  return output;
	}

	// Returns true iff permutation[i] == i for every index i (trivially true for
	// an empty span).
	bool IsIdentityPermutation(absl::Span<const int64> permutation) {
	  int64 expected = 0;
	  for (const int64 actual : permutation) {
	    if (actual != expected) {
	      return false;
	    }
	    ++expected;
	  }
	  return true;
	}

	// Formats a bfloat16 by widening it to float and printing with "%.4g".
	string RoundTripFpToString(tensorflow::bfloat16 value) {
	  const float widened = static_cast<float>(value);
	  return absl::StrFormat("%.4g", widened);
	}

	// Formats a half by widening it to float and printing with "%.5g".
	string RoundTripFpToString(Eigen::half value) {
	  const float widened = static_cast<float>(value);
	  return absl::StrFormat("%.5g", widened);
	}

	// Formats a float via tensorflow::strings::FloatToBuffer.
	string RoundTripFpToString(float value) {
	  char text[tensorflow::strings::kFastToBufferSize];
	  tensorflow::strings::FloatToBuffer(value, text);
	  return string(text);
	}

	// Formats a double via tensorflow::strings::DoubleToBuffer.
	string RoundTripFpToString(double value) {
	  char text[tensorflow::strings::kFastToBufferSize];
	  tensorflow::strings::DoubleToBuffer(value, text);
	  return string(text);
	}

	// Builds a PaddingConfig with `rank` dimensions in which the low edge, high
	// edge and interior padding are all zero.
	PaddingConfig MakeNoPaddingConfig(int64 rank) {
	  PaddingConfig result;
	  for (int64 dim_index = 0; dim_index < rank; ++dim_index) {
	    auto* dim = result.add_dimensions();
	    dim->set_interior_padding(0);
	    dim->set_edge_padding_high(0);
	    dim->set_edge_padding_low(0);
	  }
	  return result;
	}

	// Builds a PaddingConfig with one dimension per entry of `padding`. Each entry
	// is a (low, high) edge-padding pair; interior padding is set to zero.
	PaddingConfig MakeEdgePaddingConfig(
	    absl::Span<const std::pair<int64, int64>> padding) {
	  PaddingConfig result;
	  for (const auto& low_high : padding) {
	    auto* dim = result.add_dimensions();
	    dim->set_interior_padding(0);
	    dim->set_edge_padding_high(low_high.second);
	    dim->set_edge_padding_low(low_high.first);
	  }
	  return result;
	}

	bool HasInteriorPadding(const PaddingConfig& config) {
	for (const auto& dim : config.dimensions()) {
	if (dim.interior_padding() != 0) {
	return true;
	}
	}
	return false;
	}

	namespace {
	// Formats the rate `flops / nanoseconds` (scaled to operations per second) as
	// "<number><op_prefix>OP/s"; with op_prefix "FL" the unit reads "FLOP/s".
	// A zero duration yields "NaN <op_prefix>OP/s".
	string HumanReadableNumOps(double flops, double nanoseconds,
	                           absl::string_view op_prefix) {
	  if (nanoseconds == 0) {
	    return absl::StrCat("NaN ", op_prefix, "OP/s");
	  }
	  const double ops_per_nanosecond = flops / nanoseconds;
	  string rate = tensorflow::strings::HumanReadableNum(
	      static_cast<int64>(ops_per_nanosecond * 1e9));
	  // Use the more common "G(FLOPS)", rather than "B(FLOPS)": swap a trailing
	  // 'B' or 'b' for 'G'.
	  const bool ends_in_b =
	      !rate.empty() && (rate.back() == 'B' || rate.back() == 'b');
	  if (ends_in_b) {
	    rate.back() = 'G';
	  }
	  rate += absl::StrCat(op_prefix, "OP/s");
	  return rate;
	}
	}  // namespace

	// Formats `flops` operations over `nanoseconds` as a rate string with the
	// "FLOP/s" unit (prefix "FL" passed to HumanReadableNumOps).
	string HumanReadableNumFlops(double flops, double nanoseconds) {
	return HumanReadableNumOps(flops, nanoseconds, "FL");
	}

	// Formats `trops` operations over `nanoseconds` as a rate string with the
	// "TROP/s" unit (prefix "TR" passed to HumanReadableNumOps).
	string HumanReadableNumTranscendentalOps(double trops, double nanoseconds) {
	return HumanReadableNumOps(trops, nanoseconds, "TR");
	}

	void LogLines(int sev, absl::string_view text, const char* fname, int lineno) {
	const int orig_sev = sev;
	if (sev == tensorflow::FATAL) {
	sev = tensorflow::ERROR;
	}

	// Protect calls with a mutex so we don't interleave calls to LogLines from
	// multiple threads.
	static tensorflow::mutex log_lines_mu(tensorflow::LINKER_INITIALIZED);
	tensorflow::mutex_lock lock(log_lines_mu);

	size_t cur = 0;
	while (cur < text.size()) {
	size_t eol = text.find('\n', cur);
	if (eol == absl::string_view::npos) {
	eol = text.size();
	}
	auto msg = text.substr(cur, eol - cur);
	tensorflow::internal::LogString(fname, lineno, sev,
	string(msg.data(), msg.size()));
	cur = eol + 1;
	}

	if (orig_sev == tensorflow::FATAL) {
	tensorflow::internal::LogString(fname, lineno, orig_sev,
	"Aborting due to errors.");
	}
	}

	// Returns the product of all elements of `xs`; an empty span yields 1.
	int64 Product(absl::Span<const int64> xs) {
	  int64 product = 1;
	  for (const int64 factor : xs) {
	    product *= factor;
	  }
	  return product;
	}

	// Returns the index pairs (i, j) at which the product of a[0, i) equals the
	// product of b[0, j), in the order they are encountered, starting with (0, 0).
	// CHECK-fails unless Product(a) == Product(b). When that product is zero, only
	// the two end points {(0, 0), (a.size(), b.size())} are returned.
	absl::InlinedVector<std::pair<int64, int64>, 8> CommonFactors(
	    absl::Span<const int64> a, absl::Span<const int64> b) {
	  CHECK_EQ(Product(a), Product(b));
	  if (0 == Product(a)) {
	    return {std::make_pair(0, 0), std::make_pair(a.size(), b.size())};
	  }

	  absl::InlinedVector<std::pair<int64, int64>, 8> bounds;
	  // i / j: next unconsumed index into a / b.
	  // prior_i / prior_j: the most recently recorded pair.
	  // partial_size_a / partial_size_b: products of a[0, i) / b[0, j).
	  // The loop has no condition or increment clause; it exits via `break`.
	  for (int64 i = 0, j = 0, prior_i = -1, prior_j = -1, partial_size_a = 1,
	             partial_size_b = 1;
	       ;) {
	    // Record a bound whenever the partial products agree and at least one
	    // index has moved since the last recorded pair.
	    if (partial_size_a == partial_size_b && (i > prior_i || j > prior_j)) {
	      std::tie(prior_i, prior_j) = std::make_pair(i, j);
	      bounds.emplace_back(i, j);
	      continue;
	    }
	    bool in_bounds_i = i < a.size();
	    bool in_bounds_j = j < b.size();
	    // Both inputs are exhausted: done.
	    if (!(in_bounds_i || in_bounds_j)) {
	      break;
	    }
	    // Advance the side whose partial product is smaller. When the products
	    // are tied, advance the side with the smaller next dimension (both sides
	    // if those are equal); if only one side has elements left, advance it.
	    bool next_a =
	        partial_size_a < partial_size_b ||
	        (in_bounds_i &&
	         (!in_bounds_j || (partial_size_a == partial_size_b && a[i] <= b[j])));
	    bool next_b =
	        partial_size_b < partial_size_a ||
	        (in_bounds_j &&
	         (!in_bounds_i || (partial_size_b == partial_size_a && b[j] <= a[i])));
	    if (next_a) {
	      partial_size_a *= a[i];
	      ++i;
	    }
	    if (next_b) {
	      partial_size_b *= b[j];
	      ++j;
	    }
	  }
	  return bounds;
	}

	// Returns `file_name` with every '/', '\\', '[', ']' and space character
	// replaced by '_'. All other characters are left untouched.
	string SanitizeFileName(string file_name) {
	  for (char& ch : file_name) {
	    switch (ch) {
	      case '/':
	      case '\\':
	      case '[':
	      case ']':
	      case ' ':
	        ch = '_';
	        break;
	      default:
	        break;
	    }
	  }
	  return file_name;
	}

	// Utility function to split a double-precision float (F64) into a pair of F32s.
	// For a p-bit number, and a splitting point (p/2) <= s <= (p - 1), the
	// algorithm produces a (p - s)-bit value 'hi' and a non-overlapping (s - 1)-bit
	// value 'lo'. See Theorem 4 in [1] (attributed to Dekker) or [2] for the
	// original theorem by Dekker.
	//
	// For double-precision F64s, which contain a 53 bit mantissa (52 of them
	// explicit), we can represent the most significant 49 digits as the unevaluated
	// sum of two single-precision floats 'hi' and 'lo'. The 'hi' float stores the
	// most significant 24 bits and the sign bit of 'lo' together with its mantissa
	// store the remaining 25 bits. The exponent of the resulting representation is
	// still restricted to 8 bits of F32.
	//
	// Returns the pair (hi, lo). Infinite inputs return (+/-inf, 0); any other
	// input whose float conversion is not finite (including NaN) fails the CHECK
	// below.
	//
	// References:
	// [1] A. Thall, Extended-Precision Floating-Point Numbers for GPU Computation,
	//     SIGGRAPH Research Posters, 2006.
	//     (http://andrewthall.org/papers/df64_qf128.pdf)
	// [2] T. J. Dekker, A floating point technique for extending the available
	//     precision, Numerische Mathematik, vol. 18, pp. 224–242, 1971.
	std::pair<float, float> SplitF64ToF32(double x) {
	// Early return if x is equal to infinity or -infinity.
	if (std::isinf(x)) {
	return std::make_pair(static_cast<float>(x), 0.0f);
	}

	// Following [1], the splitter is chosen as 2^{s} + 1, so that the most
	// significant (p - s) bits comprise the mantissa of 'hi'. Here
	// s = digits(double) - digits(float), i.e. the splitter is 2^{29} + 1.
	static_assert(std::numeric_limits<double>::radix == 2,
	"Double is not Binary FP");
	constexpr double kSplitter = (1 << (std::numeric_limits<double>::digits -
	std::numeric_limits<float>::digits)) +
	1;

	// Only values within the range of F32 are supported, unless it is infinity.
	// Small values with large negative exponents would be rounded to zero.
	CHECK(std::isfinite(static_cast<float>(x))) << x;

	// The value of '(shifted - x)' should algebraically be exactly 2^{29} * x,
	// but it can be a bit smaller because of rounding to 53 bits in the
	// computation of '(2^29 + 1) * x'. This overestimates the value of 'hi' by a
	// multiple of 2^{-29} (assuming exponent was 0), and makes 'lo' negative. An
	// extra bit is squeezed into the 'sign' bit of 'lo' to represent 25 bits of
	// significand.
	const double shifted = kSplitter * x;
	// TODO(anudhyan): Write a test to ensure that compiler is not optimizing away
	// the following computation to 'hi = x;'.
	const float hi = shifted - (shifted - x);
	const float lo = x - hi;
	return std::make_pair(hi, lo);
	}

	} // namespace xla