Benchmarks: add Eigen half<->float conversion routines (extracted into third-party/eigen-half.h) to the comparative element and array benchmarks, and group existing comparative benchmarks consistently
diff --git a/bench/from-ieee-array.cc b/bench/from-ieee-array.cc
index 49a9150..2e898a6 100644
--- a/bench/from-ieee-array.cc
+++ b/bench/from-ieee-array.cc
@@ -16,10 +16,11 @@
#endif
#ifdef FP16_COMPARATIVE_BENCHMARKS
- #include <third-party/float16-compressor.h>
#include <third-party/THHalf.h>
- #include <third-party/half.hpp>
#include <third-party/npy-halffloat.h>
+ #include <third-party/eigen-half.h>
+ #include <third-party/float16-compressor.h>
+ #include <third-party/half.hpp>
#endif
@@ -214,6 +215,33 @@
}
BENCHMARK(npy_halfbits_to_floatbits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
+ static void Eigen_half_to_float(benchmark::State& state) {
+ const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
+ auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
+
+ std::vector<uint16_t> fp16(state.range(0));
+ std::vector<float> fp32(state.range(0));
+ std::generate(fp16.begin(), fp16.end(),
+ [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
+
+ while (state.KeepRunning()) {
+ uint16_t* input = fp16.data();
+ benchmark::DoNotOptimize(input);
+
+ float* output = fp32.data();
+ const size_t n = state.range(0);
+ for (size_t i = 0; i < n; i++) {
+ output[i] =
+ Eigen::half_impl::half_to_float(
+ Eigen::half_impl::raw_uint16_to_half(input[i]));
+ }
+
+ benchmark::DoNotOptimize(output);
+ }
+ state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
+ }
+ BENCHMARK(Eigen_half_to_float)->RangeMultiplier(2)->Range(1<<10, 64<<20);
+
static void Float16Compressor_decompress(benchmark::State& state) {
const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
diff --git a/bench/ieee-element.cc b/bench/ieee-element.cc
index f85c5ab..fc660e9 100644
--- a/bench/ieee-element.cc
+++ b/bench/ieee-element.cc
@@ -10,10 +10,11 @@
#endif
#ifdef FP16_COMPARATIVE_BENCHMARKS
- #include <third-party/float16-compressor.h>
#include <third-party/THHalf.h>
- #include <third-party/half.hpp>
#include <third-party/npy-halffloat.h>
+ #include <third-party/eigen-half.h>
+ #include <third-party/float16-compressor.h>
+ #include <third-party/half.hpp>
#endif
static inline uint16_t next_xorshift16(uint16_t x) {
@@ -89,17 +90,6 @@
#endif
#ifdef FP16_COMPARATIVE_BENCHMARKS
- static void Float16Compressor_decompress(benchmark::State& state) {
- uint16_t fp16 = UINT16_C(0x7C00);
- while (state.KeepRunning()) {
- const float fp32 = Float16Compressor::decompress(fp16);
-
- fp16 = next_xorshift16(fp16);
- benchmark::DoNotOptimize(fp32);
- }
- }
- BENCHMARK(Float16Compressor_decompress);
-
static void TH_halfbits2float(benchmark::State& state) {
uint16_t fp16 = UINT16_C(0x7C00);
while (state.KeepRunning()) {
@@ -112,6 +102,41 @@
}
BENCHMARK(TH_halfbits2float);
+ static void npy_halfbits_to_floatbits(benchmark::State& state) {
+ uint16_t fp16 = UINT16_C(0x7C00);
+ while (state.KeepRunning()) {
+ const uint32_t fp32 = npy_halfbits_to_floatbits(fp16);
+
+ fp16 = next_xorshift16(fp16);
+ benchmark::DoNotOptimize(fp32);
+ }
+ }
+ BENCHMARK(npy_halfbits_to_floatbits);
+
+ static void Eigen_half_to_float(benchmark::State& state) {
+ uint16_t fp16 = UINT16_C(0x7C00);
+ while (state.KeepRunning()) {
+ const float fp32 =
+ Eigen::half_impl::half_to_float(
+ Eigen::half_impl::raw_uint16_to_half(fp16));
+
+ fp16 = next_xorshift16(fp16);
+ benchmark::DoNotOptimize(fp32);
+ }
+ }
+ BENCHMARK(Eigen_half_to_float);
+
+ static void Float16Compressor_decompress(benchmark::State& state) {
+ uint16_t fp16 = UINT16_C(0x7C00);
+ while (state.KeepRunning()) {
+ const float fp32 = Float16Compressor::decompress(fp16);
+
+ fp16 = next_xorshift16(fp16);
+ benchmark::DoNotOptimize(fp32);
+ }
+ }
+ BENCHMARK(Float16Compressor_decompress);
+
static void half_float_detail_half2float_table(benchmark::State& state) {
uint16_t fp16 = UINT16_C(0x7C00);
while (state.KeepRunning()) {
@@ -135,17 +160,6 @@
}
}
BENCHMARK(half_float_detail_half2float_branch);
-
- static void npy_halfbits_to_floatbits(benchmark::State& state) {
- uint16_t fp16 = UINT16_C(0x7C00);
- while (state.KeepRunning()) {
- const uint32_t fp32 = npy_halfbits_to_floatbits(fp16);
-
- fp16 = next_xorshift16(fp16);
- benchmark::DoNotOptimize(fp32);
- }
- }
- BENCHMARK(npy_halfbits_to_floatbits);
#endif
/* Conversion from IEEE FP32 to IEEE FP16 */
@@ -176,17 +190,6 @@
#endif
#ifdef FP16_COMPARATIVE_BENCHMARKS
- static void Float16Compressor_compress(benchmark::State& state) {
- uint32_t fp32 = UINT32_C(0x7F800000);
- while (state.KeepRunning()) {
- const uint16_t fp16 = Float16Compressor::compress(fp32_from_bits(fp32));
-
- fp32 = next_xorshift32(fp32);
- benchmark::DoNotOptimize(fp16);
- }
- }
- BENCHMARK(Float16Compressor_compress);
-
static void TH_float2halfbits(benchmark::State& state) {
uint32_t fp32 = UINT32_C(0x7F800000);
while (state.KeepRunning()) {
@@ -200,6 +203,41 @@
}
BENCHMARK(TH_float2halfbits);
+ static void npy_floatbits_to_halfbits(benchmark::State& state) {
+ uint32_t fp32 = UINT32_C(0x7F800000);
+ while (state.KeepRunning()) {
+ const uint16_t fp16 = npy_floatbits_to_halfbits(fp32);
+
+ fp32 = next_xorshift32(fp32);
+ benchmark::DoNotOptimize(fp16);
+ }
+ }
+ BENCHMARK(npy_floatbits_to_halfbits);
+
+ static void Eigen_float_to_half_rtne(benchmark::State& state) {
+ uint32_t fp32 = UINT32_C(0x7F800000);
+ while (state.KeepRunning()) {
+ const Eigen::half_impl::__half fp16 =
+ Eigen::half_impl::float_to_half_rtne(
+ fp32_from_bits(fp32));
+
+ fp32 = next_xorshift32(fp32);
+ benchmark::DoNotOptimize(fp16);
+ }
+ }
+ BENCHMARK(Eigen_float_to_half_rtne);
+
+ static void Float16Compressor_compress(benchmark::State& state) {
+ uint32_t fp32 = UINT32_C(0x7F800000);
+ while (state.KeepRunning()) {
+ const uint16_t fp16 = Float16Compressor::compress(fp32_from_bits(fp32));
+
+ fp32 = next_xorshift32(fp32);
+ benchmark::DoNotOptimize(fp16);
+ }
+ }
+ BENCHMARK(Float16Compressor_compress);
+
static void half_float_detail_float2half_table(benchmark::State& state) {
uint32_t fp32 = UINT32_C(0x7F800000);
while (state.KeepRunning()) {
@@ -225,17 +263,6 @@
}
}
BENCHMARK(half_float_detail_float2half_branch);
-
- static void npy_floatbits_to_halfbits(benchmark::State& state) {
- uint32_t fp32 = UINT32_C(0x7F800000);
- while (state.KeepRunning()) {
- const uint16_t fp16 = npy_floatbits_to_halfbits(fp32);
-
- fp32 = next_xorshift32(fp32);
- benchmark::DoNotOptimize(fp16);
- }
- }
- BENCHMARK(npy_floatbits_to_halfbits);
#endif
BENCHMARK_MAIN();
diff --git a/bench/to-ieee-array.cc b/bench/to-ieee-array.cc
index 1dedcc0..80f7c2a 100644
--- a/bench/to-ieee-array.cc
+++ b/bench/to-ieee-array.cc
@@ -16,10 +16,11 @@
#endif
#ifdef FP16_COMPARATIVE_BENCHMARKS
- #include <third-party/float16-compressor.h>
#include <third-party/THHalf.h>
- #include <third-party/half.hpp>
#include <third-party/npy-halffloat.h>
+ #include <third-party/eigen-half.h>
+ #include <third-party/float16-compressor.h>
+ #include <third-party/half.hpp>
#endif
@@ -150,6 +151,30 @@
}
BENCHMARK(npy_floatbits_to_halfbits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
+ static void Eigen_float_to_half_rtne(benchmark::State& state) {
+ const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
+ auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
+
+ std::vector<float> fp32(state.range(0));
+ std::vector<uint16_t> fp16(state.range(0));
+ std::generate(fp32.begin(), fp32.end(), std::ref(rng));
+
+ while (state.KeepRunning()) {
+ float* input = fp32.data();
+ benchmark::DoNotOptimize(input);
+
+ uint16_t* output = fp16.data();
+ const size_t n = state.range(0);
+ for (size_t i = 0; i < n; i++) {
+ output[i] = Eigen::half_impl::float_to_half_rtne(input[i]).x;
+ }
+
+ benchmark::DoNotOptimize(output);
+ }
+ state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
+ }
+ BENCHMARK(Eigen_float_to_half_rtne)->RangeMultiplier(2)->Range(1<<10, 64<<20);
+
static void Float16Compressor_compress(benchmark::State& state) {
const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
diff --git a/third-party/eigen-half.h b/third-party/eigen-half.h
new file mode 100644
index 0000000..f7181cd
--- /dev/null
+++ b/third-party/eigen-half.h
@@ -0,0 +1,170 @@
+/*
+ * This implementation is extracted from Eigen:
+ * Repo: bitbucket.org/eigen/eigen
+ * File: Eigen/src/Core/arch/CUDA/Half.h
+ * Commit ID: 96e0f73a35de54f675d825bef5339b2f08e77eb4
+ *
+ * Removed a lot of redundant and cuda-specific code.
+ */
+
+#define EIGEN_STRONG_INLINE static inline
+#define EIGEN_DEVICE_FUNC
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The conversion routines are Copyright (c) Fabian Giesen, 2016.
+// The original license follows:
+//
+// Copyright (c) Fabian Giesen, 2016
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// type Eigen::half (inheriting from CUDA's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the likes), but fast on GPUs.
+
+
+#ifndef EIGEN_HALF_CUDA_H
+#define EIGEN_HALF_CUDA_H
+
+namespace Eigen {
+
+namespace half_impl {
+
+// Make our own __half definition that is similar to CUDA's.
+struct __half {
+ EIGEN_DEVICE_FUNC __half() : x(0) {}
+ explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+ unsigned short x;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+
+// Conversion routines, including fallbacks for the host or older CUDA.
+// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
+// these in hardware. If we need more performance on older/other CPUs, they are
+// also possible to vectorize directly.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
+ __half h;
+ h.x = x;
+ return h;
+}
+
+union FP32 {
+ unsigned int u;
+ float f;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+ return __float2half(ff);
+
+#elif defined(EIGEN_HAS_FP16_C)
+ __half h;
+ h.x = _cvtss_sh(ff, 0);
+ return h;
+
+#else
+ FP32 f; f.f = ff;
+
+ const FP32 f32infty = { 255 << 23 };
+ const FP32 f16max = { (127 + 16) << 23 };
+ const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+ unsigned int sign_mask = 0x80000000u;
+ __half o;
+ o.x = static_cast<unsigned short>(0x0u);
+
+ unsigned int sign = f.u & sign_mask;
+ f.u ^= sign;
+
+ // NOTE all the integer compares in this function can be safely
+ // compiled into signed compares since all operands are below
+ // 0x80000000. Important if you want fast straight SSE2 code
+ // (since there's no unsigned PCMPGTD).
+
+ if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
+ o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+ } else { // (De)normalized number or zero
+ if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
+ // use a magic value to align our 10 mantissa bits at the bottom of
+ // the float. as long as FP addition is round-to-nearest-even this
+ // just works.
+ f.f += denorm_magic.f;
+
+ // and one integer subtract of the bias later, we have our final float!
+ o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
+ } else {
+ unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+
+ // update exponent, rounding bias part 1
+ f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
+ // rounding bias part 2
+ f.u += mant_odd;
+ // take the bits!
+ o.x = static_cast<unsigned short>(f.u >> 13);
+ }
+ }
+
+ o.x |= static_cast<unsigned short>(sign >> 16);
+ return o;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+ return __half2float(h);
+
+#elif defined(EIGEN_HAS_FP16_C)
+ return _cvtsh_ss(h.x);
+
+#else
+ const FP32 magic = { 113 << 23 };
+ const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+ FP32 o;
+
+ o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
+ unsigned int exp = shifted_exp & o.u; // just the exponent
+ o.u += (127 - 15) << 23; // exponent adjust
+
+ // handle exponent special cases
+ if (exp == shifted_exp) { // Inf/NaN?
+ o.u += (128 - 16) << 23; // extra exp adjust
+ } else if (exp == 0) { // Zero/Denormal?
+ o.u += 1 << 23; // extra exp adjust
+ o.f -= magic.f; // renormalize
+ }
+
+ o.u |= (h.x & 0x8000) << 16; // sign bit
+ return o.f;
+#endif
+}
+
+} // end namespace half_impl
+
+} // end namespace Eigen
+
+#endif // EIGEN_HALF_CUDA_H