Compare performance with Eigen
diff --git a/bench/from-ieee-array.cc b/bench/from-ieee-array.cc
index 49a9150..2e898a6 100644
--- a/bench/from-ieee-array.cc
+++ b/bench/from-ieee-array.cc
@@ -16,10 +16,11 @@
 #endif
 
 #ifdef FP16_COMPARATIVE_BENCHMARKS
-	#include <third-party/float16-compressor.h>
 	#include <third-party/THHalf.h>
-	#include <third-party/half.hpp>
 	#include <third-party/npy-halffloat.h>
+	#include <third-party/eigen-half.h>
+	#include <third-party/float16-compressor.h>
+	#include <third-party/half.hpp>
 #endif
 
 
@@ -214,6 +215,33 @@
 	}
 	BENCHMARK(npy_halfbits_to_floatbits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
 
+	static void Eigen_half_to_float(benchmark::State& state) {
+		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
+		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
+
+		std::vector<uint16_t> fp16(state.range(0));
+		std::vector<float> fp32(state.range(0));
+		std::generate(fp16.begin(), fp16.end(),
+			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
+
+		while (state.KeepRunning()) {
+			uint16_t* input = fp16.data();
+			benchmark::DoNotOptimize(input);
+
+			float* output = fp32.data();
+			const size_t n = state.range(0);
+			for (size_t i = 0; i < n; i++) {
+				output[i] =
+					Eigen::half_impl::half_to_float(
+						Eigen::half_impl::raw_uint16_to_half(input[i]));
+			}
+
+			benchmark::DoNotOptimize(output);
+		}
+		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
+	}
+	BENCHMARK(Eigen_half_to_float)->RangeMultiplier(2)->Range(1<<10, 64<<20);
+
 	static void Float16Compressor_decompress(benchmark::State& state) {
 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
diff --git a/bench/ieee-element.cc b/bench/ieee-element.cc
index f85c5ab..fc660e9 100644
--- a/bench/ieee-element.cc
+++ b/bench/ieee-element.cc
@@ -10,10 +10,11 @@
 #endif
 
 #ifdef FP16_COMPARATIVE_BENCHMARKS
-	#include <third-party/float16-compressor.h>
 	#include <third-party/THHalf.h>
-	#include <third-party/half.hpp>
 	#include <third-party/npy-halffloat.h>
+	#include <third-party/eigen-half.h>
+	#include <third-party/float16-compressor.h>
+	#include <third-party/half.hpp>
 #endif
 
 static inline uint16_t next_xorshift16(uint16_t x) {
@@ -89,17 +90,6 @@
 #endif
 
 #ifdef FP16_COMPARATIVE_BENCHMARKS
-	static void Float16Compressor_decompress(benchmark::State& state) {
-		uint16_t fp16 = UINT16_C(0x7C00);
-		while (state.KeepRunning()) {
-			const float fp32 = Float16Compressor::decompress(fp16);
-
-			fp16 = next_xorshift16(fp16);
-			benchmark::DoNotOptimize(fp32);
-		}
-	}
-	BENCHMARK(Float16Compressor_decompress);
-
 	static void TH_halfbits2float(benchmark::State& state) {
 		uint16_t fp16 = UINT16_C(0x7C00);
 		while (state.KeepRunning()) {
@@ -112,6 +102,41 @@
 	}
 	BENCHMARK(TH_halfbits2float);
 
+	static void npy_halfbits_to_floatbits(benchmark::State& state) {
+		uint16_t fp16 = UINT16_C(0x7C00);
+		while (state.KeepRunning()) {
+			const uint32_t fp32 = npy_halfbits_to_floatbits(fp16);
+
+			fp16 = next_xorshift16(fp16);
+			benchmark::DoNotOptimize(fp32);
+		}
+	}
+	BENCHMARK(npy_halfbits_to_floatbits);
+
+	static void Eigen_half_to_float(benchmark::State& state) {
+		uint16_t fp16 = UINT16_C(0x7C00);
+		while (state.KeepRunning()) {
+			const float fp32 =
+				Eigen::half_impl::half_to_float(
+					Eigen::half_impl::raw_uint16_to_half(fp16));
+
+			fp16 = next_xorshift16(fp16);
+			benchmark::DoNotOptimize(fp32);
+		}
+	}
+	BENCHMARK(Eigen_half_to_float);
+
+	static void Float16Compressor_decompress(benchmark::State& state) {
+		uint16_t fp16 = UINT16_C(0x7C00);
+		while (state.KeepRunning()) {
+			const float fp32 = Float16Compressor::decompress(fp16);
+
+			fp16 = next_xorshift16(fp16);
+			benchmark::DoNotOptimize(fp32);
+		}
+	}
+	BENCHMARK(Float16Compressor_decompress);
+
 	static void half_float_detail_half2float_table(benchmark::State& state) {
 		uint16_t fp16 = UINT16_C(0x7C00);
 		while (state.KeepRunning()) {
@@ -135,17 +160,6 @@
 		}
 	}
 	BENCHMARK(half_float_detail_half2float_branch);
-
-	static void npy_halfbits_to_floatbits(benchmark::State& state) {
-		uint16_t fp16 = UINT16_C(0x7C00);
-		while (state.KeepRunning()) {
-			const uint32_t fp32 = npy_halfbits_to_floatbits(fp16);
-
-			fp16 = next_xorshift16(fp16);
-			benchmark::DoNotOptimize(fp32);
-		}
-	}
-	BENCHMARK(npy_halfbits_to_floatbits);
 #endif
 
 /* Conversion from IEEE FP32 to IEEE FP16 */
@@ -176,17 +190,6 @@
 #endif
 
 #ifdef FP16_COMPARATIVE_BENCHMARKS
-	static void Float16Compressor_compress(benchmark::State& state) {
-		uint32_t fp32 = UINT32_C(0x7F800000);
-		while (state.KeepRunning()) {
-			const uint16_t fp16 = Float16Compressor::compress(fp32_from_bits(fp32));
-
-			fp32 = next_xorshift32(fp32);
-			benchmark::DoNotOptimize(fp16);
-		}
-	}
-	BENCHMARK(Float16Compressor_compress);
-
 	static void TH_float2halfbits(benchmark::State& state) {
 		uint32_t fp32 = UINT32_C(0x7F800000);
 		while (state.KeepRunning()) {
@@ -200,6 +203,41 @@
 	}
 	BENCHMARK(TH_float2halfbits);
 
+	static void npy_floatbits_to_halfbits(benchmark::State& state) {
+		uint32_t fp32 = UINT32_C(0x7F800000);
+		while (state.KeepRunning()) {
+			const uint16_t fp16 = npy_floatbits_to_halfbits(fp32);
+
+			fp32 = next_xorshift32(fp32);
+			benchmark::DoNotOptimize(fp16);
+		}
+	}
+	BENCHMARK(npy_floatbits_to_halfbits);
+
+	static void Eigen_float_to_half_rtne(benchmark::State& state) {
+		uint32_t fp32 = UINT32_C(0x7F800000);
+		while (state.KeepRunning()) {
+			const Eigen::half_impl::__half fp16 =
+				Eigen::half_impl::float_to_half_rtne(
+					fp32_from_bits(fp32));
+
+			fp32 = next_xorshift32(fp32);
+			benchmark::DoNotOptimize(fp16);
+		}
+	}
+	BENCHMARK(Eigen_float_to_half_rtne);
+
+	static void Float16Compressor_compress(benchmark::State& state) {
+		uint32_t fp32 = UINT32_C(0x7F800000);
+		while (state.KeepRunning()) {
+			const uint16_t fp16 = Float16Compressor::compress(fp32_from_bits(fp32));
+
+			fp32 = next_xorshift32(fp32);
+			benchmark::DoNotOptimize(fp16);
+		}
+	}
+	BENCHMARK(Float16Compressor_compress);
+
 	static void half_float_detail_float2half_table(benchmark::State& state) {
 		uint32_t fp32 = UINT32_C(0x7F800000);
 		while (state.KeepRunning()) {
@@ -225,17 +263,6 @@
 		}
 	}
 	BENCHMARK(half_float_detail_float2half_branch);
-
-	static void npy_floatbits_to_halfbits(benchmark::State& state) {
-		uint32_t fp32 = UINT32_C(0x7F800000);
-		while (state.KeepRunning()) {
-			const uint16_t fp16 = npy_floatbits_to_halfbits(fp32);
-
-			fp32 = next_xorshift32(fp32);
-			benchmark::DoNotOptimize(fp16);
-		}
-	}
-	BENCHMARK(npy_floatbits_to_halfbits);
 #endif
 
 BENCHMARK_MAIN();
diff --git a/bench/to-ieee-array.cc b/bench/to-ieee-array.cc
index 1dedcc0..80f7c2a 100644
--- a/bench/to-ieee-array.cc
+++ b/bench/to-ieee-array.cc
@@ -16,10 +16,11 @@
 #endif
 
 #ifdef FP16_COMPARATIVE_BENCHMARKS
-	#include <third-party/float16-compressor.h>
 	#include <third-party/THHalf.h>
-	#include <third-party/half.hpp>
 	#include <third-party/npy-halffloat.h>
+	#include <third-party/eigen-half.h>
+	#include <third-party/float16-compressor.h>
+	#include <third-party/half.hpp>
 #endif
 
 
@@ -150,6 +151,30 @@
 	}
 	BENCHMARK(npy_floatbits_to_halfbits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
 
+	static void Eigen_float_to_half_rtne(benchmark::State& state) {
+		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
+		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
+
+		std::vector<float> fp32(state.range(0));
+		std::vector<uint16_t> fp16(state.range(0));
+		std::generate(fp32.begin(), fp32.end(), std::ref(rng));
+
+		while (state.KeepRunning()) {
+			float* input = fp32.data();
+			benchmark::DoNotOptimize(input);
+
+			uint16_t* output = fp16.data();
+			const size_t n = state.range(0);
+			for (size_t i = 0; i < n; i++) {
+				output[i] = Eigen::half_impl::float_to_half_rtne(input[i]).x;
+			}
+
+			benchmark::DoNotOptimize(output);
+		}
+		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
+	}
+	BENCHMARK(Eigen_float_to_half_rtne)->RangeMultiplier(2)->Range(1<<10, 64<<20);
+
 	static void Float16Compressor_compress(benchmark::State& state) {
 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
diff --git a/third-party/eigen-half.h b/third-party/eigen-half.h
new file mode 100644
index 0000000..f7181cd
--- /dev/null
+++ b/third-party/eigen-half.h
@@ -0,0 +1,170 @@
+/*
+ * This implementation is extracted from Eigen:
+ *   Repo: bitbucket.org/eigen/eigen
+ *   File: Eigen/src/Core/arch/CUDA/Half.h
+ *   Commit ID: 96e0f73a35de54f675d825bef5339b2f08e77eb4
+ *
+ * Removed a lot of redundant and cuda-specific code.
+ */
+
+#define EIGEN_STRONG_INLINE static inline
+#define EIGEN_DEVICE_FUNC
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The conversion routines are Copyright (c) Fabian Giesen, 2016.
+// The original license follows:
+//
+// Copyright (c) Fabian Giesen, 2016
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// type Eigen::half (inheriting from CUDA's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the likes), but fast on GPUs.
+
+
+#ifndef EIGEN_HALF_CUDA_H
+#define EIGEN_HALF_CUDA_H
+
+namespace Eigen {
+
+namespace half_impl {
+
+// Make our own __half definition that is similar to CUDA's.
+struct __half {
+  EIGEN_DEVICE_FUNC __half() : x(0) {}
+  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+  unsigned short x;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+
+// Conversion routines, including fallbacks for the host or older CUDA.
+// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
+// these in hardware. If we need more performance on older/other CPUs, they are
+// also possible to vectorize directly.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
+  __half h;
+  h.x = x;
+  return h;
+}
+
+union FP32 {
+  unsigned int u;
+  float f;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  return __float2half(ff);
+
+#elif defined(EIGEN_HAS_FP16_C)
+  __half h;
+  h.x = _cvtss_sh(ff, 0);
+  return h;
+
+#else
+  FP32 f; f.f = ff;
+
+  const FP32 f32infty = { 255 << 23 };
+  const FP32 f16max = { (127 + 16) << 23 };
+  const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+  unsigned int sign_mask = 0x80000000u;
+  __half o;
+  o.x = static_cast<unsigned short>(0x0u);
+
+  unsigned int sign = f.u & sign_mask;
+  f.u ^= sign;
+
+  // NOTE all the integer compares in this function can be safely
+  // compiled into signed compares since all operands are below
+  // 0x80000000. Important if you want fast straight SSE2 code
+  // (since there's no unsigned PCMPGTD).
+
+  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
+    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+  } else {  // (De)normalized number or zero
+    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
+      // use a magic value to align our 10 mantissa bits at the bottom of
+      // the float. as long as FP addition is round-to-nearest-even this
+      // just works.
+      f.f += denorm_magic.f;
+
+      // and one integer subtract of the bias later, we have our final float!
+      o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
+    } else {
+      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+
+      // update exponent, rounding bias part 1
+      f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
+      // rounding bias part 2
+      f.u += mant_odd;
+      // take the bits!
+      o.x = static_cast<unsigned short>(f.u >> 13);
+    }
+  }
+
+  o.x |= static_cast<unsigned short>(sign >> 16);
+  return o;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  return __half2float(h);
+
+#elif defined(EIGEN_HAS_FP16_C)
+  return _cvtsh_ss(h.x);
+
+#else
+  const FP32 magic = { 113 << 23 };
+  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+  FP32 o;
+
+  o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
+  unsigned int exp = shifted_exp & o.u;   // just the exponent
+  o.u += (127 - 15) << 23;                // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp) {     // Inf/NaN?
+    o.u += (128 - 16) << 23;    // extra exp adjust
+  } else if (exp == 0) {        // Zero/Denormal?
+    o.u += 1 << 23;             // extra exp adjust
+    o.f -= magic.f;             // renormalize
+  }
+
+  o.u |= (h.x & 0x8000) << 16;    // sign bit
+  return o.f;
+#endif
+}
+
+} // end namespace half_impl
+
+} // end namespace Eigen
+
+#endif // EIGEN_HALF_CUDA_H