2x faster FP32->IEEE FP16 conversion

Additionally, the conversion now correctly handles NaN inputs on PNaCl
and Emscripten/Asm.js.
diff --git a/include/fp16/fp16.h b/include/fp16/fp16.h
index ab10fec..6fde4ca 100644
--- a/include/fp16/fp16.h
+++ b/include/fp16/fp16.h
@@ -207,24 +207,24 @@
  * floating-point operations and bitcasts between integer and floating-point variables.
  */
 static inline uint16_t fp16_ieee_from_fp32_value(float f) {
-	float base = fabsf(f);
 	const float scale_to_inf = 0x1.0p+112f;
-	base *= scale_to_inf;
-	const float scale_to_zero = 0x1.0p-112f * 0x1.0p+2f;
-	base *= scale_to_zero;
-	if (!(base == base)) {
-		base = nanf("0x200");
+	const float scale_to_zero = 0x1.0p-110f;
+	float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+	const uint32_t w = fp32_to_bits(f);
+	const uint32_t shl1_w = w + w;
+	const uint32_t sign = w & UINT32_C(0x80000000);
+	uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+	if (bias < UINT32_C(0x71000000)) {
+		bias = UINT32_C(0x71000000);
 	}
 
-	const uint32_t sign = fp32_to_bits(f) & UINT32_C(0x80000000);
-	float bias = f * (0x1.0p+23f * 0x1.0p-10f * 0x1.0p+2f);
-	bias = fp32_from_bits(fp32_to_bits(bias) & UINT32_C(0x7F800000));
-	if (bias < (0x1p-1f * 0x1.0p+2f)) {
-		bias = (0x1p-1f * 0x1.0p+2f);
-	}
-	bias += base;
-	const uint32_t exp_f = fp32_to_bits(bias) >> 13;// - (((0x7F - 0xF) + (23 - 10 + 1 + 2)) << 10);
-	return (sign >> 16) | ((exp_f & UINT32_C(0x00007C00)) + (fp32_to_bits(bias) & UINT32_C(0x00000FFF)));
+	base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+	const uint32_t bits = fp32_to_bits(base);
+	const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+	const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+	const uint32_t nonsign = exp_bits + mantissa_bits;
+	return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 }
 
 /*