| //! Bit fiddling on positive IEEE 754 floats. Negative numbers aren't and needn't be handled. |
| //! Normal floating point numbers have a canonical representation as (frac, exp) such that the |
| //! value is 2<sup>exp</sup> * (1 + sum(frac[N-i] / 2<sup>i</sup>)) where N is the number of bits. |
| //! Subnormals are slightly different and weird, but the same principle applies. |
| //! |
| //! Here, however, we represent them as (sig, k) with f positive, such that the value is f * |
| //! 2<sup>e</sup>. Besides making the "hidden bit" explicit, this changes the exponent by the |
| //! so-called mantissa shift. |
| //! |
| //! Put another way, normally floats are written as (1) but here they are written as (2): |
| //! |
| //! 1. `1.101100...11 * 2^m` |
| //! 2. `1101100...11 * 2^n` |
| //! |
| //! We call (1) the **fractional representation** and (2) the **integral representation**. |
| //! |
| //! Many functions in this module only handle normal numbers. The dec2flt routines conservatively |
| //! take the universally-correct slow path (Algorithm M) for very small and very large numbers. |
| //! That algorithm needs only next_float() which does handle subnormals and zeros. |
| use crate::cmp::Ordering::{Less, Equal, Greater}; |
| use crate::convert::{TryFrom, TryInto}; |
| use crate::ops::{Add, Mul, Div, Neg}; |
| use crate::fmt::{Debug, LowerExp}; |
| use crate::num::diy_float::Fp; |
| use crate::num::FpCategory::{Infinite, Zero, Subnormal, Normal, Nan}; |
| use crate::num::FpCategory; |
| use crate::num::dec2flt::num::{self, Big}; |
| use crate::num::dec2flt::table; |
| |
| #[derive(Copy, Clone, Debug)] |
| pub struct Unpacked { |
| pub sig: u64, |
| pub k: i16, |
| } |
| |
| impl Unpacked { |
| pub fn new(sig: u64, k: i16) -> Self { |
| Unpacked { sig, k } |
| } |
| } |
| |
| /// A helper trait to avoid duplicating basically all the conversion code for `f32` and `f64`. |
| /// |
| /// See the parent module's doc comment for why this is necessary. |
| /// |
| /// Should **never ever** be implemented for other types or be used outside the dec2flt module. |
| pub trait RawFloat |
| : Copy |
| + Debug |
| + LowerExp |
| + Mul<Output=Self> |
| + Div<Output=Self> |
| + Neg<Output=Self> |
| { |
| const INFINITY: Self; |
| const NAN: Self; |
| const ZERO: Self; |
| |
| /// Type used by `to_bits` and `from_bits`. |
| type Bits: Add<Output = Self::Bits> + From<u8> + TryFrom<u64>; |
| |
| /// Performs a raw transmutation to an integer. |
| fn to_bits(self) -> Self::Bits; |
| |
| /// Performs a raw transmutation from an integer. |
| fn from_bits(v: Self::Bits) -> Self; |
| |
| /// Returns the category that this number falls into. |
| fn classify(self) -> FpCategory; |
| |
| /// Returns the mantissa, exponent and sign as integers. |
| fn integer_decode(self) -> (u64, i16, i8); |
| |
| /// Decodes the float. |
| fn unpack(self) -> Unpacked; |
| |
| /// Casts from a small integer that can be represented exactly. Panic if the integer can't be |
| /// represented, the other code in this module makes sure to never let that happen. |
| fn from_int(x: u64) -> Self; |
| |
| /// Gets the value 10<sup>e</sup> from a pre-computed table. |
| /// Panics for `e >= CEIL_LOG5_OF_MAX_SIG`. |
| fn short_fast_pow10(e: usize) -> Self; |
| |
| /// What the name says. It's easier to hard code than juggling intrinsics and |
| /// hoping LLVM constant folds it. |
| const CEIL_LOG5_OF_MAX_SIG: i16; |
| |
| // A conservative bound on the decimal digits of inputs that can't produce overflow or zero or |
| /// subnormals. Probably the decimal exponent of the maximum normal value, hence the name. |
| const MAX_NORMAL_DIGITS: usize; |
| |
| /// When the most significant decimal digit has a place value greater than this, the number |
| /// is certainly rounded to infinity. |
| const INF_CUTOFF: i64; |
| |
| /// When the most significant decimal digit has a place value less than this, the number |
| /// is certainly rounded to zero. |
| const ZERO_CUTOFF: i64; |
| |
| /// The number of bits in the exponent. |
| const EXP_BITS: u8; |
| |
| /// The number of bits in the significand, *including* the hidden bit. |
| const SIG_BITS: u8; |
| |
| /// The number of bits in the significand, *excluding* the hidden bit. |
| const EXPLICIT_SIG_BITS: u8; |
| |
| /// The maximum legal exponent in fractional representation. |
| const MAX_EXP: i16; |
| |
| /// The minimum legal exponent in fractional representation, excluding subnormals. |
| const MIN_EXP: i16; |
| |
| /// `MAX_EXP` for integral representation, i.e., with the shift applied. |
| const MAX_EXP_INT: i16; |
| |
| /// `MAX_EXP` encoded (i.e., with offset bias) |
| const MAX_ENCODED_EXP: i16; |
| |
| /// `MIN_EXP` for integral representation, i.e., with the shift applied. |
| const MIN_EXP_INT: i16; |
| |
| /// The maximum normalized significand in integral representation. |
| const MAX_SIG: u64; |
| |
| /// The minimal normalized significand in integral representation. |
| const MIN_SIG: u64; |
| } |
| |
| // Mostly a workaround for #34344. |
| macro_rules! other_constants { |
| ($type: ident) => { |
| const EXPLICIT_SIG_BITS: u8 = Self::SIG_BITS - 1; |
| const MAX_EXP: i16 = (1 << (Self::EXP_BITS - 1)) - 1; |
| const MIN_EXP: i16 = -Self::MAX_EXP + 1; |
| const MAX_EXP_INT: i16 = Self::MAX_EXP - (Self::SIG_BITS as i16 - 1); |
| const MAX_ENCODED_EXP: i16 = (1 << Self::EXP_BITS) - 1; |
| const MIN_EXP_INT: i16 = Self::MIN_EXP - (Self::SIG_BITS as i16 - 1); |
| const MAX_SIG: u64 = (1 << Self::SIG_BITS) - 1; |
| const MIN_SIG: u64 = 1 << (Self::SIG_BITS - 1); |
| |
| const INFINITY: Self = $crate::$type::INFINITY; |
| const NAN: Self = $crate::$type::NAN; |
| const ZERO: Self = 0.0; |
| } |
| } |
| |
| impl RawFloat for f32 { |
| type Bits = u32; |
| |
| const SIG_BITS: u8 = 24; |
| const EXP_BITS: u8 = 8; |
| const CEIL_LOG5_OF_MAX_SIG: i16 = 11; |
| const MAX_NORMAL_DIGITS: usize = 35; |
| const INF_CUTOFF: i64 = 40; |
| const ZERO_CUTOFF: i64 = -48; |
| other_constants!(f32); |
| |
| /// Returns the mantissa, exponent and sign as integers. |
| fn integer_decode(self) -> (u64, i16, i8) { |
| let bits = self.to_bits(); |
| let sign: i8 = if bits >> 31 == 0 { 1 } else { -1 }; |
| let mut exponent: i16 = ((bits >> 23) & 0xff) as i16; |
| let mantissa = if exponent == 0 { |
| (bits & 0x7fffff) << 1 |
| } else { |
| (bits & 0x7fffff) | 0x800000 |
| }; |
| // Exponent bias + mantissa shift |
| exponent -= 127 + 23; |
| (mantissa as u64, exponent, sign) |
| } |
| |
| fn unpack(self) -> Unpacked { |
| let (sig, exp, _sig) = self.integer_decode(); |
| Unpacked::new(sig, exp) |
| } |
| |
| fn from_int(x: u64) -> f32 { |
| // rkruppe is uncertain whether `as` rounds correctly on all platforms. |
| debug_assert!(x as f32 == fp_to_float(Fp { f: x, e: 0 })); |
| x as f32 |
| } |
| |
| fn short_fast_pow10(e: usize) -> Self { |
| table::F32_SHORT_POWERS[e] |
| } |
| |
| fn classify(self) -> FpCategory { self.classify() } |
| fn to_bits(self) -> Self::Bits { self.to_bits() } |
| fn from_bits(v: Self::Bits) -> Self { Self::from_bits(v) } |
| } |
| |
| |
| impl RawFloat for f64 { |
| type Bits = u64; |
| |
| const SIG_BITS: u8 = 53; |
| const EXP_BITS: u8 = 11; |
| const CEIL_LOG5_OF_MAX_SIG: i16 = 23; |
| const MAX_NORMAL_DIGITS: usize = 305; |
| const INF_CUTOFF: i64 = 310; |
| const ZERO_CUTOFF: i64 = -326; |
| other_constants!(f64); |
| |
| /// Returns the mantissa, exponent and sign as integers. |
| fn integer_decode(self) -> (u64, i16, i8) { |
| let bits = self.to_bits(); |
| let sign: i8 = if bits >> 63 == 0 { 1 } else { -1 }; |
| let mut exponent: i16 = ((bits >> 52) & 0x7ff) as i16; |
| let mantissa = if exponent == 0 { |
| (bits & 0xfffffffffffff) << 1 |
| } else { |
| (bits & 0xfffffffffffff) | 0x10000000000000 |
| }; |
| // Exponent bias + mantissa shift |
| exponent -= 1023 + 52; |
| (mantissa, exponent, sign) |
| } |
| |
| fn unpack(self) -> Unpacked { |
| let (sig, exp, _sig) = self.integer_decode(); |
| Unpacked::new(sig, exp) |
| } |
| |
| fn from_int(x: u64) -> f64 { |
| // rkruppe is uncertain whether `as` rounds correctly on all platforms. |
| debug_assert!(x as f64 == fp_to_float(Fp { f: x, e: 0 })); |
| x as f64 |
| } |
| |
| fn short_fast_pow10(e: usize) -> Self { |
| table::F64_SHORT_POWERS[e] |
| } |
| |
| fn classify(self) -> FpCategory { self.classify() } |
| fn to_bits(self) -> Self::Bits { self.to_bits() } |
| fn from_bits(v: Self::Bits) -> Self { Self::from_bits(v) } |
| } |
| |
| /// Converts an `Fp` to the closest machine float type. |
| /// Does not handle subnormal results. |
| pub fn fp_to_float<T: RawFloat>(x: Fp) -> T { |
| let x = x.normalize(); |
| // x.f is 64 bit, so x.e has a mantissa shift of 63 |
| let e = x.e + 63; |
| if e > T::MAX_EXP { |
| panic!("fp_to_float: exponent {} too large", e) |
| } else if e > T::MIN_EXP { |
| encode_normal(round_normal::<T>(x)) |
| } else { |
| panic!("fp_to_float: exponent {} too small", e) |
| } |
| } |
| |
| /// Round the 64-bit significand to T::SIG_BITS bits with half-to-even. |
| /// Does not handle exponent overflow. |
| pub fn round_normal<T: RawFloat>(x: Fp) -> Unpacked { |
| let excess = 64 - T::SIG_BITS as i16; |
| let half: u64 = 1 << (excess - 1); |
| let (q, rem) = (x.f >> excess, x.f & ((1 << excess) - 1)); |
| assert_eq!(q << excess | rem, x.f); |
| // Adjust mantissa shift |
| let k = x.e + excess; |
| if rem < half { |
| Unpacked::new(q, k) |
| } else if rem == half && (q % 2) == 0 { |
| Unpacked::new(q, k) |
| } else if q == T::MAX_SIG { |
| Unpacked::new(T::MIN_SIG, k + 1) |
| } else { |
| Unpacked::new(q + 1, k) |
| } |
| } |
| |
| /// Inverse of `RawFloat::unpack()` for normalized numbers. |
| /// Panics if the significand or exponent are not valid for normalized numbers. |
| pub fn encode_normal<T: RawFloat>(x: Unpacked) -> T { |
| debug_assert!(T::MIN_SIG <= x.sig && x.sig <= T::MAX_SIG, |
| "encode_normal: significand not normalized"); |
| // Remove the hidden bit |
| let sig_enc = x.sig & !(1 << T::EXPLICIT_SIG_BITS); |
| // Adjust the exponent for exponent bias and mantissa shift |
| let k_enc = x.k + T::MAX_EXP + T::EXPLICIT_SIG_BITS as i16; |
| debug_assert!(k_enc != 0 && k_enc < T::MAX_ENCODED_EXP, |
| "encode_normal: exponent out of range"); |
| // Leave sign bit at 0 ("+"), our numbers are all positive |
| let bits = (k_enc as u64) << T::EXPLICIT_SIG_BITS | sig_enc; |
| T::from_bits(bits.try_into().unwrap_or_else(|_| unreachable!())) |
| } |
| |
| /// Construct a subnormal. A mantissa of 0 is allowed and constructs zero. |
| pub fn encode_subnormal<T: RawFloat>(significand: u64) -> T { |
| assert!(significand < T::MIN_SIG, "encode_subnormal: not actually subnormal"); |
| // Encoded exponent is 0, the sign bit is 0, so we just have to reinterpret the bits. |
| T::from_bits(significand.try_into().unwrap_or_else(|_| unreachable!())) |
| } |
| |
| /// Approximate a bignum with an Fp. Rounds within 0.5 ULP with half-to-even. |
| pub fn big_to_fp(f: &Big) -> Fp { |
| let end = f.bit_length(); |
| assert!(end != 0, "big_to_fp: unexpectedly, input is zero"); |
| let start = end.saturating_sub(64); |
| let leading = num::get_bits(f, start, end); |
| // We cut off all bits prior to the index `start`, i.e., we effectively right-shift by |
| // an amount of `start`, so this is also the exponent we need. |
| let e = start as i16; |
| let rounded_down = Fp { f: leading, e }.normalize(); |
| // Round (half-to-even) depending on the truncated bits. |
| match num::compare_with_half_ulp(f, start) { |
| Less => rounded_down, |
| Equal if leading % 2 == 0 => rounded_down, |
| Equal | Greater => match leading.checked_add(1) { |
| Some(f) => Fp { f, e }.normalize(), |
| None => Fp { f: 1 << 63, e: e + 1 }, |
| } |
| } |
| } |
| |
| /// Finds the largest floating point number strictly smaller than the argument. |
| /// Does not handle subnormals, zero, or exponent underflow. |
| pub fn prev_float<T: RawFloat>(x: T) -> T { |
| match x.classify() { |
| Infinite => panic!("prev_float: argument is infinite"), |
| Nan => panic!("prev_float: argument is NaN"), |
| Subnormal => panic!("prev_float: argument is subnormal"), |
| Zero => panic!("prev_float: argument is zero"), |
| Normal => { |
| let Unpacked { sig, k } = x.unpack(); |
| if sig == T::MIN_SIG { |
| encode_normal(Unpacked::new(T::MAX_SIG, k - 1)) |
| } else { |
| encode_normal(Unpacked::new(sig - 1, k)) |
| } |
| } |
| } |
| } |
| |
| // Find the smallest floating point number strictly larger than the argument. |
| // This operation is saturating, i.e., next_float(inf) == inf. |
| // Unlike most code in this module, this function does handle zero, subnormals, and infinities. |
| // However, like all other code here, it does not deal with NaN and negative numbers. |
| pub fn next_float<T: RawFloat>(x: T) -> T { |
| match x.classify() { |
| Nan => panic!("next_float: argument is NaN"), |
| Infinite => T::INFINITY, |
| // This seems too good to be true, but it works. |
| // 0.0 is encoded as the all-zero word. Subnormals are 0x000m...m where m is the mantissa. |
| // In particular, the smallest subnormal is 0x0...01 and the largest is 0x000F...F. |
| // The smallest normal number is 0x0010...0, so this corner case works as well. |
| // If the increment overflows the mantissa, the carry bit increments the exponent as we |
| // want, and the mantissa bits become zero. Because of the hidden bit convention, this |
| // too is exactly what we want! |
| // Finally, f64::MAX + 1 = 7eff...f + 1 = 7ff0...0 = f64::INFINITY. |
| Zero | Subnormal | Normal => { |
| T::from_bits(x.to_bits() + T::Bits::from(1u8)) |
| } |
| } |
| } |