/// Creates an unsigned division function that uses a combination of hardware division and
/// binary long division to divide integers wider than what hardware division alone can handle.
/// This function is intended for microarchitectures that have division hardware, but whose
/// multiplication hardware is not fast enough for `impl_trifecta` to be faster.
#[allow(unused_macros)]
macro_rules! impl_delegate {
(
$fn:ident, // name of the unsigned division function
$zero_div_fn:ident, // function called when division by zero is attempted
$half_normalization_shift:ident, // function for finding the normalization shift of $uX
$half_division:ident, // function for division of a $uX by a $uX
$n_h:expr, // the number of bits in $uH
$uH:ident, // unsigned integer with half the bit width of $uX
$uX:ident, // unsigned integer with half the bit width of $uD
$uD:ident, // unsigned integer type for the inputs and outputs of `$fn`
$iD:ident // signed integer type with the same bitwidth as `$uD`
) => {
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
/// tuple.
pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
// The two-possibility algorithm, the undersubtracting long division algorithm, or any
// kind of reciprocal-based algorithm will not be fastest, because they all involve large
// multiplications that we assume to not be fast enough relative to the divisions to
// outweigh setup times.
// the number of bits in a $uX
let n = $n_h * 2;
let duo_lo = duo as $uX;
let duo_hi = (duo >> n) as $uX;
let div_lo = div as $uX;
let div_hi = (div >> n) as $uX;
match (div_lo == 0, div_hi == 0, duo_hi == 0) {
(true, true, _) => $zero_div_fn(),
(_, false, true) => {
// `duo` < `div`, because `div_hi != 0` while `duo_hi == 0`
return (0, duo);
}
(false, true, true) => {
// delegate to smaller division
let tmp = $half_division(duo_lo, div_lo);
return (tmp.0 as $uD, tmp.1 as $uD);
}
(false, true, false) => {
if duo_hi < div_lo {
// `quo_hi` will always be 0. This performs a binary long division algorithm
// to zero `duo_hi`, followed by a half division.
// We can calculate the normalization shift using only `$uX` size functions.
// If we calculated the normalization shift using
// `$half_normalization_shift(duo_hi, div_lo, false)`, it would break the
// assumption the function has that the first argument is greater than the
// second. With the arguments switched, the assumption holds, since
// `duo_hi < div_lo`.
let norm_shift = $half_normalization_shift(div_lo, duo_hi, false);
let shl = if norm_shift == 0 {
// Consider what happens if the msbs of `duo_hi` and `div_lo` align with
// no shifting. The normalization shift will always return
// `norm_shift == 0` regardless of whether it is fully normalized,
// because `duo_hi < div_lo`. In that edge case, `n - norm_shift` would
// result in shift overflow down the line. Because `duo_hi < div_lo`
// holds while all the significant bits of `duo_hi` and `div` are being
// compared, we can use `shl = n - 1` instead.
n - 1
} else {
// We also cannot just use `shl = n - norm_shift - 1` in the general
// case, because when we are not in the edge case comparing all the
// significant bits, `duo < (div << (n - norm_shift))` may not hold,
// which would break the division algorithm.
n - norm_shift
};
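// Worked example (an illustrative note, not from the original comments): with
// `n == 64`, if the msb of `duo_hi` is at bit 60 and the msb of `div_lo` is at
// bit 62, then `norm_shift == 2` and `shl == 62`, which aligns the msb of
// `div << shl` with the msb of `duo` (both at bit 124). If instead both msbs
// are at bit 62, then `norm_shift == 0` and `shl` must be `n - 1 == 63`, since
// `shl == 64` would overflow the `1 << shl` used to initialize `pow_lo`.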
// The 3 variable restoring division algorithm (see binary_long.rs) is ideal
// for this task, since `pow` and `quo` can be `$uX` and the delegation
// check is simple.
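// Loop invariant sketch (an added note, with hypothetical names for
// illustration): at every iteration, `pow_lo == 1 << s` and
// `div == div_unshifted << s` for the current shift `s`, and
// `duo_entry == quo_lo * div_unshifted + duo`, where `duo_entry` and
// `div_unshifted` denote the values of `duo` and `div` before this loop.
// Setting bit `s` of `quo_lo` whenever the subtraction does not underflow
// therefore recovers the quotient one bit at a time.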
let mut div: $uD = div << shl;
let mut pow_lo: $uX = 1 << shl;
let mut quo_lo: $uX = 0;
let mut duo = duo;
loop {
let sub = duo.wrapping_sub(div);
if 0 <= (sub as $iD) {
duo = sub;
quo_lo |= pow_lo;
let duo_hi = (duo >> n) as $uX;
if duo_hi == 0 {
// Delegate to get the rest of the quotient. Note that the
// `div_lo` here is the original unshifted `div`.
let tmp = $half_division(duo as $uX, div_lo);
return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD);
}
}
div >>= 1;
pow_lo >>= 1;
}
} else if duo_hi == div_lo {
// `quo_hi == 1`. This branch is cheap and helps with edge cases. Since
// `duo_hi == div_lo` and `div_hi == 0`, we have `duo == (div << n) + duo_lo`,
// so the quotient is `(1 << n) + (duo_lo / div_lo)` with remainder
// `duo_lo % div_lo`.
let tmp = $half_division(duo as $uX, div as $uX);
return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD);
} else {
// `div_lo < duo_hi`
// `rem_hi == 0`, because the remainder is less than `div == div_lo`,
// which fits in `$uX`.
if (div_lo >> $n_h) == 0 {
// Short division of $uD by a $uH, using $uX by $uX division
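// Worked example of the idea in decimal (an illustrative note): dividing 987
// by 4 one digit at a time gives 9/4 = 2 r 1, then 18/4 = 4 r 2, then
// 27/4 = 6 r 3, for a quotient of 246 and remainder 3. The code below applies
// the same idea with base-2^$n_h digits, carrying each remainder into the
// next `$uX` by `$uX` hardware division.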
let div_0 = div_lo as $uH as $uX;
let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h);
let (quo_1, rem_2) = $half_division(duo_mid, div_0);
let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h);
let (quo_0, rem_1) = $half_division(duo_lo, div_0);
return (
(quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n),
rem_1 as $uD,
);
}
// This is basically a short division composed of a half division for the hi
// part, a specialized 3 variable binary long division in the middle, and
// another half division for the lo part.
let duo_lo = duo as $uX;
let tmp = $half_division(duo_hi, div_lo);
let quo_hi = tmp.0;
let mut duo = (duo_lo as $uD) | ((tmp.1 as $uD) << n);
// This check is required to avoid breaking the long division below.
if duo < div {
return ((quo_hi as $uD) << n, duo);
}
// The half division handled all shift alignments down to `n`, so this
// division can continue with a shift of `n - 1`. This works because the
// high half of `duo` is now the remainder `tmp.1 < div_lo`, so
// `duo < div << n` and no quotient bit at position `n` or above can be set.
let mut div: $uD = div << (n - 1);
let mut pow_lo: $uX = 1 << (n - 1);
let mut quo_lo: $uX = 0;
loop {
let sub = duo.wrapping_sub(div);
if 0 <= (sub as $iD) {
duo = sub;
quo_lo |= pow_lo;
let duo_hi = (duo >> n) as $uX;
if duo_hi == 0 {
// Delegate to get the rest of the quotient. Note that the
// `div_lo` here is the original unshifted `div`.
let tmp = $half_division(duo as $uX, div_lo);
return (
(tmp.0 as $uD) | (quo_lo as $uD) | ((quo_hi as $uD) << n),
tmp.1 as $uD,
);
}
}
div >>= 1;
pow_lo >>= 1;
}
}
}
(_, false, false) => {
// Full $uD by $uD binary long division. `quo_hi` will always be 0, because
// `div_hi != 0` means `div >= (1 << n)`, so the quotient must fit in `$uX`.
if duo < div {
return (0, duo);
}
let div_original = div;
let shl = $half_normalization_shift(duo_hi, div_hi, false);
let mut duo = duo;
let mut div: $uD = div << shl;
let mut pow_lo: $uX = 1 << shl;
let mut quo_lo: $uX = 0;
loop {
let sub = duo.wrapping_sub(div);
if 0 <= (sub as $iD) {
duo = sub;
quo_lo |= pow_lo;
if duo < div_original {
return (quo_lo as $uD, duo);
}
}
div >>= 1;
pow_lo >>= 1;
}
}
}
}
};
}
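// An illustrative invocation sketch (hypothetical identifiers; the real
// invocations live elsewhere in this crate, where per-target division
// functions are selected):
//
// impl_delegate!(
//     u64_div_rem_delegate, // $fn: the generated division function
//     zero_div_fn, // $zero_div_fn
//     u32_normalization_shift, // $half_normalization_shift
//     u32_by_u32_div_rem, // $half_division
//     16, // $n_h: the number of bits in a u16
//     u16, // $uH
//     u32, // $uX
//     u64, // $uD
//     i64 // $iD
// );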
public_test_dep! {
/// Returns `duo / div` and sets `*rem = duo % div`.
///
/// This specialization exists because:
/// - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
/// so we have to use an old-fashioned `&mut u128` argument to return the remainder.
/// - 64-bit SPARC does not have `u64 * u64 => u128` widening multiplication, which makes the
/// delegate algorithm strategy the only reasonably fast way to perform `u128` division.
// used on SPARC
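// Note (an added comment): the body below is a manual expansion of the
// `impl_delegate!` algorithm above for `$uD == u128`, `$uX == u64`, and
// `$uH == u32`, with the remainder returned through `rem` instead of a tuple.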
#[allow(dead_code)]
pub(crate) fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
use super::*;
let duo_lo = duo as u64;
let duo_hi = (duo >> 64) as u64;
let div_lo = div as u64;
let div_hi = (div >> 64) as u64;
match (div_lo == 0, div_hi == 0, duo_hi == 0) {
(true, true, _) => zero_div_fn(),
(_, false, true) => {
*rem = duo;
return 0;
}
(false, true, true) => {
let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
*rem = tmp.1 as u128;
return tmp.0 as u128;
}
(false, true, false) => {
if duo_hi < div_lo {
let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
let shl = if norm_shift == 0 {
64 - 1
} else {
64 - norm_shift
};
let mut div: u128 = div << shl;
let mut pow_lo: u64 = 1 << shl;
let mut quo_lo: u64 = 0;
let mut duo = duo;
loop {
let sub = duo.wrapping_sub(div);
if 0 <= (sub as i128) {
duo = sub;
quo_lo |= pow_lo;
let duo_hi = (duo >> 64) as u64;
if duo_hi == 0 {
let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
*rem = tmp.1 as u128;
return (quo_lo | tmp.0) as u128;
}
}
div >>= 1;
pow_lo >>= 1;
}
} else if duo_hi == div_lo {
let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
*rem = tmp.1 as u128;
return (1 << 64) | (tmp.0 as u128);
} else {
if (div_lo >> 32) == 0 {
let div_0 = div_lo as u32 as u64;
let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);
let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);
let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);
*rem = rem_1 as u128;
return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
}
let duo_lo = duo as u64;
let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
let quo_hi = tmp.0;
let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
if duo < div {
*rem = duo;
return (quo_hi as u128) << 64;
}
let mut div: u128 = div << (64 - 1);
let mut pow_lo: u64 = 1 << (64 - 1);
let mut quo_lo: u64 = 0;
loop {
let sub = duo.wrapping_sub(div);
if 0 <= (sub as i128) {
duo = sub;
quo_lo |= pow_lo;
let duo_hi = (duo >> 64) as u64;
if duo_hi == 0 {
let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
*rem = tmp.1 as u128;
return (tmp.0 as u128) | (quo_lo as u128) | ((quo_hi as u128) << 64);
}
}
div >>= 1;
pow_lo >>= 1;
}
}
}
(_, false, false) => {
if duo < div {
*rem = duo;
return 0;
}
let div_original = div;
let shl = u64_normalization_shift(duo_hi, div_hi, false);
let mut duo = duo;
let mut div: u128 = div << shl;
let mut pow_lo: u64 = 1 << shl;
let mut quo_lo: u64 = 0;
loop {
let sub = duo.wrapping_sub(div);
if 0 <= (sub as i128) {
duo = sub;
quo_lo |= pow_lo;
if duo < div_original {
*rem = duo;
return quo_lo as u128;
}
}
div >>= 1;
pow_lo >>= 1;
}
}
}
}
}
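// A minimal sanity-check sketch (an addition, not part of the original file).
// It assumes the standard test harness is available; the expected values come
// from the built-in `u128` division operators.
#[cfg(test)]
mod u128_divide_sparc_sketch {
    use super::u128_divide_sparc;

    #[test]
    fn matches_builtin_u128_division() {
        // Cases chosen to exercise the major match arms: delegation to u64
        // division, short division by a small divisor, `duo_hi < div_lo`,
        // `duo_hi == div_lo`, and full binary long division.
        let cases: [(u128, u128); 5] = [
            (123, 7),
            ((1 << 127) | 12345, 789),
            ((5 << 64) | 7, 1 << 63),
            ((99 << 64) | 5, 99),
            ((1 << 127) | 12345, (1 << 64) | 3),
        ];
        for &(duo, div) in cases.iter() {
            let mut rem = 0u128;
            let quo = u128_divide_sparc(duo, div, &mut rem);
            assert_eq!(quo, duo / div);
            assert_eq!(rem, duo % div);
        }
    }
}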