// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
#![allow(unused_unsafe)]
// The functions are complex with many branches, and explicit
// `return`s make it clear where function exit points are
#![allow(clippy::needless_return)]
#![allow(clippy::comparison_chain)]
// Clippy is confused by the complex configuration
#![allow(clippy::if_same_then_else)]
#![allow(clippy::needless_bool)]
//! This `specialized_div_rem` module is originally from version 1.0.0 of the
//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
//! module, since unoptimized compilation may generate references to `memcpy`.
//!
//! The purpose of these macros is to easily change both the division algorithm used
//! for a given integer size and the half division used by that algorithm. The way
//! functions call each other is also constructed such that linkers will find the chain of
//! software and hardware divisions needed for every size of signed and unsigned division.
//! For example, most target compilations do the following:
//!
//! - Many 128 bit division functions like `u128::wrapping_div` use
//!   `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
//!   is not a 128 bit by 128 bit hardware division function in most architectures.
//!   `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
//!   `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
//!   one function to calculate both the quotient and remainder; see the sketch at the
//!   end of this comment). If configuration flags enable it, `impl_trifecta!` defines
//!   `u128_div_rem` to use the trifecta algorithm, which requires the half sized
//!   division `u64_by_u64_div_rem`. If the architecture supplies a 64 bit hardware
//!   division instruction, `u64_by_u64_div_rem` will be reduced to those instructions.
//!   Note that we do not specify the half sized division directly as `__udivdi3`,
//!   because then hardware division would never be introduced.
//! - If the architecture does not supply a 64 bit hardware division instruction, u64
//!   divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
//!   which is defined by `impl_delegate!`. The half division for this algorithm is
//!   `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
//!   software division algorithms.
//! - If the architecture does not supply a 32 bit hardware division instruction, linkers
//!   will look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no
//!   half division, so the chain of calls ends here.
//!
//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
//! extend the 128 by 64 bit division to a full 128 by 128 bit division.
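//!
//! As a minimal sketch (an assumption about the shape of the surrounding
//! `compiler-builtins` wrappers, not code from this module), the single
//! quotient-and-remainder function can serve the separate 128 bit entry points
//! roughly like this:
//!
//! ```ignore
//! fn __udivti3(a: u128, b: u128) -> u128 {
//!     u128_div_rem(a, b).0
//! }
//! fn __umodti3(a: u128, b: u128) -> u128 {
//!     u128_div_rem(a, b).1
//! }
//! ```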
// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
// ridiculously complex
#[macro_use]
mod norm_shift;
#[macro_use]
mod binary_long;
#[macro_use]
mod delegate;
// used on SPARC
#[allow(unused_imports)]
#[cfg(not(feature = "public-test-deps"))]
pub(crate) use self::delegate::u128_divide_sparc;
#[cfg(feature = "public-test-deps")]
pub use self::delegate::u128_divide_sparc;
#[macro_use]
mod trifecta;
#[macro_use]
mod asymmetric;
/// The behavior of all divisions by zero is controlled by this function. This function should be
/// impossible for Rust users to reach, unless the public `compiler-builtins` division functions or
/// `core`/`std`'s `unchecked_div`/`unchecked_rem` are used directly without a zero check in front.
fn zero_div_fn() -> ! {
    // Calling the intrinsic directly, to avoid the `assert_unsafe_precondition` that cannot be
    // used here because it involves non-`inline` functions
    // (https://github.com/rust-lang/compiler-builtins/issues/491).
    unsafe { core::intrinsics::unreachable() }
}
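// A minimal sketch (illustration only, not code from `core`/`std` or from this crate) of the
// kind of zero check that normally sits in front of these division functions, which is why
// `zero_div_fn` above should be unreachable in practice:
//
//     fn user_facing_div_rem(duo: u128, div: u128) -> (u128, u128) {
//         if div == 0 {
//             // user facing code panics here instead of ever reaching `zero_div_fn`
//             panic!("attempt to divide by zero");
//         }
//         u128_div_rem(duo, div)
//     }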
const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8m.base` and `thumbv8m.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The 'Zbb' Basic Bit-Manipulation extension on RISC-V
        // determines if a CLZ assembly instruction exists.
        cfg!(target_feature = "zbb")
    } else {
        // All other common targets Rust supports should have CLZ instructions.
        true
    }
};
impl_normalization_shift!(
    u32_normalization_shift,
    USE_LZ,
    32,
    u32,
    i32,
    allow(dead_code)
);
impl_normalization_shift!(
    u64_normalization_shift,
    USE_LZ,
    64,
    u64,
    i64,
    allow(dead_code)
);
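// A minimal sketch (an assumption for illustration, not the `impl_normalization_shift!`
// expansion) of what a leading-zeros based normalization shift computes when `USE_LZ` is true:
// the largest left shift of `div` that keeps it no greater than `duo`, assuming the caller has
// already handled `div == 0` and `duo < div`:
//
//     fn normalization_shift_sketch(duo: u64, div: u64) -> u32 {
//         // align the most significant set bit of `div` with that of `duo`
//         let mut shl = div.leading_zeros() - duo.leading_zeros();
//         if duo < (div << shl) {
//             // aligning the bits overshot `duo`, so back off by one
//             shl -= 1;
//         }
//         shl
//     }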
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}
// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64.
#[cfg(all(
    not(any(target_pointer_width = "16", target_pointer_width = "32")),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);
// If the pointer width is less than 64, then the target architecture almost certainly does not
// have the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster (see the
// sketch after the following `impl_delegate!` invocation).
#[cfg(all(
    any(target_pointer_width = "16", target_pointer_width = "32"),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
    u128_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128,
    i128
);
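// The 64 to 128 bit widening multiplication mentioned above is, in Rust terms, just a multiply
// after widening casts; whether the target can do this quickly is what decides between
// `trifecta` and `delegate`. A minimal sketch (illustration only, with a hypothetical helper
// name):
//
//     fn widening_mul_sketch(a: u64, b: u64) -> (u64, u64) {
//         let wide = (a as u128) * (b as u128);
//         // (low half, high half) of the full 128 bit product
//         (wide as u64, (wide >> 64) as u64)
//     }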
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
// use `asymmetric` instead of `trifecta` on x86_64
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
impl_asymmetric!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    u128_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);
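// A minimal sketch (an assumption for illustration, not the `impl_asymmetric!` expansion) of
// how a 128 by 64 bit hardware division can be extended to the case where `div` fits in 64 bits
// but `duo` does not: divide the high half first, so that the remainder of that step keeps the
// second quotient within 64 bits, which is exactly the safety condition of
// `u128_by_u64_div_rem`:
//
//     fn asymmetric_sketch(duo: u128, div: u64) -> (u128, u64) {
//         assert!(div != 0);
//         let duo_hi = (duo >> 64) as u64;
//         let duo_lo = duo as u64;
//         // high limb: an ordinary 64 by 64 division
//         let (quo_hi, rem_hi) = (duo_hi / div, duo_hi % div);
//         // `rem_hi < div` guarantees the next quotient fits in a u64
//         let (quo_lo, rem_lo) =
//             unsafe { u128_by_u64_div_rem(((rem_hi as u128) << 64) | (duo_lo as u128), div) };
//         (((quo_hi as u128) << 64) | (quo_lo as u128), rem_lo)
//     }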
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
#[allow(dead_code)]
fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}
// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
// than register size.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    not(target_pointer_width = "64")
))]
impl_delegate!(
    u64_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    u32_by_u32_div_rem,
    16,
    u16,
    u32,
    u64,
    i64
);
// When not on x86 and the pointer width is 64, use `binary_long`.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    target_pointer_width = "64"
))]
impl_binary_long!(
    u64_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    64,
    u64,
    i64
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
// use `asymmetric` instead of `delegate` on x86
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
impl_asymmetric!(
    u64_div_rem,
    zero_div_fn,
    u32_by_u32_div_rem,
    u64_by_u32_div_rem,
    16,
    u16,
    u32,
    u64
);
// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
impl_binary_long!(
    u32_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    32,
    u32,
    i32,
    allow(dead_code)
);
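// A minimal sketch (an assumption for illustration, not the `impl_binary_long!` expansion) of
// the shift-and-subtract idea behind binary long division, which needs only the normalization
// shift and no half sized division:
//
//     fn binary_long_sketch(mut duo: u32, div: u32) -> (u32, u32) {
//         assert!(div != 0);
//         if duo < div {
//             return (0, duo);
//         }
//         // align the most significant set bit of `div` with that of `duo`
//         let mut shl = div.leading_zeros() - duo.leading_zeros();
//         let mut quo = 0;
//         loop {
//             let sub = div << shl;
//             if duo >= sub {
//                 duo -= sub;
//                 quo |= 1 << shl;
//             }
//             if shl == 0 {
//                 return (quo, duo);
//             }
//             shl -= 1;
//         }
//     }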