| // crate minimums: sse2, x86_64 |
| |
| use core::arch::x86_64::{__m128i, __m256i}; |
| use crate::types::*; |
| |
| mod sse2; |
| |
| #[derive(Copy, Clone)] |
| pub struct YesS3; |
| #[derive(Copy, Clone)] |
| pub struct NoS3; |
| |
| #[derive(Copy, Clone)] |
| pub struct YesS4; |
| #[derive(Copy, Clone)] |
| pub struct NoS4; |
| |
| #[derive(Copy, Clone)] |
| pub struct YesA1; |
| #[derive(Copy, Clone)] |
| pub struct NoA1; |
| |
| #[derive(Copy, Clone)] |
| pub struct YesA2; |
| #[derive(Copy, Clone)] |
| pub struct NoA2; |
| |
| #[derive(Copy, Clone)] |
| pub struct YesNI; |
| #[derive(Copy, Clone)] |
| pub struct NoNI; |
| |
| use core::marker::PhantomData; |
| |
| #[derive(Copy, Clone)] |
| pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>); |
| impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI> |
| where |
| sse2::u128x1_sse2<S3, S4, NI>: Swap64, |
| sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, |
| sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, |
| sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4, |
| sse2::u128x1_sse2<S3, S4, NI>: BSwap, |
| sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>, |
| sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>, |
| sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>, |
| sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>, |
| sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>, |
| { |
| type u32x4 = sse2::u32x4_sse2<S3, S4, NI>; |
| type u64x2 = sse2::u64x2_sse2<S3, S4, NI>; |
| type u128x1 = sse2::u128x1_sse2<S3, S4, NI>; |
| |
| type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>; |
| type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>; |
| type u64x4 = sse2::u64x4_sse2<S3, S4, NI>; |
| type u128x2 = sse2::u128x2_sse2<S3, S4, NI>; |
| |
| type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>; |
| type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>; |
| type u128x4 = sse2::u128x4_sse2<S3, S4, NI>; |
| |
| #[inline(always)] |
| unsafe fn instance() -> Self { |
| SseMachine(PhantomData) |
| } |
| } |
| |
| #[derive(Copy, Clone)] |
| pub struct Avx2Machine<NI>(PhantomData<NI>); |
| impl<NI: Copy> Machine for Avx2Machine<NI> |
| where |
| sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64, |
| sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, |
| sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, |
| sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4, |
| { |
| type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>; |
| type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>; |
| type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>; |
| |
| type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>; |
| type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>; |
| type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>; |
| type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>; |
| |
| type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>; |
| type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>; |
| type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>; |
| |
| #[inline(always)] |
| unsafe fn instance() -> Self { |
| Avx2Machine(PhantomData) |
| } |
| } |
| |
| pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>; |
| pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>; |
| pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>; |
| /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything |
| /// to avoid expensive SSE/VEX conflicts. |
| pub type AVX = SseMachine<YesS3, YesS4, NoNI>; |
| pub type AVX2 = Avx2Machine<NoNI>; |
| |
| /// Generic wrapper for unparameterized storage of any of the possible impls. |
| /// Converting into and out of this type should be essentially free, although it may be more |
| /// aligned than a particular impl requires. |
| #[allow(non_camel_case_types)] |
| #[derive(Copy, Clone)] |
| pub union vec128_storage { |
| u32x4: [u32; 4], |
| u64x2: [u64; 2], |
| u128x1: [u128; 1], |
| sse2: __m128i, |
| } |
| impl Store<vec128_storage> for vec128_storage { |
| #[inline(always)] |
| unsafe fn unpack(p: vec128_storage) -> Self { |
| p |
| } |
| } |
| impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage { |
| #[inline(always)] |
| fn into(self) -> &'a [u32; 4] { |
| unsafe { &self.u32x4 } |
| } |
| } |
| impl Into<vec128_storage> for [u32; 4] { |
| #[inline(always)] |
| fn into(self) -> vec128_storage { |
| vec128_storage { u32x4: self } |
| } |
| } |
| impl Default for vec128_storage { |
| #[inline(always)] |
| fn default() -> Self { |
| vec128_storage { u128x1: [0] } |
| } |
| } |
| impl Eq for vec128_storage {} |
| impl PartialEq for vec128_storage { |
| #[inline(always)] |
| fn eq(&self, rhs: &Self) -> bool { |
| unsafe { self.u128x1 == rhs.u128x1 } |
| } |
| } |
| |
| #[allow(non_camel_case_types)] |
| #[derive(Copy, Clone)] |
| pub union vec256_storage { |
| u32x8: [u32; 8], |
| u64x4: [u64; 4], |
| u128x2: [u128; 2], |
| sse2: [vec128_storage; 2], |
| avx: __m256i, |
| } |
| impl Into<vec256_storage> for [u64; 4] { |
| #[inline(always)] |
| fn into(self) -> vec256_storage { |
| vec256_storage { u64x4: self } |
| } |
| } |
| impl Default for vec256_storage { |
| #[inline(always)] |
| fn default() -> Self { |
| vec256_storage { u128x2: [0, 0] } |
| } |
| } |
| impl vec256_storage { |
| pub fn new128(xs: [vec128_storage; 2]) -> Self { |
| Self { sse2: xs } |
| } |
| pub fn split128(self) -> [vec128_storage; 2] { |
| unsafe { self.sse2 } |
| } |
| } |
| impl Eq for vec256_storage {} |
| impl PartialEq for vec256_storage { |
| #[inline(always)] |
| fn eq(&self, rhs: &Self) -> bool { |
| unsafe { self.sse2 == rhs.sse2 } |
| } |
| } |
| |
| #[allow(non_camel_case_types)] |
| #[derive(Copy, Clone)] |
| pub union vec512_storage { |
| u32x16: [u32; 16], |
| u64x8: [u64; 8], |
| u128x4: [u128; 4], |
| sse2: [vec128_storage; 4], |
| avx: [vec256_storage; 2], |
| } |
| impl Default for vec512_storage { |
| #[inline(always)] |
| fn default() -> Self { |
| vec512_storage { |
| u128x4: [0, 0, 0, 0], |
| } |
| } |
| } |
| impl vec512_storage { |
| pub fn new128(xs: [vec128_storage; 4]) -> Self { |
| Self { sse2: xs } |
| } |
| pub fn split128(self) -> [vec128_storage; 4] { |
| unsafe { self.sse2 } |
| } |
| } |
| impl Eq for vec512_storage {} |
| impl PartialEq for vec512_storage { |
| #[inline(always)] |
| fn eq(&self, rhs: &Self) -> bool { |
| unsafe { self.avx == rhs.avx } |
| } |
| } |
| |
| macro_rules! impl_into { |
| ($storage:ident, $array:ty, $name:ident) => { |
| impl Into<$array> for $storage { |
| #[inline(always)] |
| fn into(self) -> $array { |
| unsafe { self.$name } |
| } |
| } |
| }; |
| } |
| impl_into!(vec128_storage, [u32; 4], u32x4); |
| impl_into!(vec128_storage, [u64; 2], u64x2); |
| impl_into!(vec128_storage, [u128; 1], u128x1); |
| impl_into!(vec256_storage, [u32; 8], u32x8); |
| impl_into!(vec256_storage, [u64; 4], u64x4); |
| impl_into!(vec256_storage, [u128; 2], u128x2); |
| impl_into!(vec512_storage, [u32; 16], u32x16); |
| impl_into!(vec512_storage, [u64; 8], u64x8); |
| impl_into!(vec512_storage, [u128; 4], u128x4); |
| |
| /// Generate the full set of optimized implementations to take advantage of the most important |
| /// hardware feature sets. |
| /// |
| /// This dispatcher is suitable for maximizing throughput. |
| #[macro_export] |
| macro_rules! dispatch { |
| ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { |
| #[cfg(feature = "std")] |
| $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { |
| #[inline(always)] |
| fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body |
| use std::arch::x86_64::*; |
| #[target_feature(enable = "avx2")] |
| unsafe fn impl_avx2($($arg: $argty),*) -> $ret { |
| let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*); |
| _mm256_zeroupper(); |
| ret |
| } |
| #[target_feature(enable = "avx")] |
| #[target_feature(enable = "sse4.1")] |
| #[target_feature(enable = "ssse3")] |
| unsafe fn impl_avx($($arg: $argty),*) -> $ret { |
| let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*); |
| _mm256_zeroupper(); |
| ret |
| } |
| #[target_feature(enable = "sse4.1")] |
| #[target_feature(enable = "ssse3")] |
| unsafe fn impl_sse41($($arg: $argty),*) -> $ret { |
| fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) |
| } |
| #[target_feature(enable = "ssse3")] |
| unsafe fn impl_ssse3($($arg: $argty),*) -> $ret { |
| fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) |
| } |
| #[target_feature(enable = "sse2")] |
| unsafe fn impl_sse2($($arg: $argty),*) -> $ret { |
| fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) |
| } |
| unsafe { |
| if is_x86_feature_detected!("avx2") { |
| impl_avx2($($arg),*) |
| } else if is_x86_feature_detected!("avx") { |
| impl_avx($($arg),*) |
| } else if is_x86_feature_detected!("sse4.1") { |
| impl_sse41($($arg),*) |
| } else if is_x86_feature_detected!("ssse3") { |
| impl_ssse3($($arg),*) |
| } else if is_x86_feature_detected!("sse2") { |
| impl_sse2($($arg),*) |
| } else { |
| unimplemented!() |
| } |
| } |
| } |
| #[cfg(not(feature = "std"))] |
| #[inline(always)] |
| $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { |
| unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body |
| unsafe { |
| if cfg!(target_feature = "avx2") { |
| fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) |
| } else if cfg!(target_feature = "avx") { |
| fn_impl($crate::x86_64::AVX::instance(), $($arg),*) |
| } else if cfg!(target_feature = "sse4.1") { |
| fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) |
| } else if cfg!(target_feature = "ssse3") { |
| fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) |
| } else { |
| fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) |
| } |
| } |
| } |
| }; |
| ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { |
| dispatch!($mach, $MTy, { |
| $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body |
| }); |
| } |
| } |
| |
| /// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit |
| /// vectors on this platfrom. For x86-64, that would mean SSE2 and AVX. |
| /// |
| /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware |
| /// features (e.g. because they are done infrequently), so minimizing their contribution to code |
| /// size is more important. |
| #[macro_export] |
| macro_rules! dispatch_light128 { |
| ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { |
| #[cfg(feature = "std")] |
| $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret { |
| #[inline(always)] |
| fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body |
| use std::arch::x86_64::*; |
| #[target_feature(enable = "avx")] |
| unsafe fn impl_avx($($arg: $argty),*) -> $ret { |
| fn_impl($crate::x86_64::AVX::instance(), $($arg),*) |
| } |
| #[target_feature(enable = "sse2")] |
| unsafe fn impl_sse2($($arg: $argty),*) -> $ret { |
| fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) |
| } |
| unsafe { |
| if is_x86_feature_detected!("avx") { |
| impl_avx($($arg),*) |
| } else if is_x86_feature_detected!("sse2") { |
| impl_sse2($($arg),*) |
| } else { |
| unimplemented!() |
| } |
| } |
| } |
| #[cfg(not(feature = "std"))] |
| #[inline(always)] |
| $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { |
| unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body |
| unsafe { |
| if cfg!(target_feature = "avx2") { |
| fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) |
| } else if cfg!(target_feature = "avx") { |
| fn_impl($crate::x86_64::AVX::instance(), $($arg),*) |
| } else if cfg!(target_feature = "sse4.1") { |
| fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) |
| } else if cfg!(target_feature = "ssse3") { |
| fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) |
| } else { |
| fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) |
| } |
| } |
| } |
| }; |
| ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { |
| dispatch_light128!($mach, $MTy, { |
| $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body |
| }); |
| } |
| } |
| |
| /// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit |
| /// vectors on this platfrom. For x86-64, that would mean SSE2, AVX, and AVX2. |
| /// |
| /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware |
| /// features (e.g. because they are done infrequently), so minimizing their contribution to code |
| /// size is more important. |
| #[macro_export] |
| macro_rules! dispatch_light256 { |
| ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { |
| #[cfg(feature = "std")] |
| $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> $ret { |
| #[inline(always)] |
| fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body |
| use std::arch::x86_64::*; |
| #[target_feature(enable = "avx")] |
| unsafe fn impl_avx($($arg: $argty),*) -> $ret { |
| fn_impl($crate::x86_64::AVX::instance(), $($arg),*) |
| } |
| #[target_feature(enable = "sse2")] |
| unsafe fn impl_sse2($($arg: $argty),*) -> $ret { |
| fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) |
| } |
| unsafe { |
| if is_x86_feature_detected!("avx") { |
| impl_avx($($arg),*) |
| } else if is_x86_feature_detected!("sse2") { |
| impl_sse2($($arg),*) |
| } else { |
| unimplemented!() |
| } |
| } |
| } |
| #[cfg(not(feature = "std"))] |
| #[inline(always)] |
| $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { |
| unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body |
| unsafe { |
| if cfg!(target_feature = "avx2") { |
| fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) |
| } else if cfg!(target_feature = "avx") { |
| fn_impl($crate::x86_64::AVX::instance(), $($arg),*) |
| } else if cfg!(target_feature = "sse4.1") { |
| fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) |
| } else if cfg!(target_feature = "ssse3") { |
| fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) |
| } else { |
| fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) |
| } |
| } |
| } |
| }; |
| ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { |
| dispatch_light256!($mach, $MTy, { |
| $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body |
| }); |
| } |
| } |