// src/x86_64/mod.rs - platform/external/rust/crates/ppv-lite86 - Git at Google

 // crate minimums: sse2, x86_64

 use core::arch::x86_64::{__m128i, __m256i};
 use crate::types::*;

 mod sse2;

 /// Type-level flag: the `S3` feature slot is on (as in the `SSSE3`, `SSE41` and `AVX` aliases).
 #[derive(Clone, Copy)]
 pub struct YesS3;
 /// Type-level flag: the `S3` feature slot is off (as in the `SSE2` alias).
 #[derive(Clone, Copy)]
 pub struct NoS3;

 /// Type-level flag: the `S4` feature slot is on (as in the `SSE41` and `AVX` aliases).
 #[derive(Clone, Copy)]
 pub struct YesS4;
 /// Type-level flag: the `S4` feature slot is off (as in the `SSE2` and `SSSE3` aliases).
 #[derive(Clone, Copy)]
 pub struct NoS4;

 /// Type-level flag: an `A1` feature slot is on.
 /// NOTE(review): presumably AVX; not referenced by this module's own code -- confirm.
 #[derive(Clone, Copy)]
 pub struct YesA1;
 /// Type-level flag: an `A1` feature slot is off.
 #[derive(Clone, Copy)]
 pub struct NoA1;

 /// Type-level flag: an `A2` feature slot is on.
 /// NOTE(review): presumably AVX2; not referenced by this module's own code -- confirm.
 #[derive(Clone, Copy)]
 pub struct YesA2;
 /// Type-level flag: an `A2` feature slot is off.
 #[derive(Clone, Copy)]
 pub struct NoA2;

 /// Type-level flag: the `NI` feature slot is on.
 /// NOTE(review): the meaning of `NI` is not shown in this module -- confirm.
 #[derive(Clone, Copy)]
 pub struct YesNI;
 /// Type-level flag: the `NI` feature slot is off (used by every machine alias in this module).
 #[derive(Clone, Copy)]
 pub struct NoNI;

 use core::marker::PhantomData;

 #[derive(Copy, Clone)]
 pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
 impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
 where
     sse2::u128x1_sse2<S3, S4, NI>: Swap64,
     sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
     sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
     sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
     sse2::u128x1_sse2<S3, S4, NI>: BSwap,
     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
     sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
     sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
 {
     type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
     type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
     type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;

     type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
     type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
     type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
     type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;

     type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
     type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
     type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;

     #[inline(always)]
     unsafe fn instance() -> Self {
         SseMachine(PhantomData)
     }
 }

 #[derive(Copy, Clone)]
 pub struct Avx2Machine<NI>(PhantomData<NI>);
 impl<NI: Copy> Machine for Avx2Machine<NI>
 where
     sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
     sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
     sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
     sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
 {
     type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
     type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
     type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;

     type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
     type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
     type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
     type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;

     type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
     type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
     type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;

     #[inline(always)]
     unsafe fn instance() -> Self {
         Avx2Machine(PhantomData)
     }
 }

 /// Baseline machine: `SseMachine` with the `S3`, `S4` and `NI` flags all off.
 pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
 /// `SseMachine` with only the `S3` flag on.
 pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
 /// `SseMachine` with the `S3` and `S4` flags on.
 pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
 /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
 /// to avoid expensive SSE/VEX conflicts.
 ///
 /// Note: this alias names exactly the same type as `SSE41`.
 pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
 /// `Avx2Machine` with the `NI` flag off.
 pub type AVX2 = Avx2Machine<NoNI>;

 /// Generic wrapper for unparameterized storage of any of the possible impls.
 /// Converting into and out of this type should be essentially free, although it may be more
 /// aligned than a particular impl requires.
 ///
 /// Each field is a 16-byte view of the same memory: four `u32`, two `u64`, one `u128`, or a
 /// raw `__m128i`.
 #[allow(non_camel_case_types)]
 #[derive(Copy, Clone)]
 pub union vec128_storage {
     u32x4: [u32; 4],
     u64x2: [u64; 2],
     u128x1: [u128; 1],
     sse2: __m128i,
 }
 impl Store<vec128_storage> for vec128_storage {
     /// Identity conversion: returns `p` unchanged.
     #[inline(always)]
     unsafe fn unpack(p: vec128_storage) -> Self {
         p
     }
 }
 impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
     /// Borrows the storage as four `u32` words.
     #[inline(always)]
     fn into(self) -> &'a [u32; 4] {
         // SAFETY: every field is plain integer data filling all 16 bytes, so the bytes form a
         // valid `[u32; 4]` whichever field was written last.
         unsafe { &self.u32x4 }
     }
 }
 /// Wraps four `u32` words. `Into<vec128_storage> for [u32; 4]` follows from the standard
 /// blanket impl, so existing `.into()` callers keep working.
 impl From<[u32; 4]> for vec128_storage {
     #[inline(always)]
     fn from(words: [u32; 4]) -> Self {
         vec128_storage { u32x4: words }
     }
 }
 impl Default for vec128_storage {
     /// Returns storage with all 128 bits zero.
     #[inline(always)]
     fn default() -> Self {
         vec128_storage { u128x1: [0] }
     }
 }
 impl Eq for vec128_storage {}
 impl PartialEq for vec128_storage {
     /// Bitwise equality, compared through the `u128` view.
     #[inline(always)]
     fn eq(&self, rhs: &Self) -> bool {
         // SAFETY: every field is plain integer data filling all 16 bytes, so reading the
         // `u128` view is valid whichever field was written last.
         unsafe { self.u128x1 == rhs.u128x1 }
     }
 }

 /// 256-bit storage union: eight `u32`, four `u64`, two `u128`, two `vec128_storage` halves, or
 /// a raw `__m256i`, all viewing the same 32 bytes.
 #[allow(non_camel_case_types)]
 #[derive(Copy, Clone)]
 pub union vec256_storage {
     u32x8: [u32; 8],
     u64x4: [u64; 4],
     u128x2: [u128; 2],
     sse2: [vec128_storage; 2],
     avx: __m256i,
 }
 /// Wraps four `u64` words. `Into<vec256_storage> for [u64; 4]` follows from the standard
 /// blanket impl, so existing `.into()` callers keep working.
 impl From<[u64; 4]> for vec256_storage {
     #[inline(always)]
     fn from(words: [u64; 4]) -> Self {
         vec256_storage { u64x4: words }
     }
 }
 impl Default for vec256_storage {
     /// Returns storage with all 256 bits zero.
     #[inline(always)]
     fn default() -> Self {
         vec256_storage { u128x2: [0, 0] }
     }
 }
 impl vec256_storage {
     /// Builds the storage from its two 128-bit parts (`xs[0]` occupies the first 16 bytes).
     pub fn new128(xs: [vec128_storage; 2]) -> Self {
         Self { sse2: xs }
     }
     /// Returns the two 128-bit parts, in the same order `new128` takes them.
     pub fn split128(self) -> [vec128_storage; 2] {
         // SAFETY: every field is plain data filling all 32 bytes, so reading the
         // `[vec128_storage; 2]` view is valid whichever field was written last.
         unsafe { self.sse2 }
     }
 }
 impl Eq for vec256_storage {}
 impl PartialEq for vec256_storage {
     /// Equality of both 128-bit parts, delegated to `vec128_storage`'s `PartialEq`.
     #[inline(always)]
     fn eq(&self, rhs: &Self) -> bool {
         // SAFETY: see `split128`.
         unsafe { self.sse2 == rhs.sse2 }
     }
 }

 /// 512-bit storage union: sixteen `u32`, eight `u64`, four `u128`, four `vec128_storage`
 /// parts, or two `vec256_storage` parts, all viewing the same 64 bytes.
 #[allow(non_camel_case_types)]
 #[derive(Clone, Copy)]
 pub union vec512_storage {
     u32x16: [u32; 16],
     u64x8: [u64; 8],
     u128x4: [u128; 4],
     sse2: [vec128_storage; 4],
     avx: [vec256_storage; 2],
 }

 impl Default for vec512_storage {
     /// Returns storage with all 512 bits zero.
     #[inline(always)]
     fn default() -> Self {
         Self { u128x4: [0; 4] }
     }
 }

 impl vec512_storage {
     /// Builds the storage from its four 128-bit parts (`xs[0]` occupies the first 16 bytes).
     pub fn new128(xs: [vec128_storage; 4]) -> Self {
         vec512_storage { sse2: xs }
     }

     /// Returns the four 128-bit parts, in the same order `new128` takes them.
     pub fn split128(self) -> [vec128_storage; 4] {
         // SAFETY: every field is plain data filling all 64 bytes, so reading the
         // `[vec128_storage; 4]` view is valid whichever field was written last.
         unsafe { self.sse2 }
     }
 }

 impl Eq for vec512_storage {}

 impl PartialEq for vec512_storage {
     /// Equality of both 256-bit parts, delegated to `vec256_storage`'s `PartialEq`.
     #[inline(always)]
     fn eq(&self, rhs: &Self) -> bool {
         // SAFETY: see `split128`; the `[vec256_storage; 2]` view covers the same 64 bytes.
         let (ours, theirs) = unsafe { (&self.avx, &rhs.avx) };
         ours == theirs
     }
 }

 // Implements `Into<$target>` for the storage union `$union` by reading its `$field` view.
 macro_rules! impl_into {
     ($union:ident, $target:ty, $field:ident) => {
         impl Into<$target> for $union {
             #[inline(always)]
             fn into(self) -> $target {
                 // SAFETY: the named field is an integer array spanning the whole union, so
                 // it can be read whichever field was written last.
                 unsafe { self.$field }
             }
         }
     };
 }

 // 128-bit storage -> integer arrays.
 impl_into!(vec128_storage, [u32; 4], u32x4);
 impl_into!(vec128_storage, [u64; 2], u64x2);
 impl_into!(vec128_storage, [u128; 1], u128x1);
 // 256-bit storage -> integer arrays.
 impl_into!(vec256_storage, [u32; 8], u32x8);
 impl_into!(vec256_storage, [u64; 4], u64x4);
 impl_into!(vec256_storage, [u128; 2], u128x2);
 // 512-bit storage -> integer arrays.
 impl_into!(vec512_storage, [u32; 16], u32x16);
 impl_into!(vec512_storage, [u64; 8], u64x8);
 impl_into!(vec512_storage, [u128; 4], u128x4);

 /// Generate the full set of optimized implementations to take advantage of the most important
 /// hardware feature sets.
 ///
 /// This dispatcher is suitable for maximizing throughput.
 ///
 /// Usage: `dispatch!(mach, MachTy, { [vis] fn name(args) -> Ret { body } })`. The bracketed
 /// visibility and the `-> Ret` part are optional. `body` is compiled as a function generic over
 /// `MachTy: Machine`, with `mach` bound to the selected machine instance.
 ///
 /// With `feature = "std"` the generated function selects AVX2, AVX, SSE4.1, SSSE3 or SSE2 at
 /// run time using `is_x86_feature_detected!`; otherwise the choice is made at compile time
 /// from `cfg!(target_feature = ...)`.
 #[macro_export]
 macro_rules! dispatch {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
         // Run-time dispatch: one `#[target_feature]` wrapper per supported feature level.
         #[cfg(feature = "std")]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             #[inline(always)]
             fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
             use std::arch::x86_64::*;
             #[target_feature(enable = "avx2")]
             unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                 let out = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                 // Zero the upper halves of the YMM registers before returning.
                 _mm256_zeroupper();
                 out
             }
             #[target_feature(enable = "avx,sse4.1,ssse3")]
             unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                 let out = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                 _mm256_zeroupper();
                 out
             }
             #[target_feature(enable = "sse4.1,ssse3")]
             unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
                 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
             }
             #[target_feature(enable = "ssse3")]
             unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
                 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
             }
             #[target_feature(enable = "sse2")]
             unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
             }
             // Probe from the most to the least capable feature level.
             if is_x86_feature_detected!("avx2") {
                 return unsafe { impl_avx2($($arg),*) };
             }
             if is_x86_feature_detected!("avx") {
                 return unsafe { impl_avx($($arg),*) };
             }
             if is_x86_feature_detected!("sse4.1") {
                 return unsafe { impl_sse41($($arg),*) };
             }
             if is_x86_feature_detected!("ssse3") {
                 return unsafe { impl_ssse3($($arg),*) };
             }
             if is_x86_feature_detected!("sse2") {
                 return unsafe { impl_sse2($($arg),*) };
             }
             unimplemented!()
         }
         // Compile-time dispatch: pick the machine from the enabled target features.
         #[cfg(not(feature = "std"))]
         #[inline(always)]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
             if cfg!(target_feature = "avx2") {
                 return unsafe { fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) };
             }
             if cfg!(target_feature = "avx") {
                 return unsafe { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) };
             }
             if cfg!(target_feature = "sse4.1") {
                 return unsafe { fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) };
             }
             if cfg!(target_feature = "ssse3") {
                 return unsafe { fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) };
             }
             unsafe { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) }
         }
     };
     ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
         // No return type given: re-enter the arm above with an explicit `-> ()`.
         dispatch!($mach, $MTy, {
             $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
         });
     }
 }

 /// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
 /// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
 ///
 /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
 /// features (e.g. because they are done infrequently), so minimizing their contribution to code
 /// size is more important.
 ///
 /// Usage matches `dispatch!`: `dispatch_light128!(mach, MachTy, { [vis] fn name(args) -> Ret
 /// { body } })`, with the bracketed visibility and the return type optional.
 ///
 /// With `feature = "std"` the generated function chooses between AVX and SSE2 at run time;
 /// otherwise the machine is chosen at compile time from `cfg!(target_feature = ...)`.
 #[macro_export]
 macro_rules! dispatch_light128 {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
         // Run-time dispatch between the two generated wrappers.
         #[cfg(feature = "std")]
         $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             #[inline(always)]
             fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
             use std::arch::x86_64::*;
             #[target_feature(enable = "avx")]
             unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                 fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
             }
             #[target_feature(enable = "sse2")]
             unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
             }
             if is_x86_feature_detected!("avx") {
                 return unsafe { impl_avx($($arg),*) };
             }
             if is_x86_feature_detected!("sse2") {
                 return unsafe { impl_sse2($($arg),*) };
             }
             unimplemented!()
         }
         // Compile-time dispatch: pick the machine from the enabled target features.
         #[cfg(not(feature = "std"))]
         #[inline(always)]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
             if cfg!(target_feature = "avx2") {
                 return unsafe { fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) };
             }
             if cfg!(target_feature = "avx") {
                 return unsafe { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) };
             }
             if cfg!(target_feature = "sse4.1") {
                 return unsafe { fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) };
             }
             if cfg!(target_feature = "ssse3") {
                 return unsafe { fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) };
             }
             unsafe { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) }
         }
     };
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
         // No return type given: re-enter the arm above with an explicit `-> ()`.
         dispatch_light128!($mach, $MTy, {
             $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
         });
     }
 }

 /// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
 /// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
 ///
 /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
 /// features (e.g. because they are done infrequently), so minimizing their contribution to code
 /// size is more important.
 ///
 /// Usage matches `dispatch!`: `dispatch_light256!(mach, MachTy, { [vis] fn name(args) -> Ret
 /// { body } })`, with the bracketed visibility and the return type optional.
 ///
 /// With `feature = "std"` the generated function chooses between AVX and SSE2 at run time;
 /// otherwise the machine is chosen at compile time from `cfg!(target_feature = ...)`.
 ///
 /// NOTE(review): the summary above mentions AVX2, but no run-time AVX2 wrapper is generated in
 /// the `std` configuration -- confirm whether that is intended.
 #[macro_export]
 macro_rules! dispatch_light256 {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
         // Run-time dispatch between the two generated wrappers. The visibility tokens are
         // emitted without the surrounding brackets, which are only part of the macro's input
         // syntax.
         #[cfg(feature = "std")]
         $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             #[inline(always)]
             fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
             use std::arch::x86_64::*;
             #[target_feature(enable = "avx")]
             unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                 fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
             }
             #[target_feature(enable = "sse2")]
             unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
             }
             if is_x86_feature_detected!("avx") {
                 return unsafe { impl_avx($($arg),*) };
             }
             if is_x86_feature_detected!("sse2") {
                 return unsafe { impl_sse2($($arg),*) };
             }
             unimplemented!()
         }
         // Compile-time dispatch: pick the machine from the enabled target features.
         #[cfg(not(feature = "std"))]
         #[inline(always)]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
             if cfg!(target_feature = "avx2") {
                 return unsafe { fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) };
             }
             if cfg!(target_feature = "avx") {
                 return unsafe { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) };
             }
             if cfg!(target_feature = "sse4.1") {
                 return unsafe { fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) };
             }
             if cfg!(target_feature = "ssse3") {
                 return unsafe { fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) };
             }
             unsafe { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) }
         }
     };
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
         // No return type given: re-enter the arm above with an explicit `-> ()`.
         dispatch_light256!($mach, $MTy, {
             $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
         });
     }
 }
	// crate minimums: sse2, x86_64

	use core::arch::x86_64::{__m128i, __m256i};
	use crate::types::*;

	mod sse2;

	/// Type-level flag: the `S3` feature slot is on (as in the `SSSE3`, `SSE41` and `AVX` aliases).
	#[derive(Clone, Copy)]
	pub struct YesS3;
	/// Type-level flag: the `S3` feature slot is off (as in the `SSE2` alias).
	#[derive(Clone, Copy)]
	pub struct NoS3;

	/// Type-level flag: the `S4` feature slot is on (as in the `SSE41` and `AVX` aliases).
	#[derive(Clone, Copy)]
	pub struct YesS4;
	/// Type-level flag: the `S4` feature slot is off (as in the `SSE2` and `SSSE3` aliases).
	#[derive(Clone, Copy)]
	pub struct NoS4;

	/// Type-level flag: an `A1` feature slot is on.
	/// NOTE(review): presumably AVX; not referenced by this module's own code -- confirm.
	#[derive(Clone, Copy)]
	pub struct YesA1;
	/// Type-level flag: an `A1` feature slot is off.
	#[derive(Clone, Copy)]
	pub struct NoA1;

	/// Type-level flag: an `A2` feature slot is on.
	/// NOTE(review): presumably AVX2; not referenced by this module's own code -- confirm.
	#[derive(Clone, Copy)]
	pub struct YesA2;
	/// Type-level flag: an `A2` feature slot is off.
	#[derive(Clone, Copy)]
	pub struct NoA2;

	/// Type-level flag: the `NI` feature slot is on.
	/// NOTE(review): the meaning of `NI` is not shown in this module -- confirm.
	#[derive(Clone, Copy)]
	pub struct YesNI;
	/// Type-level flag: the `NI` feature slot is off (used by every machine alias in this module).
	#[derive(Clone, Copy)]
	pub struct NoNI;

	use core::marker::PhantomData;

	#[derive(Copy, Clone)]
	pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
	impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
	where
	sse2::u128x1_sse2<S3, S4, NI>: Swap64,
	sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
	sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
	sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
	sse2::u128x1_sse2<S3, S4, NI>: BSwap,
	sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
	sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
	sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
	sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
	sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
	{
	type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
	type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
	type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;

	type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
	type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
	type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
	type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;

	type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
	type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
	type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;

	#[inline(always)]
	unsafe fn instance() -> Self {
	SseMachine(PhantomData)
	}
	}

	#[derive(Copy, Clone)]
	pub struct Avx2Machine<NI>(PhantomData<NI>);
	impl<NI: Copy> Machine for Avx2Machine<NI>
	where
	sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
	sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
	sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
	sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
	{
	type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
	type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
	type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;

	type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
	type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
	type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
	type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;

	type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
	type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
	type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;

	#[inline(always)]
	unsafe fn instance() -> Self {
	Avx2Machine(PhantomData)
	}
	}

	/// Baseline machine: `SseMachine` with the `S3`, `S4` and `NI` flags all off.
	pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
	/// `SseMachine` with only the `S3` flag on.
	pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
	/// `SseMachine` with the `S3` and `S4` flags on.
	pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
	/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
	/// to avoid expensive SSE/VEX conflicts.
	///
	/// Note: this alias names exactly the same type as `SSE41`.
	pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
	/// `Avx2Machine` with the `NI` flag off.
	pub type AVX2 = Avx2Machine<NoNI>;

	/// Generic wrapper for unparameterized storage of any of the possible impls.
	/// Converting into and out of this type should be essentially free, although it may be more
	/// aligned than a particular impl requires.
	///
	/// Each field is a 16-byte view of the same memory: four `u32`, two `u64`, one `u128`, or a
	/// raw `__m128i`.
	#[allow(non_camel_case_types)]
	#[derive(Copy, Clone)]
	pub union vec128_storage {
	    u32x4: [u32; 4],
	    u64x2: [u64; 2],
	    u128x1: [u128; 1],
	    sse2: __m128i,
	}
	impl Store<vec128_storage> for vec128_storage {
	    /// Identity conversion: returns `p` unchanged.
	    #[inline(always)]
	    unsafe fn unpack(p: vec128_storage) -> Self {
	        p
	    }
	}
	impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
	    /// Borrows the storage as four `u32` words.
	    #[inline(always)]
	    fn into(self) -> &'a [u32; 4] {
	        // SAFETY: every field is plain integer data filling all 16 bytes, so the bytes form a
	        // valid `[u32; 4]` whichever field was written last.
	        unsafe { &self.u32x4 }
	    }
	}
	/// Wraps four `u32` words. `Into<vec128_storage> for [u32; 4]` follows from the standard
	/// blanket impl, so existing `.into()` callers keep working.
	impl From<[u32; 4]> for vec128_storage {
	    #[inline(always)]
	    fn from(words: [u32; 4]) -> Self {
	        vec128_storage { u32x4: words }
	    }
	}
	impl Default for vec128_storage {
	    /// Returns storage with all 128 bits zero.
	    #[inline(always)]
	    fn default() -> Self {
	        vec128_storage { u128x1: [0] }
	    }
	}
	impl Eq for vec128_storage {}
	impl PartialEq for vec128_storage {
	    /// Bitwise equality, compared through the `u128` view.
	    #[inline(always)]
	    fn eq(&self, rhs: &Self) -> bool {
	        // SAFETY: every field is plain integer data filling all 16 bytes, so reading the
	        // `u128` view is valid whichever field was written last.
	        unsafe { self.u128x1 == rhs.u128x1 }
	    }
	}

	/// 256-bit storage union: eight `u32`, four `u64`, two `u128`, two `vec128_storage` halves, or
	/// a raw `__m256i`, all viewing the same 32 bytes.
	#[allow(non_camel_case_types)]
	#[derive(Copy, Clone)]
	pub union vec256_storage {
	    u32x8: [u32; 8],
	    u64x4: [u64; 4],
	    u128x2: [u128; 2],
	    sse2: [vec128_storage; 2],
	    avx: __m256i,
	}
	/// Wraps four `u64` words. `Into<vec256_storage> for [u64; 4]` follows from the standard
	/// blanket impl, so existing `.into()` callers keep working.
	impl From<[u64; 4]> for vec256_storage {
	    #[inline(always)]
	    fn from(words: [u64; 4]) -> Self {
	        vec256_storage { u64x4: words }
	    }
	}
	impl Default for vec256_storage {
	    /// Returns storage with all 256 bits zero.
	    #[inline(always)]
	    fn default() -> Self {
	        vec256_storage { u128x2: [0, 0] }
	    }
	}
	impl vec256_storage {
	    /// Builds the storage from its two 128-bit parts (`xs[0]` occupies the first 16 bytes).
	    pub fn new128(xs: [vec128_storage; 2]) -> Self {
	        Self { sse2: xs }
	    }
	    /// Returns the two 128-bit parts, in the same order `new128` takes them.
	    pub fn split128(self) -> [vec128_storage; 2] {
	        // SAFETY: every field is plain data filling all 32 bytes, so reading the
	        // `[vec128_storage; 2]` view is valid whichever field was written last.
	        unsafe { self.sse2 }
	    }
	}
	impl Eq for vec256_storage {}
	impl PartialEq for vec256_storage {
	    /// Equality of both 128-bit parts, delegated to `vec128_storage`'s `PartialEq`.
	    #[inline(always)]
	    fn eq(&self, rhs: &Self) -> bool {
	        // SAFETY: see `split128`.
	        unsafe { self.sse2 == rhs.sse2 }
	    }
	}

	/// 512-bit storage union: sixteen `u32`, eight `u64`, four `u128`, four `vec128_storage`
	/// parts, or two `vec256_storage` parts, all viewing the same 64 bytes.
	#[allow(non_camel_case_types)]
	#[derive(Clone, Copy)]
	pub union vec512_storage {
	    u32x16: [u32; 16],
	    u64x8: [u64; 8],
	    u128x4: [u128; 4],
	    sse2: [vec128_storage; 4],
	    avx: [vec256_storage; 2],
	}

	impl Default for vec512_storage {
	    /// Returns storage with all 512 bits zero.
	    #[inline(always)]
	    fn default() -> Self {
	        Self { u128x4: [0; 4] }
	    }
	}

	impl vec512_storage {
	    /// Builds the storage from its four 128-bit parts (`xs[0]` occupies the first 16 bytes).
	    pub fn new128(xs: [vec128_storage; 4]) -> Self {
	        vec512_storage { sse2: xs }
	    }

	    /// Returns the four 128-bit parts, in the same order `new128` takes them.
	    pub fn split128(self) -> [vec128_storage; 4] {
	        // SAFETY: every field is plain data filling all 64 bytes, so reading the
	        // `[vec128_storage; 4]` view is valid whichever field was written last.
	        unsafe { self.sse2 }
	    }
	}

	impl Eq for vec512_storage {}

	impl PartialEq for vec512_storage {
	    /// Equality of both 256-bit parts, delegated to `vec256_storage`'s `PartialEq`.
	    #[inline(always)]
	    fn eq(&self, rhs: &Self) -> bool {
	        // SAFETY: see `split128`; the `[vec256_storage; 2]` view covers the same 64 bytes.
	        let (ours, theirs) = unsafe { (&self.avx, &rhs.avx) };
	        ours == theirs
	    }
	}

	// Implements `Into<$target>` for the storage union `$union` by reading its `$field` view.
	macro_rules! impl_into {
	    ($union:ident, $target:ty, $field:ident) => {
	        impl Into<$target> for $union {
	            #[inline(always)]
	            fn into(self) -> $target {
	                // SAFETY: the named field is an integer array spanning the whole union, so
	                // it can be read whichever field was written last.
	                unsafe { self.$field }
	            }
	        }
	    };
	}

	// 128-bit storage -> integer arrays.
	impl_into!(vec128_storage, [u32; 4], u32x4);
	impl_into!(vec128_storage, [u64; 2], u64x2);
	impl_into!(vec128_storage, [u128; 1], u128x1);
	// 256-bit storage -> integer arrays.
	impl_into!(vec256_storage, [u32; 8], u32x8);
	impl_into!(vec256_storage, [u64; 4], u64x4);
	impl_into!(vec256_storage, [u128; 2], u128x2);
	// 512-bit storage -> integer arrays.
	impl_into!(vec512_storage, [u32; 16], u32x16);
	impl_into!(vec512_storage, [u64; 8], u64x8);
	impl_into!(vec512_storage, [u128; 4], u128x4);

	/// Generate the full set of optimized implementations to take advantage of the most important
	/// hardware feature sets.
	///
	/// This dispatcher is suitable for maximizing throughput.
	///
	/// Usage: `dispatch!(mach, MachTy, { [vis] fn name(args) -> Ret { body } })`. The bracketed
	/// visibility and the `-> Ret` part are optional. `body` is compiled as a function generic over
	/// `MachTy: Machine`, with `mach` bound to the selected machine instance.
	///
	/// With `feature = "std"` the generated function selects AVX2, AVX, SSE4.1, SSSE3 or SSE2 at
	/// run time using `is_x86_feature_detected!`; otherwise the choice is made at compile time
	/// from `cfg!(target_feature = ...)`.
	#[macro_export]
	macro_rules! dispatch {
	    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
	        // Run-time dispatch: one `#[target_feature]` wrapper per supported feature level.
	        #[cfg(feature = "std")]
	        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
	            #[inline(always)]
	            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
	            use std::arch::x86_64::*;
	            #[target_feature(enable = "avx2")]
	            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
	                let out = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
	                // Zero the upper halves of the YMM registers before returning.
	                _mm256_zeroupper();
	                out
	            }
	            #[target_feature(enable = "avx,sse4.1,ssse3")]
	            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
	                let out = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
	                _mm256_zeroupper();
	                out
	            }
	            #[target_feature(enable = "sse4.1,ssse3")]
	            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
	                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
	            }
	            #[target_feature(enable = "ssse3")]
	            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
	                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
	            }
	            #[target_feature(enable = "sse2")]
	            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
	                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
	            }
	            // Probe from the most to the least capable feature level.
	            if is_x86_feature_detected!("avx2") {
	                return unsafe { impl_avx2($($arg),*) };
	            }
	            if is_x86_feature_detected!("avx") {
	                return unsafe { impl_avx($($arg),*) };
	            }
	            if is_x86_feature_detected!("sse4.1") {
	                return unsafe { impl_sse41($($arg),*) };
	            }
	            if is_x86_feature_detected!("ssse3") {
	                return unsafe { impl_ssse3($($arg),*) };
	            }
	            if is_x86_feature_detected!("sse2") {
	                return unsafe { impl_sse2($($arg),*) };
	            }
	            unimplemented!()
	        }
	        // Compile-time dispatch: pick the machine from the enabled target features.
	        #[cfg(not(feature = "std"))]
	        #[inline(always)]
	        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
	            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
	            if cfg!(target_feature = "avx2") {
	                return unsafe { fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) };
	            }
	            if cfg!(target_feature = "avx") {
	                return unsafe { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) };
	            }
	            if cfg!(target_feature = "sse4.1") {
	                return unsafe { fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) };
	            }
	            if cfg!(target_feature = "ssse3") {
	                return unsafe { fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) };
	            }
	            unsafe { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) }
	        }
	    };
	    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
	        // No return type given: re-enter the arm above with an explicit `-> ()`.
	        dispatch!($mach, $MTy, {
	            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
	        });
	    }
	}

	/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
	/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
	///
	/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
	/// features (e.g. because they are done infrequently), so minimizing their contribution to code
	/// size is more important.
	///
	/// Usage matches `dispatch!`: `dispatch_light128!(mach, MachTy, { [vis] fn name(args) -> Ret
	/// { body } })`, with the bracketed visibility and the return type optional.
	///
	/// With `feature = "std"` the generated function chooses between AVX and SSE2 at run time;
	/// otherwise the machine is chosen at compile time from `cfg!(target_feature = ...)`.
	#[macro_export]
	macro_rules! dispatch_light128 {
	    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
	        // Run-time dispatch between the two generated wrappers.
	        #[cfg(feature = "std")]
	        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
	            #[inline(always)]
	            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
	            use std::arch::x86_64::*;
	            #[target_feature(enable = "avx")]
	            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
	                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
	            }
	            #[target_feature(enable = "sse2")]
	            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
	                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
	            }
	            if is_x86_feature_detected!("avx") {
	                return unsafe { impl_avx($($arg),*) };
	            }
	            if is_x86_feature_detected!("sse2") {
	                return unsafe { impl_sse2($($arg),*) };
	            }
	            unimplemented!()
	        }
	        // Compile-time dispatch: pick the machine from the enabled target features.
	        #[cfg(not(feature = "std"))]
	        #[inline(always)]
	        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
	            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
	            if cfg!(target_feature = "avx2") {
	                return unsafe { fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) };
	            }
	            if cfg!(target_feature = "avx") {
	                return unsafe { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) };
	            }
	            if cfg!(target_feature = "sse4.1") {
	                return unsafe { fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) };
	            }
	            if cfg!(target_feature = "ssse3") {
	                return unsafe { fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) };
	            }
	            unsafe { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) }
	        }
	    };
	    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
	        // No return type given: re-enter the arm above with an explicit `-> ()`.
	        dispatch_light128!($mach, $MTy, {
	            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
	        });
	    }
	}

	/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
	/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
	///
	/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
	/// features (e.g. because they are done infrequently), so minimizing their contribution to code
	/// size is more important.
	///
	/// Usage matches `dispatch!`: `dispatch_light256!(mach, MachTy, { [vis] fn name(args) -> Ret
	/// { body } })`, with the bracketed visibility and the return type optional.
	///
	/// With `feature = "std"` the generated function chooses between AVX and SSE2 at run time;
	/// otherwise the machine is chosen at compile time from `cfg!(target_feature = ...)`.
	///
	/// NOTE(review): the summary above mentions AVX2, but no run-time AVX2 wrapper is generated in
	/// the `std` configuration -- confirm whether that is intended.
	#[macro_export]
	macro_rules! dispatch_light256 {
	    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
	        // Run-time dispatch between the two generated wrappers. The visibility tokens are
	        // emitted without the surrounding brackets, which are only part of the macro's input
	        // syntax.
	        #[cfg(feature = "std")]
	        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
	            #[inline(always)]
	            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
	            use std::arch::x86_64::*;
	            #[target_feature(enable = "avx")]
	            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
	                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
	            }
	            #[target_feature(enable = "sse2")]
	            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
	                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
	            }
	            if is_x86_feature_detected!("avx") {
	                return unsafe { impl_avx($($arg),*) };
	            }
	            if is_x86_feature_detected!("sse2") {
	                return unsafe { impl_sse2($($arg),*) };
	            }
	            unimplemented!()
	        }
	        // Compile-time dispatch: pick the machine from the enabled target features.
	        #[cfg(not(feature = "std"))]
	        #[inline(always)]
	        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
	            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
	            if cfg!(target_feature = "avx2") {
	                return unsafe { fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) };
	            }
	            if cfg!(target_feature = "avx") {
	                return unsafe { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) };
	            }
	            if cfg!(target_feature = "sse4.1") {
	                return unsafe { fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) };
	            }
	            if cfg!(target_feature = "ssse3") {
	                return unsafe { fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) };
	            }
	            unsafe { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) }
	        }
	    };
	    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
	        // No return type given: re-enter the arm above with an explicit `-> ()`.
	        dispatch_light256!($mach, $MTy, {
	            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
	        });
	    }
	}