// This file is part of the ustl library, an STL implementation.
//
// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
//
/// \file simd.h
/// \brief SIMD-type algorithms, with hardware acceleration if available.
///
/// All algorithms are container-based because iterator syntax is just too
/// damn verbose and because the specializations need to be able to tell
/// how many elements are in the container in order to choose the proper SIMD
/// instruction set (e.g. 4 floats select SSE, while 2 floats select 3dNow!).
/// Specializations are only for the tuple template because the container
/// must be of a fixed and compile-time-known size for the compiler to be
/// able to choose the specialization.
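///
/// A minimal usage sketch (illustrative only; assumes the ustl tuple header
/// is included and the element count/type matches an available acceleration):
/// \code
///     ustl::tuple<4,float> a, b;
///     // ... fill a and b ...
///     ustl::simd::padd (a, b);	// b[i] += a[i]; 4 floats let the SSE specialization be chosen
///     ustl::simd::pmul (a, b);	// b[i] *= a[i]
/// \endcode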
///
#ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
#define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
#include "uassert.h"
#include "ulimits.h"
#if HAVE_MATH_H
#include <math.h>
#endif
#if PLATFORM_ANDROID
#include <stdio.h>
#undef CPU_HAS_MMX
#endif
namespace ustl {
namespace simd {
//----------------------------------------------------------------------
// Generic algorithms
//----------------------------------------------------------------------
/// Applies \p op to each element in \p op1.
template <typename Ctr, typename UnaryOperation>
inline void packop (Ctr& op1, UnaryOperation op)
{
    foreach (typename Ctr::iterator, i, op1)
        op (*i);
}
/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores the result in \p op2.
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op)
{
    assert (op2.size() <= op1.size());
    typename Ctr::const_iterator i1 (op1.begin());
    typename Ctr::iterator i2 (op2.begin());
    for (; i2 != op2.end(); ++i1, ++i2)
        *i2 = op (*i2, *i1);
}
/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result.
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op)
{
    assert (op1.size() <= op2.size() && op1.size() <= result.size());
    passign (op1, result);
    packop (op2, result, op);
}
/// Copies \p op1 into \p result.
template <typename Ctr>
inline void passign (const Ctr& op1, Ctr& result)
{
    assert (op1.size() <= result.size());
    typename Ctr::iterator d (result.begin());
    foreach (typename Ctr::const_iterator, s, op1)
        *d++ = *s;
}
/// Copies \p result.size() elements from \p op1 to \p result.
template <typename Ctr>
inline void ipassign (typename Ctr::const_iterator op1, Ctr& result)
{
    foreach (typename Ctr::iterator, d, result)
        *d = *op1++;
}
/// Applies the conversion functor \p f to each element in \p op1 and stores the result in \p op2.
template <typename Ctr1, typename Ctr2, typename ConvertFunction>
inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f)
{
    assert (op1.size() <= op2.size());
    typename Ctr1::const_iterator i1 (op1.begin());
    typename Ctr2::iterator i2 (op2.begin());
    for (; i1 != op1.end(); ++i1, ++i2)
        *i2 = f (*i1);
}
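// Illustrative sketch of the generic entry points above (assumes the ustl
// tuple container and the plus/multiplies functors are visible here):
//	tuple<4,float> gain, samples;
//	packop (gain, samples, multiplies<float>());		// samples[i] *= gain[i]
//	tuple<4,int32_t> truncated;
//	pconvert (samples, truncated, fcast<float,int32_t>());	// fcast is defined below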
// Functionoids for SIMD operations, like saturation arithmetic, shifts, etc.
STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b))
STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b))
STD_BINARY_FUNCTOR (fpshl, T, (a << b))
STD_BINARY_FUNCTOR (fpshr, T, (a >> b))
STD_BINARY_FUNCTOR (fpmin, T, (min (a, b)))
STD_BINARY_FUNCTOR (fpmax, T, (max (a, b)))
STD_BINARY_FUNCTOR (fpavg, T, ((a + b + 1) / 2))
STD_CONVERSION_FUNCTOR (fcast, (D(a)))
#if HAVE_MATH_H
STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a))
STD_UNARY_FUNCTOR (fpsqrt, T, (reset_mmx(), T (sqrt (a))))
STD_UNARY_FUNCTOR (fprecipsqrt, T, (reset_mmx(), 1 / T(sqrt (a))))
STD_UNARY_FUNCTOR (fsin, T, (reset_mmx(), T (sin (a))))
STD_UNARY_FUNCTOR (fcos, T, (reset_mmx(), T (cos (a))))
STD_UNARY_FUNCTOR (ftan, T, (reset_mmx(), T (tan (a))))
#if HAVE_RINTF
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a))))
#else
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a))))
#endif
template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); }
#endif
template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); }
template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); }
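// For example, the saturating functors clamp instead of wrapping:
//	fpadds<uint8_t>()(250, 10) == 255	// clamps to numeric_limits<uint8_t>::max()
//	fpsubs<uint8_t>()(5, 10) == 0		// clamps to numeric_limits<uint8_t>::min()
// fpavg rounds a half up for integers ((a + b + 1) / 2), while the float and
// double specializations above average exactly, without the +1 bias.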
#define SIMD_PACKEDOP1(name, operation) \
template <typename Ctr> \
inline void name (Ctr& op1) \
{ \
    typedef typename Ctr::value_type value_t; \
    packop (op1, operation<value_t>()); \
}
#define SIMD_PACKEDOP2(name, operation) \
template <typename Ctr> \
inline void name (const Ctr& op1, Ctr& op2) \
{ \
    typedef typename Ctr::value_type value_t; \
    packop (op1, op2, operation<value_t>()); \
}
#define SIMD_PACKEDOP3(name, operation) \
template <typename Ctr> \
inline void name (const Ctr& op1, const Ctr& op2, Ctr& result) \
{ \
    typedef typename Ctr::value_type value_t; \
    packop (op1, op2, result, operation<value_t>()); \
}
#define SIMD_SINGLEOP1(name, operation) \
template <typename T> \
inline T name (T op) \
{ \
    operation<T> obj; \
    return (obj(op)); \
}
#define SIMD_CONVERTOP(name, operation) \
template <typename Ctr1, typename Ctr2> \
inline void name (const Ctr1& op1, Ctr2& op2) \
{ \
    typedef typename Ctr1::value_type value1_t; \
    typedef typename Ctr2::value_type value2_t; \
    pconvert (op1, op2, operation<value1_t, value2_t>());\
}
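// The instantiations below expand to thin wrappers; SIMD_PACKEDOP2 (padd, plus),
// for instance, generates roughly:
//	template <typename Ctr>
//	inline void padd (const Ctr& op1, Ctr& op2)
//	    { packop (op1, op2, plus<typename Ctr::value_type>()); }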
SIMD_PACKEDOP2 (padd, plus)
SIMD_PACKEDOP2 (psub, minus)
SIMD_PACKEDOP2 (pmul, multiplies)
SIMD_PACKEDOP2 (pdiv, divides)
SIMD_PACKEDOP2 (pand, bitwise_and)
SIMD_PACKEDOP2 (por, bitwise_or)
SIMD_PACKEDOP2 (pxor, bitwise_xor)
SIMD_PACKEDOP2 (pshl, fpshl)
SIMD_PACKEDOP2 (pshr, fpshr)
SIMD_PACKEDOP2 (psubs, fpsubs)
SIMD_PACKEDOP2 (pmin, fpmin)
SIMD_PACKEDOP2 (pmax, fpmax)
SIMD_PACKEDOP2 (pavg, fpavg)
SIMD_PACKEDOP3 (padd, plus)
SIMD_PACKEDOP3 (psub, minus)
SIMD_PACKEDOP3 (pmul, multiplies)
SIMD_PACKEDOP3 (pdiv, divides)
SIMD_PACKEDOP3 (pand, bitwise_and)
SIMD_PACKEDOP3 (por, bitwise_or)
SIMD_PACKEDOP3 (pxor, bitwise_xor)
SIMD_PACKEDOP3 (pshl, fpshl)
SIMD_PACKEDOP3 (pshr, fpshr)
SIMD_PACKEDOP3 (padds, fpadds)
SIMD_PACKEDOP3 (psubs, fpsubs)
SIMD_PACKEDOP3 (pmin, fpmin)
SIMD_PACKEDOP3 (pmax, fpmax)
SIMD_PACKEDOP3 (pavg, fpavg)
#if HAVE_MATH_H
SIMD_PACKEDOP1 (precip, fpreciprocal)
SIMD_PACKEDOP1 (psqrt, fpsqrt)
SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt)
SIMD_PACKEDOP1 (psin, fsin)
SIMD_PACKEDOP1 (pcos, fcos)
SIMD_PACKEDOP1 (ptan, ftan)
SIMD_SINGLEOP1 (srecip, fpreciprocal)
SIMD_SINGLEOP1 (ssqrt, fpsqrt)
SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt)
SIMD_SINGLEOP1 (ssin, fsin)
SIMD_SINGLEOP1 (scos, fcos)
SIMD_SINGLEOP1 (stan, ftan)
SIMD_CONVERTOP (pround, fround)
template <typename T> inline int32_t sround (T op) { fround<T,int32_t> obj; return (obj (op)); }
#endif
#undef SIMD_SINGLEOP1
#undef SIMD_PACKEDOP3
#undef SIMD_PACKEDOP2
#undef SIMD_PACKEDOP1
//----------------------------------------------------------------------
// Vector types to cast tuple data to
//----------------------------------------------------------------------
#if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4
#define VECTOR_ATTRIBUTE(mode,vs) __attribute__((vector_size(vs)))
#else
#define VECTOR_ATTRIBUTE(mode,vs)
#endif
typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8);
typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8);
typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16);
typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8);
typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16);
#if HAVE_INT64_T
typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8);
#endif
typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8);
typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16);
typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16);
#undef VECTOR_ATTRIBUTE
//----------------------------------------------------------------------
// Hardware accelerated specializations
//----------------------------------------------------------------------
#define SIMD_PKOP2_SPEC(n, type, optype) \
template <> \
inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)
#define SIMD_PASSIGN_SPEC(n, type) \
template <> \
inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)
#define SIMD_IPASSIGN_SPEC(n, type) \
template <> \
inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)
#define SIMD_CONVERT_SPEC(n, type1, type2, optype) \
template <> \
inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)
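// Each *_SPEC macro below stamps out a full specialization of one generic
// algorithm for a fixed-size tuple. MMX_PKOP2_SPEC(8,uint8_t,plus,paddb),
// for example, produces roughly:
//	template <> inline void packop (const tuple<8,uint8_t>& oin, tuple<8,uint8_t>& oout, plus<uint8_t>)
//	    { asm ("movq %0, %%mm0\n\tpaddb %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
// so the whole 64-bit tuple is processed with a single MMX instruction.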
#if CPU_HAS_MMX
#define STD_MMX_ARGS "=m"(oout[0]) : "m"(oin[0]) : "mm0", "st", "memory"
#define DBL_MMX_ARGS "=m"(oout[0]), "=m"(oout[2]) : "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"
#define MMX_PKOP2_SPEC(n,type,optype,instruction) \
SIMD_PKOP2_SPEC(n,type,optype) \
{ asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction) \
SIMD_PKOP2_SPEC(n,type,optype) \
{ asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
#define MMX_PASSIGN_SPEC(n,type) \
SIMD_PASSIGN_SPEC(n,type) \
{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_PASSIGN_SPEC(n,type) \
SIMD_PASSIGN_SPEC(n,type) \
{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
#define MMX_IPASSIGN_SPEC(n,type) \
SIMD_IPASSIGN_SPEC(n,type) \
{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_IPASSIGN_SPEC(n,type) \
SIMD_IPASSIGN_SPEC(n,type) \
{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
MMX_PASSIGN_SPEC(8,uint8_t)
MMX_PKOP2_SPEC(8,uint8_t,plus,paddb)
MMX_PKOP2_SPEC(8,uint8_t,minus,psubb)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_and,pand)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_or,por)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor,pxor)
MMX_PKOP2_SPEC(8,uint8_t,fpadds,paddusb)
MMX_PKOP2_SPEC(8,uint8_t,fpsubs,psubusb)
MMX_PASSIGN_SPEC(8,int8_t)
MMX_PKOP2_SPEC(8,int8_t,plus,paddb)
MMX_PKOP2_SPEC(8,int8_t,minus,psubb)
MMX_PKOP2_SPEC(8,int8_t,bitwise_and,pand)
MMX_PKOP2_SPEC(8,int8_t,bitwise_or,por)
MMX_PKOP2_SPEC(8,int8_t,bitwise_xor,pxor)
MMX_PKOP2_SPEC(8,int8_t,fpadds,paddsb)
MMX_PKOP2_SPEC(8,int8_t,fpsubs,psubsb)
MMX_PASSIGN_SPEC(4,uint16_t)
MMX_PKOP2_SPEC(4,uint16_t,plus,paddw)
MMX_PKOP2_SPEC(4,uint16_t,minus,psubw)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_and,pand)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_or,por)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor,pxor)
/// \todo psllw does not work like the other operations; it uses the first element as the shift count.
//MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)
MMX_PKOP2_SPEC(4,uint16_t,fpadds,paddusw)
MMX_PKOP2_SPEC(4,uint16_t,fpsubs,psubusw)
MMX_PASSIGN_SPEC(4,int16_t)
MMX_PKOP2_SPEC(4,int16_t,plus,paddw)
MMX_PKOP2_SPEC(4,int16_t,minus,psubw)
MMX_PKOP2_SPEC(4,int16_t,bitwise_and,pand)
MMX_PKOP2_SPEC(4,int16_t,bitwise_or,por)
MMX_PKOP2_SPEC(4,int16_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)
MMX_PKOP2_SPEC(4,int16_t,fpadds,paddsw)
MMX_PKOP2_SPEC(4,int16_t,fpsubs,psubsw)
MMX_PASSIGN_SPEC(2,uint32_t)
MMX_PKOP2_SPEC(2,uint32_t,plus,paddd)
MMX_PKOP2_SPEC(2,uint32_t,minus,psubd)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_and,pand)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_or,por)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)
MMX_PASSIGN_SPEC(2,int32_t)
MMX_PKOP2_SPEC(2,int32_t,plus,paddd)
MMX_PKOP2_SPEC(2,int32_t,minus,psubd)
MMX_PKOP2_SPEC(2,int32_t,bitwise_and,pand)
MMX_PKOP2_SPEC(2,int32_t,bitwise_or,por)
MMX_PKOP2_SPEC(2,int32_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)
MMX_DBL_PKOP2_SPEC(4,uint32_t,plus,paddd)
MMX_DBL_PKOP2_SPEC(4,uint32_t,minus,psubd)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and,pand)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or,por)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor,pxor)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)
MMX_DBL_PKOP2_SPEC(4,int32_t,plus,paddd)
MMX_DBL_PKOP2_SPEC(4,int32_t,minus,psubd)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and,pand)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or,por)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor,pxor)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)
#if CPU_HAS_SSE || CPU_HAS_3DNOW
MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(8,int8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw)
MMX_PKOP2_SPEC(4,int16_t,fpavg,pavgw)
MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub)
MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub)
MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw)
MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw)
#endif // CPU_HAS_SSE || CPU_HAS_3DNOW
#if CPU_HAS_3DNOW
MMX_PASSIGN_SPEC(2,float)
MMX_PKOP2_SPEC(2,float,plus,pfadd)
MMX_PKOP2_SPEC(2,float,minus,pfsub)
MMX_PKOP2_SPEC(2,float,multiplies,pfmul)
MMX_PKOP2_SPEC(2,float,fpmin,pfmin)
MMX_PKOP2_SPEC(2,float,fpmax,pfmax)
#ifndef CPU_HAS_SSE
MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd)
MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub)
MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul)
MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin)
MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax)
#endif
#endif // CPU_HAS_3DNOW
MMX_IPASSIGN_SPEC(8,uint8_t)
MMX_IPASSIGN_SPEC(4,uint16_t)
MMX_IPASSIGN_SPEC(2,uint32_t)
MMX_IPASSIGN_SPEC(2,float)
#ifndef CPU_HAS_SSE
MMX_DBL_PASSIGN_SPEC(4,float)
MMX_DBL_PASSIGN_SPEC(4,uint32_t)
MMX_DBL_PASSIGN_SPEC(4,int32_t)
MMX_DBL_IPASSIGN_SPEC(4,float)
MMX_DBL_IPASSIGN_SPEC(4,uint32_t)
MMX_DBL_IPASSIGN_SPEC(4,int32_t)
#endif
#undef MMX_DBL_IPASSIGN_SPEC
#undef MMX_DBL_PASSIGN_SPEC
#undef MMX_IPASSIGN_SPEC
#undef MMX_PASSIGN_SPEC
#undef MMX_DBL_PKOP2_SPEC
#undef MMX_PKOP2_SPEC
#undef STD_MMX_ARGS
// DBL_MMX_ARGS remains defined; the SSE pconvert specialization below still uses it.
#endif // CPU_HAS_MMX
#if CPU_HAS_SSE
// xmm1 is modified by SSE_PKOP2_SPEC, so both registers are listed as clobbered.
#define STD_SSE_ARGS "=m"(oout[0]) : "m"(oin[0]) : "xmm0", "xmm1", "memory"
#define SSE_PKOP2_SPEC(n,type,optype,instruction) \
SIMD_PKOP2_SPEC(n,type,optype) \
{ asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
#define SSE_PASSIGN_SPEC(n,type) \
SIMD_PASSIGN_SPEC(n,type) \
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
#define SSE_IPASSIGN_SPEC(n,type) \
SIMD_IPASSIGN_SPEC(n,type) \
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
SSE_PASSIGN_SPEC(4,float)
SSE_PASSIGN_SPEC(4,int32_t)
SSE_PASSIGN_SPEC(4,uint32_t)
SSE_PKOP2_SPEC(4,float,plus,addps)
SSE_PKOP2_SPEC(4,float,minus,subps)
SSE_PKOP2_SPEC(4,float,multiplies,mulps)
SSE_PKOP2_SPEC(4,float,divides,divps)
SSE_PKOP2_SPEC(4,float,bitwise_and,andps)
SSE_PKOP2_SPEC(4,float,bitwise_or,orps)
SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps)
SSE_PKOP2_SPEC(4,float,fpmax,maxps)
SSE_PKOP2_SPEC(4,float,fpmin,minps)
SIMD_CONVERT_SPEC(4,float,int32_t,fround) {
    asm ("cvtps2pi %2, %%mm0\n\t"
         "cvtps2pi %3, %%mm1\n\t"
         "movq %%mm0, %0\n\t"
         "movq %%mm1, %1"
         : DBL_MMX_ARGS);
    reset_mmx();
}
SIMD_CONVERT_SPEC(4,int32_t,float,fround) {
    asm ("cvtpi2ps %2, %%xmm0\n\t"
         "shufps $0x4E,%%xmm0,%%xmm0\n\t"
         "cvtpi2ps %1, %%xmm0\n\t"
         "movups %%xmm0, %0"
         : "=m"(oout[0]) : "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory");
}
template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const {
    register int32_t rv;
    asm ("movss %1, %%xmm0\n\t"
         "cvtss2si %%xmm0, %0"
         : "=r"(rv) : "m"(a) : "xmm0" );
    return (rv);
}
template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const {
    register uint32_t rv;
    asm ("movss %1, %%xmm0\n\t"
         "cvtss2si %%xmm0, %0"
         : "=r"(rv) : "m"(a) : "xmm0" );
    return (rv);
}
SSE_IPASSIGN_SPEC(4,float)
SSE_IPASSIGN_SPEC(4,int32_t)
SSE_IPASSIGN_SPEC(4,uint32_t)
#undef SSE_IPASSIGN_SPEC
#undef SSE_PASSIGN_SPEC
#undef SSE_PKOP2_SPEC
#undef STD_SSE_ARGS
#endif // CPU_HAS_SSE
#undef SIMD_CONVERT_SPEC
#undef SIMD_IPASSIGN_SPEC
#undef SIMD_PASSIGN_SPEC
#undef SIMD_PKOP2_SPEC
} // namespace simd
} // namespace ustl
#endif