sources/third_party/vulkan/src/libs/glm/gtx/simd_quat.inl - toolchain/prebuilts/ndk/r13 - Git at Google

 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // OpenGL Mathematics Copyright (c) 2005 - 2014 G-Truc Creation (www.g-truc.net)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Created : 2013-04-22
 // Updated : 2013-04-22
 // Licence : This source is under MIT License
 // File    : glm/gtx/simd_quat.inl
 ///////////////////////////////////////////////////////////////////////////////////////////////////


 namespace glm{
 namespace detail{


 //////////////////////////////////////
 // Debugging
 #if 0
 void print(__m128 v)
 {
     GLM_ALIGN(16) float result[4];
     _mm_store_ps(result, v);

     printf("__m128:    %f %f %f %f\n", result[0], result[1], result[2], result[3]);
 }

 void print(const fvec4SIMD &v)
 {
     printf("fvec4SIMD: %f %f %f %f\n", v.x, v.y, v.z, v.w);
 }
 #endif


 //////////////////////////////////////
 // Implicit basic constructors

 GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD()
 #ifdef GLM_SIMD_ENABLE_DEFAULT_INIT
     : Data(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f))
 #endif
 {}

 GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(__m128 const & Data) :
 	Data(Data)
 {}

 GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(fquatSIMD const & q) :
 	Data(q.Data)
 {}


 //////////////////////////////////////
 // Explicit basic constructors

 GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(float const & w, float const & x, float const & y, float const & z) :
 	Data(_mm_set_ps(w, z, y, x))
 {}

 GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(quat const & q) :
 	Data(_mm_set_ps(q.w, q.z, q.y, q.x))
 {}

 GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(vec3 const & eulerAngles)
 {
     vec3 c = glm::cos(eulerAngles * 0.5f);
 	vec3 s = glm::sin(eulerAngles * 0.5f);

     Data = _mm_set_ps(
         (c.x * c.y * c.z) + (s.x * s.y * s.z),
         (c.x * c.y * s.z) - (s.x * s.y * c.z),
         (c.x * s.y * c.z) + (s.x * c.y * s.z),
         (s.x * c.y * c.z) - (c.x * s.y * s.z));
 }


 //////////////////////////////////////
 // Unary arithmetic operators

 GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator=(fquatSIMD const & q)
 {
     this->Data = q.Data;
     return *this;
 }

 GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator*=(float const & s)
 {
 	this->Data = _mm_mul_ps(this->Data, _mm_set_ps1(s));
 	return *this;
 }

 GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator/=(float const & s)
 {
 	this->Data = _mm_div_ps(Data, _mm_set1_ps(s));
 	return *this;
 }


 // negate operator
 GLM_FUNC_QUALIFIER fquatSIMD operator- (fquatSIMD const & q)
 {
     return fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(-1.0f, -1.0f, -1.0f, -1.0f)));
 }

 // operator+
 GLM_FUNC_QUALIFIER fquatSIMD operator+ (fquatSIMD const & q1, fquatSIMD const & q2)
 {
 	return fquatSIMD(_mm_add_ps(q1.Data, q2.Data));
 }

 //operator*
 GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const & q2)
 {
     // SSE2 STATS:
     //    11 shuffle
     //    8  mul
     //    8  add

     // SSE4 STATS:
     //    3 shuffle
     //    4 mul
     //    4 dpps

     __m128 mul0 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
     __m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
     __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
     __m128 mul3 = _mm_mul_ps(q1.Data, q2.Data);

 #   if((GLM_ARCH & GLM_ARCH_SSE4))
     __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f), 0xff);
     __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f), 0xff);
     __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f), 0xff);
     __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
 #   else
            mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f));
     __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
            add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));

            mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f));
     __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
            add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));

            mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f));
     __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
            add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));

            mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
     __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
            add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
 #endif


     // This SIMD code is a politically correct way of doing this, but in every test I've tried it has been slower than
     // the final code below. I'll keep this here for reference - maybe somebody else can do something better...
     //
     //__m128 xxyy = _mm_shuffle_ps(add0, add1, _MM_SHUFFLE(0, 0, 0, 0));
     //__m128 zzww = _mm_shuffle_ps(add2, add3, _MM_SHUFFLE(0, 0, 0, 0));
     //
     //return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));

     float x;
     float y;
     float z;
     float w;

     _mm_store_ss(&x, add0);
     _mm_store_ss(&y, add1);
     _mm_store_ss(&z, add2);
     _mm_store_ss(&w, add3);

     return detail::fquatSIMD(w, x, y, z);
 }

 GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v)
 {
     static const __m128 two = _mm_set1_ps(2.0f);

     __m128 q_wwww  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3));
     __m128 q_swp0  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1));
 	__m128 q_swp1  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2));
 	__m128 v_swp0  = _mm_shuffle_ps(v.Data, v.Data, _MM_SHUFFLE(3, 0, 2, 1));
 	__m128 v_swp1  = _mm_shuffle_ps(v.Data, v.Data, _MM_SHUFFLE(3, 1, 0, 2));

 	__m128 uv      = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0));
     __m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1));
     __m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2));
     __m128 uuv     = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0));


     uv  = _mm_mul_ps(uv,  _mm_mul_ps(q_wwww, two));
     uuv = _mm_mul_ps(uuv, two);

     return _mm_add_ps(v.Data, _mm_add_ps(uv, uuv));
 }

 GLM_FUNC_QUALIFIER fvec4SIMD operator* (fvec4SIMD const & v, fquatSIMD const & q)
 {
 	return glm::inverse(q) * v;
 }

 GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q, float s)
 {
 	return fquatSIMD(_mm_mul_ps(q.Data, _mm_set1_ps(s)));
 }

 GLM_FUNC_QUALIFIER fquatSIMD operator* (float s, fquatSIMD const & q)
 {
 	return fquatSIMD(_mm_mul_ps(_mm_set1_ps(s), q.Data));
 }


 //operator/
 GLM_FUNC_QUALIFIER fquatSIMD operator/ (fquatSIMD const & q, float s)
 {
 	return fquatSIMD(_mm_div_ps(q.Data, _mm_set1_ps(s)));
 }


 }//namespace detail


 GLM_FUNC_QUALIFIER quat quat_cast
 (
 	detail::fquatSIMD const & x
 )
 {
 	GLM_ALIGN(16) quat Result;
 	_mm_store_ps(&Result[0], x.Data);

 	return Result;
 }

 template <typename T>
 GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast_impl(const T m0[], const T m1[], const T m2[])
 {
     T trace = m0[0] + m1[1] + m2[2] + T(1.0);
     if (trace > T(0))
     {
         T s = static_cast<T>(0.5) / sqrt(trace);

         return _mm_set_ps(
             static_cast<float>(T(0.25) / s),
             static_cast<float>((m0[1] - m1[0]) * s),
             static_cast<float>((m2[0] - m0[2]) * s),
             static_cast<float>((m1[2] - m2[1]) * s));
     }
     else
     {
         if (m0[0] > m1[1])
         {
             if (m0[0] > m2[2])
             {
                 // X is biggest.
                 T s = sqrt(m0[0] - m1[1] - m2[2] + T(1.0)) * T(0.5);

                 return _mm_set_ps(
                     static_cast<float>((m1[2] - m2[1]) * s),
                     static_cast<float>((m2[0] + m0[2]) * s),
                     static_cast<float>((m0[1] + m1[0]) * s),
                     static_cast<float>(T(0.5)          * s));
             }
         }
         else
         {
             if (m1[1] > m2[2])
             {
                 // Y is biggest.
                 T s = sqrt(m1[1] - m0[0] - m2[2] + T(1.0)) * T(0.5);

                 return _mm_set_ps(
                     static_cast<float>((m2[0] - m0[2]) * s),
                     static_cast<float>((m1[2] + m2[1]) * s),
                     static_cast<float>(T(0.5)          * s),
                     static_cast<float>((m0[1] + m1[0]) * s));
             }
         }

         // Z is biggest.
         T s = sqrt(m2[2] - m0[0] - m1[1] + T(1.0)) * T(0.5);

         return _mm_set_ps(
             static_cast<float>((m0[1] - m1[0]) * s),
             static_cast<float>(T(0.5)          * s),
             static_cast<float>((m1[2] + m2[1]) * s),
             static_cast<float>((m2[0] + m0[2]) * s));
     }
 }

 GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
 (
 	detail::fmat4x4SIMD const & m
 )
 {
     // Scalar implementation for now.
     GLM_ALIGN(16) float m0[4];
     GLM_ALIGN(16) float m1[4];
     GLM_ALIGN(16) float m2[4];

     _mm_store_ps(m0, m[0].Data);
     _mm_store_ps(m1, m[1].Data);
     _mm_store_ps(m2, m[2].Data);

     return quatSIMD_cast_impl(m0, m1, m2);
 }

 template <typename T, precision P>
 GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
 (
     detail::tmat4x4<T, P> const & m
 )
 {
     return quatSIMD_cast_impl(&m[0][0], &m[1][0], &m[2][0]);
 }

 template <typename T, precision P>
 GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
 (
     detail::tmat3x3<T, P> const & m
 )
 {
     return quatSIMD_cast_impl(&m[0][0], &m[1][0], &m[2][0]);
 }


 GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast
 (
 	detail::fquatSIMD const & q
 )
 {
     detail::fmat4x4SIMD result;

     __m128 _wwww  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3));
     __m128 _xyzw  = q.Data;
     __m128 _zxyw  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2));
     __m128 _yzxw  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1));

     __m128 _xyzw2 = _mm_add_ps(_xyzw, _xyzw);
     __m128 _zxyw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 1, 0, 2));
     __m128 _yzxw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 0, 2, 1));

     __m128 _tmp0  = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_yzxw2, _yzxw));
            _tmp0  = _mm_sub_ps(_tmp0, _mm_mul_ps(_zxyw2, _zxyw));

     __m128 _tmp1  = _mm_mul_ps(_yzxw2, _xyzw);
            _tmp1  = _mm_add_ps(_tmp1, _mm_mul_ps(_zxyw2, _wwww));

     __m128 _tmp2  = _mm_mul_ps(_zxyw2, _xyzw);
            _tmp2  = _mm_sub_ps(_tmp2, _mm_mul_ps(_yzxw2, _wwww));


     // There's probably a better, more politically correct way of doing this...
     result[0].Data = _mm_set_ps(
         0.0f,
         reinterpret_cast<float*>(&_tmp2)[0],
         reinterpret_cast<float*>(&_tmp1)[0],
         reinterpret_cast<float*>(&_tmp0)[0]);

     result[1].Data = _mm_set_ps(
         0.0f,
         reinterpret_cast<float*>(&_tmp1)[1],
         reinterpret_cast<float*>(&_tmp0)[1],
         reinterpret_cast<float*>(&_tmp2)[1]);

     result[2].Data = _mm_set_ps(
         0.0f,
         reinterpret_cast<float*>(&_tmp0)[2],
         reinterpret_cast<float*>(&_tmp2)[2],
         reinterpret_cast<float*>(&_tmp1)[2]);

    result[3].Data = _mm_set_ps(
         1.0f,
         0.0f,
         0.0f,
         0.0f);


     return result;
 }

 GLM_FUNC_QUALIFIER mat4 mat4_cast
 (
 	detail::fquatSIMD const & q
 )
 {
     return mat4_cast(mat4SIMD_cast(q));
 }


 GLM_FUNC_QUALIFIER float length
 (
 	detail::fquatSIMD const & q
 )
 {
     return glm::sqrt(dot(q, q));
 }

 GLM_FUNC_QUALIFIER detail::fquatSIMD normalize
 (
 	detail::fquatSIMD const & q
 )
 {
     return _mm_mul_ps(q.Data, _mm_set1_ps(1.0f / length(q)));
 }

 GLM_FUNC_QUALIFIER float dot
 (
 	detail::fquatSIMD const & q1,
 	detail::fquatSIMD const & q2
 )
 {
     float result;
     _mm_store_ss(&result, detail::sse_dot_ps(q1.Data, q2.Data));

     return result;
 }

 GLM_FUNC_QUALIFIER detail::fquatSIMD mix
 (
 	detail::fquatSIMD const & x,
 	detail::fquatSIMD const & y,
 	float const & a
 )
 {
 	float cosTheta = dot(x, y);

     if (cosTheta > 1.0f - glm::epsilon<float>())
     {
 	    return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
     }
     else
     {
         float angle = glm::acos(cosTheta);


         float s0 = glm::sin((1.0f - a) * angle);
         float s1 = glm::sin(a * angle);
         float d  = 1.0f / glm::sin(angle);

         return (s0 * x + s1 * y) * d;
     }
 }

 GLM_FUNC_QUALIFIER detail::fquatSIMD lerp
 (
 	detail::fquatSIMD const & x,
 	detail::fquatSIMD const & y,
 	float const & a
 )
 {
 	// Lerp is only defined in [0, 1]
 	assert(a >= 0.0f);
 	assert(a <= 1.0f);

     return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
 }

 GLM_FUNC_QUALIFIER detail::fquatSIMD slerp
 (
 	detail::fquatSIMD const & x,
 	detail::fquatSIMD const & y,
 	float const & a
 )
 {
 	detail::fquatSIMD z = y;

 	float cosTheta = dot(x, y);

 	// If cosTheta < 0, the interpolation will take the long way around the sphere.
 	// To fix this, one quat must be negated.
 	if (cosTheta < 0.0f)
 	{
 		z        = -y;
 		cosTheta = -cosTheta;
 	}

 	// Perform a linear interpolation when cosTheta is close to 1 to avoid side effect of sin(angle) becoming a zero denominator
 	if(cosTheta > 1.0f - epsilon<float>())
 	{
 		return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
 	}
 	else
 	{
         float angle = glm::acos(cosTheta);


 		float s0 = glm::sin((1.0f - a) * angle);
         float s1 = glm::sin(a * angle);
         float d  = 1.0f / glm::sin(angle);

         return (s0 * x + s1 * y) * d;
 	}
 }


 GLM_FUNC_QUALIFIER detail::fquatSIMD fastMix
 (
 	detail::fquatSIMD const & x,
 	detail::fquatSIMD const & y,
 	float const & a
 )
 {
 	float cosTheta = dot(x, y);

     if (cosTheta > 1.0f - glm::epsilon<float>())
     {
 	    return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
     }
     else
     {
         float angle = glm::fastAcos(cosTheta);


         __m128 s  = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f));

         __m128 s0 =                               _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3));
         __m128 s1 =                               _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2));
         __m128 d  = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1)));

         return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
     }
 }

 GLM_FUNC_QUALIFIER detail::fquatSIMD fastSlerp
 (
 	detail::fquatSIMD const & x,
 	detail::fquatSIMD const & y,
 	float const & a
 )
 {
 	detail::fquatSIMD z = y;

 	float cosTheta = dot(x, y);
 	if (cosTheta < 0.0f)
 	{
 		z        = -y;
 		cosTheta = -cosTheta;
 	}


 	if(cosTheta > 1.0f - epsilon<float>())
 	{
 		return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
 	}
 	else
 	{
         float angle = glm::fastAcos(cosTheta);


         __m128 s  = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f));

         __m128 s0 =                               _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3));
         __m128 s1 =                               _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2));
         __m128 d  = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1)));

         return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
 	}
 }


 GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate
 (
 	detail::fquatSIMD const & q
 )
 {
 	return detail::fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f)));
 }

 GLM_FUNC_QUALIFIER detail::fquatSIMD inverse
 (
 	detail::fquatSIMD const & q
 )
 {
 	return conjugate(q) / dot(q, q);
 }


 GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
 (
 	float const & angle,
 	vec3 const & v
 )
 {
 #ifdef GLM_FORCE_RADIANS
 	float a(angle);
 #else
 #	pragma message("GLM: rotateZ function taking degrees as parameters is deprecated. #define GLM_FORCE_RADIANS before including GLM headers to remove this message.")
 	float a(glm::radians(angle));
 #endif
 	float s = glm::sin(a * 0.5f);

 	return _mm_set_ps(
 		glm::cos(a * 0.5f),
 		v.z * s,
 		v.y * s,
 		v.x * s);
 }

 GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
 (
 	float const & angle,
 	float const & x,
 	float const & y,
 	float const & z
 )
 {
 	return angleAxisSIMD(angle, vec3(x, y, z));
 }


 GLM_FUNC_QUALIFIER __m128 fastSin(__m128 x)
 {
     static const __m128 c0 = _mm_set1_ps(0.16666666666666666666666666666667f);
     static const __m128 c1 = _mm_set1_ps(0.00833333333333333333333333333333f);
     static const __m128 c2 = _mm_set1_ps(0.00019841269841269841269841269841f);

     __m128 x3 = _mm_mul_ps(x,  _mm_mul_ps(x, x));
     __m128 x5 = _mm_mul_ps(x3, _mm_mul_ps(x, x));
     __m128 x7 = _mm_mul_ps(x5, _mm_mul_ps(x, x));

     __m128 y0 = _mm_mul_ps(x3, c0);
     __m128 y1 = _mm_mul_ps(x5, c1);
     __m128 y2 = _mm_mul_ps(x7, c2);

     return _mm_sub_ps(_mm_add_ps(_mm_sub_ps(x, y0), y1), y2);
 }


 }//namespace glm
	///////////////////////////////////////////////////////////////////////////////////////////////////
	// OpenGL Mathematics Copyright (c) 2005 - 2014 G-Truc Creation (www.g-truc.net)
	///////////////////////////////////////////////////////////////////////////////////////////////////
	// Created : 2013-04-22
	// Updated : 2013-04-22
	// Licence : This source is under MIT License
	// File : glm/gtx/simd_quat.inl
	///////////////////////////////////////////////////////////////////////////////////////////////////


	namespace glm{
	namespace detail{


	//////////////////////////////////////
	// Debugging
	#if 0
	void print(__m128 v)
	{
	GLM_ALIGN(16) float result[4];
	_mm_store_ps(result, v);

	printf("__m128: %f %f %f %f\n", result[0], result[1], result[2], result[3]);
	}

	void print(const fvec4SIMD &v)
	{
	printf("fvec4SIMD: %f %f %f %f\n", v.x, v.y, v.z, v.w);
	}
	#endif


	//////////////////////////////////////
	// Implicit basic constructors

	GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD()
	#ifdef GLM_SIMD_ENABLE_DEFAULT_INIT
	: Data(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f))
	#endif
	{}

	GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(__m128 const & Data) :
	Data(Data)
	{}

	GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(fquatSIMD const & q) :
	Data(q.Data)
	{}


	//////////////////////////////////////
	// Explicit basic constructors

	GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(float const & w, float const & x, float const & y, float const & z) :
	Data(_mm_set_ps(w, z, y, x))
	{}

	GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(quat const & q) :
	Data(_mm_set_ps(q.w, q.z, q.y, q.x))
	{}

	GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(vec3 const & eulerAngles)
	{
	vec3 c = glm::cos(eulerAngles * 0.5f);
	vec3 s = glm::sin(eulerAngles * 0.5f);

	Data = _mm_set_ps(
	(c.x * c.y * c.z) + (s.x * s.y * s.z),
	(c.x * c.y * s.z) - (s.x * s.y * c.z),
	(c.x * s.y * c.z) + (s.x * c.y * s.z),
	(s.x * c.y * c.z) - (c.x * s.y * s.z));
	}


	//////////////////////////////////////
	// Unary arithmetic operators

	GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator=(fquatSIMD const & q)
	{
	this->Data = q.Data;
	return *this;
	}

	GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator*=(float const & s)
	{
	this->Data = _mm_mul_ps(this->Data, _mm_set_ps1(s));
	return *this;
	}

	GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator/=(float const & s)
	{
	this->Data = _mm_div_ps(Data, _mm_set1_ps(s));
	return *this;
	}



	// negate operator
	GLM_FUNC_QUALIFIER fquatSIMD operator- (fquatSIMD const & q)
	{
	return fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(-1.0f, -1.0f, -1.0f, -1.0f)));
	}

	// operator+
	GLM_FUNC_QUALIFIER fquatSIMD operator+ (fquatSIMD const & q1, fquatSIMD const & q2)
	{
	return fquatSIMD(_mm_add_ps(q1.Data, q2.Data));
	}

	//operator*
	GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const & q2)
	{
	// SSE2 STATS:
	// 11 shuffle
	// 8 mul
	// 8 add

	// SSE4 STATS:
	// 3 shuffle
	// 4 mul
	// 4 dpps

	__m128 mul0 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
	__m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
	__m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
	__m128 mul3 = _mm_mul_ps(q1.Data, q2.Data);

	# if((GLM_ARCH & GLM_ARCH_SSE4))
	__m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
	__m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
	__m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
	__m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
	# else
	mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
	__m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
	add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));

	mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
	__m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
	add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));

	mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
	__m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
	add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));

	mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
	__m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
	add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
	#endif


	// This SIMD code is a politically correct way of doing this, but in every test I've tried it has been slower than
	// the final code below. I'll keep this here for reference - maybe somebody else can do something better...
	//
	//__m128 xxyy = _mm_shuffle_ps(add0, add1, _MM_SHUFFLE(0, 0, 0, 0));
	//__m128 zzww = _mm_shuffle_ps(add2, add3, _MM_SHUFFLE(0, 0, 0, 0));
	//
	//return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));

	float x;
	float y;
	float z;
	float w;

	_mm_store_ss(&x, add0);
	_mm_store_ss(&y, add1);
	_mm_store_ss(&z, add2);
	_mm_store_ss(&w, add3);

	return detail::fquatSIMD(w, x, y, z);
	}

	GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v)
	{
	static const __m128 two = _mm_set1_ps(2.0f);

	__m128 q_wwww = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 q_swp0 = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 q_swp1 = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 v_swp0 = _mm_shuffle_ps(v.Data, v.Data, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 v_swp1 = _mm_shuffle_ps(v.Data, v.Data, _MM_SHUFFLE(3, 1, 0, 2));

	__m128 uv = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0));
	__m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 uuv = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0));


	uv = _mm_mul_ps(uv, _mm_mul_ps(q_wwww, two));
	uuv = _mm_mul_ps(uuv, two);

	return _mm_add_ps(v.Data, _mm_add_ps(uv, uuv));
	}

	GLM_FUNC_QUALIFIER fvec4SIMD operator* (fvec4SIMD const & v, fquatSIMD const & q)
	{
	return glm::inverse(q) * v;
	}

	GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q, float s)
	{
	return fquatSIMD(_mm_mul_ps(q.Data, _mm_set1_ps(s)));
	}

	GLM_FUNC_QUALIFIER fquatSIMD operator* (float s, fquatSIMD const & q)
	{
	return fquatSIMD(_mm_mul_ps(_mm_set1_ps(s), q.Data));
	}


	//operator/
	GLM_FUNC_QUALIFIER fquatSIMD operator/ (fquatSIMD const & q, float s)
	{
	return fquatSIMD(_mm_div_ps(q.Data, _mm_set1_ps(s)));
	}


	}//namespace detail


	GLM_FUNC_QUALIFIER quat quat_cast
	(
	detail::fquatSIMD const & x
	)
	{
	GLM_ALIGN(16) quat Result;
	_mm_store_ps(&Result[0], x.Data);

	return Result;
	}

	template <typename T>
	GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast_impl(const T m0[], const T m1[], const T m2[])
	{
	T trace = m0[0] + m1[1] + m2[2] + T(1.0);
	if (trace > T(0))
	{
	T s = static_cast<T>(0.5) / sqrt(trace);

	return _mm_set_ps(
	static_cast<float>(T(0.25) / s),
	static_cast<float>((m0[1] - m1[0]) * s),
	static_cast<float>((m2[0] - m0[2]) * s),
	static_cast<float>((m1[2] - m2[1]) * s));
	}
	else
	{
	if (m0[0] > m1[1])
	{
	if (m0[0] > m2[2])
	{
	// X is biggest.
	T s = sqrt(m0[0] - m1[1] - m2[2] + T(1.0)) * T(0.5);

	return _mm_set_ps(
	static_cast<float>((m1[2] - m2[1]) * s),
	static_cast<float>((m2[0] + m0[2]) * s),
	static_cast<float>((m0[1] + m1[0]) * s),
	static_cast<float>(T(0.5) * s));
	}
	}
	else
	{
	if (m1[1] > m2[2])
	{
	// Y is biggest.
	T s = sqrt(m1[1] - m0[0] - m2[2] + T(1.0)) * T(0.5);

	return _mm_set_ps(
	static_cast<float>((m2[0] - m0[2]) * s),
	static_cast<float>((m1[2] + m2[1]) * s),
	static_cast<float>(T(0.5) * s),
	static_cast<float>((m0[1] + m1[0]) * s));
	}
	}

	// Z is biggest.
	T s = sqrt(m2[2] - m0[0] - m1[1] + T(1.0)) * T(0.5);

	return _mm_set_ps(
	static_cast<float>((m0[1] - m1[0]) * s),
	static_cast<float>(T(0.5) * s),
	static_cast<float>((m1[2] + m2[1]) * s),
	static_cast<float>((m2[0] + m0[2]) * s));
	}
	}

	GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
	(
	detail::fmat4x4SIMD const & m
	)
	{
	// Scalar implementation for now.
	GLM_ALIGN(16) float m0[4];
	GLM_ALIGN(16) float m1[4];
	GLM_ALIGN(16) float m2[4];

	_mm_store_ps(m0, m[0].Data);
	_mm_store_ps(m1, m[1].Data);
	_mm_store_ps(m2, m[2].Data);

	return quatSIMD_cast_impl(m0, m1, m2);
	}

	template <typename T, precision P>
	GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
	(
	detail::tmat4x4<T, P> const & m
	)
	{
	return quatSIMD_cast_impl(&m[0][0], &m[1][0], &m[2][0]);
	}

	template <typename T, precision P>
	GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
	(
	detail::tmat3x3<T, P> const & m
	)
	{
	return quatSIMD_cast_impl(&m[0][0], &m[1][0], &m[2][0]);
	}


	GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast
	(
	detail::fquatSIMD const & q
	)
	{
	detail::fmat4x4SIMD result;

	__m128 _wwww = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 _xyzw = q.Data;
	__m128 _zxyw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 _yzxw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1));

	__m128 _xyzw2 = _mm_add_ps(_xyzw, _xyzw);
	__m128 _zxyw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 _yzxw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 0, 2, 1));

	__m128 _tmp0 = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_yzxw2, _yzxw));
	_tmp0 = _mm_sub_ps(_tmp0, _mm_mul_ps(_zxyw2, _zxyw));

	__m128 _tmp1 = _mm_mul_ps(_yzxw2, _xyzw);
	_tmp1 = _mm_add_ps(_tmp1, _mm_mul_ps(_zxyw2, _wwww));

	__m128 _tmp2 = _mm_mul_ps(_zxyw2, _xyzw);
	_tmp2 = _mm_sub_ps(_tmp2, _mm_mul_ps(_yzxw2, _wwww));


	// There's probably a better, more politically correct way of doing this...
	result[0].Data = _mm_set_ps(
	0.0f,
	reinterpret_cast<float*>(&_tmp2)[0],
	reinterpret_cast<float*>(&_tmp1)[0],
	reinterpret_cast<float*>(&_tmp0)[0]);

	result[1].Data = _mm_set_ps(
	0.0f,
	reinterpret_cast<float*>(&_tmp1)[1],
	reinterpret_cast<float*>(&_tmp0)[1],
	reinterpret_cast<float*>(&_tmp2)[1]);

	result[2].Data = _mm_set_ps(
	0.0f,
	reinterpret_cast<float*>(&_tmp0)[2],
	reinterpret_cast<float*>(&_tmp2)[2],
	reinterpret_cast<float*>(&_tmp1)[2]);

	result[3].Data = _mm_set_ps(
	1.0f,
	0.0f,
	0.0f,
	0.0f);


	return result;
	}

	GLM_FUNC_QUALIFIER mat4 mat4_cast
	(
	detail::fquatSIMD const & q
	)
	{
	return mat4_cast(mat4SIMD_cast(q));
	}



	GLM_FUNC_QUALIFIER float length
	(
	detail::fquatSIMD const & q
	)
	{
	return glm::sqrt(dot(q, q));
	}

	GLM_FUNC_QUALIFIER detail::fquatSIMD normalize
	(
	detail::fquatSIMD const & q
	)
	{
	return _mm_mul_ps(q.Data, _mm_set1_ps(1.0f / length(q)));
	}

	GLM_FUNC_QUALIFIER float dot
	(
	detail::fquatSIMD const & q1,
	detail::fquatSIMD const & q2
	)
	{
	float result;
	_mm_store_ss(&result, detail::sse_dot_ps(q1.Data, q2.Data));

	return result;
	}

	GLM_FUNC_QUALIFIER detail::fquatSIMD mix
	(
	detail::fquatSIMD const & x,
	detail::fquatSIMD const & y,
	float const & a
	)
	{
	float cosTheta = dot(x, y);

	if (cosTheta > 1.0f - glm::epsilon<float>())
	{
	return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
	}
	else
	{
	float angle = glm::acos(cosTheta);


	float s0 = glm::sin((1.0f - a) * angle);
	float s1 = glm::sin(a * angle);
	float d = 1.0f / glm::sin(angle);

	return (s0 * x + s1 * y) * d;
	}
	}

	GLM_FUNC_QUALIFIER detail::fquatSIMD lerp
	(
	detail::fquatSIMD const & x,
	detail::fquatSIMD const & y,
	float const & a
	)
	{
	// Lerp is only defined in [0, 1]
	assert(a >= 0.0f);
	assert(a <= 1.0f);

	return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
	}

	GLM_FUNC_QUALIFIER detail::fquatSIMD slerp
	(
	detail::fquatSIMD const & x,
	detail::fquatSIMD const & y,
	float const & a
	)
	{
	detail::fquatSIMD z = y;

	float cosTheta = dot(x, y);

	// If cosTheta < 0, the interpolation will take the long way around the sphere.
	// To fix this, one quat must be negated.
	if (cosTheta < 0.0f)
	{
	z = -y;
	cosTheta = -cosTheta;
	}

	// Perform a linear interpolation when cosTheta is close to 1 to avoid side effect of sin(angle) becoming a zero denominator
	if(cosTheta > 1.0f - epsilon<float>())
	{
	return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
	}
	else
	{
	float angle = glm::acos(cosTheta);


	float s0 = glm::sin((1.0f - a) * angle);
	float s1 = glm::sin(a * angle);
	float d = 1.0f / glm::sin(angle);

	return (s0 * x + s1 * y) * d;
	}
	}


	GLM_FUNC_QUALIFIER detail::fquatSIMD fastMix
	(
	detail::fquatSIMD const & x,
	detail::fquatSIMD const & y,
	float const & a
	)
	{
	float cosTheta = dot(x, y);

	if (cosTheta > 1.0f - glm::epsilon<float>())
	{
	return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
	}
	else
	{
	float angle = glm::fastAcos(cosTheta);


	__m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f));

	__m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2));
	__m128 d = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1)));

	return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
	}
	}

	GLM_FUNC_QUALIFIER detail::fquatSIMD fastSlerp
	(
	detail::fquatSIMD const & x,
	detail::fquatSIMD const & y,
	float const & a
	)
	{
	detail::fquatSIMD z = y;

	float cosTheta = dot(x, y);
	if (cosTheta < 0.0f)
	{
	z = -y;
	cosTheta = -cosTheta;
	}


	if(cosTheta > 1.0f - epsilon<float>())
	{
	return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
	}
	else
	{
	float angle = glm::fastAcos(cosTheta);


	__m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f));

	__m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2));
	__m128 d = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1)));

	return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
	}
	}



	GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate
	(
	detail::fquatSIMD const & q
	)
	{
	return detail::fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f)));
	}

	GLM_FUNC_QUALIFIER detail::fquatSIMD inverse
	(
	detail::fquatSIMD const & q
	)
	{
	return conjugate(q) / dot(q, q);
	}


	GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
	(
	float const & angle,
	vec3 const & v
	)
	{
	#ifdef GLM_FORCE_RADIANS
	float a(angle);
	#else
	# pragma message("GLM: rotateZ function taking degrees as parameters is deprecated. #define GLM_FORCE_RADIANS before including GLM headers to remove this message.")
	float a(glm::radians(angle));
	#endif
	float s = glm::sin(a * 0.5f);

	return _mm_set_ps(
	glm::cos(a * 0.5f),
	v.z * s,
	v.y * s,
	v.x * s);
	}

	GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
	(
	float const & angle,
	float const & x,
	float const & y,
	float const & z
	)
	{
	return angleAxisSIMD(angle, vec3(x, y, z));
	}


	GLM_FUNC_QUALIFIER __m128 fastSin(__m128 x)
	{
	static const __m128 c0 = _mm_set1_ps(0.16666666666666666666666666666667f);
	static const __m128 c1 = _mm_set1_ps(0.00833333333333333333333333333333f);
	static const __m128 c2 = _mm_set1_ps(0.00019841269841269841269841269841f);

	__m128 x3 = _mm_mul_ps(x, _mm_mul_ps(x, x));
	__m128 x5 = _mm_mul_ps(x3, _mm_mul_ps(x, x));
	__m128 x7 = _mm_mul_ps(x5, _mm_mul_ps(x, x));

	__m128 y0 = _mm_mul_ps(x3, c0);
	__m128 y1 = _mm_mul_ps(x5, c1);
	__m128 y2 = _mm_mul_ps(x7, c2);

	return _mm_sub_ps(_mm_add_ps(_mm_sub_ps(x, y0), y1), y2);
	}


	}//namespace glm