// src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp - platform/external/mesa3d - Git at Google

 /****************************************************************************
  * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  ****************************************************************************/
 #pragma once

 #if !defined(__cplusplus)
 #error C++ compilation required
 #endif

 #include <immintrin.h>
 #include <inttypes.h>
 #include <stdint.h>

 // Instruction-set levels. The values are ordered so that they can be compared
 // with >= in preprocessor conditionals (e.g. SIMD_ARCH >= SIMD_ARCH_AVX512).
 #define SIMD_ARCH_AVX 0
 #define SIMD_ARCH_AVX2 1
 #define SIMD_ARCH_AVX512 2

 // Default to the AVX level when SIMD_ARCH was not defined before this point.
 #if !defined(SIMD_ARCH)
 #define SIMD_ARCH SIMD_ARCH_AVX
 #endif

 // Compiler-specific decoration macros:
 //   SIMDCALL                 - calling convention (__vectorcall on MSVC, empty elsewhere)
 //   SIMDINLINE               - inlining keyword (__forceinline on MSVC, inline elsewhere)
 //   SIMDALIGN(type_, align_) - spells type_ with an alignment of align_ bytes; the
 //                              declarator name follows the macro invocation
 #if defined(_MSC_VER)
 #define SIMDCALL __vectorcall
 #define SIMDINLINE __forceinline
 #define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
 #else
 #define SIMDCALL
 #define SIMDINLINE inline
 #define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
 #endif

 // For documentation, please see the following include...
 // #include "simdlib_interface.hpp"

 namespace SIMDImpl
 {
     // Floating-point comparison predicates.
     // Suffix legend: O = ordered, U = unordered, S = signaling, Q = quiet
     // (non-signaling). Entries 0x10-0x1F repeat the predicates of 0x00-0x0F
     // with the signaling behavior flipped.
     // NOTE(review): the values presumably match the immediates of the AVX
     // compare intrinsics - confirm against the intrinsics reference.
     enum class CompareType
     {
         // 0x00 - 0x07
         EQ_OQ   = 0x00, // a == b, ordered, quiet
         LT_OS   = 0x01, // a <  b, ordered, signaling
         LE_OS   = 0x02, // a <= b, ordered, signaling
         UNORD_Q = 0x03, // unordered, quiet
         NEQ_UQ  = 0x04, // a != b, unordered, quiet
         NLT_US  = 0x05, // !(a <  b), unordered, signaling
         NLE_US  = 0x06, // !(a <= b), unordered, signaling
         ORD_Q   = 0x07, // ordered, quiet

         // 0x08 - 0x0F
         EQ_UQ    = 0x08, // a == b, unordered, quiet
         NGE_US   = 0x09, // !(a >= b), unordered, signaling
         NGT_US   = 0x0A, // !(a >  b), unordered, signaling
         FALSE_OQ = 0x0B, // always false, ordered, quiet
         NEQ_OQ   = 0x0C, // a != b, ordered, quiet
         GE_OS    = 0x0D, // a >= b, ordered, signaling
         GT_OS    = 0x0E, // a >  b, ordered, signaling
         TRUE_UQ  = 0x0F, // always true, unordered, quiet

         // 0x10 - 0x17
         EQ_OS   = 0x10, // a == b, ordered, signaling
         LT_OQ   = 0x11, // a <  b, ordered, quiet
         LE_OQ   = 0x12, // a <= b, ordered, quiet
         UNORD_S = 0x13, // unordered, signaling
         NEQ_US  = 0x14, // a != b, unordered, signaling
         NLT_UQ  = 0x15, // !(a <  b), unordered, quiet
         NLE_UQ  = 0x16, // !(a <= b), unordered, quiet
         ORD_S   = 0x17, // ordered, signaling

         // 0x18 - 0x1F
         EQ_US    = 0x18, // a == b, unordered, signaling
         NGE_UQ   = 0x19, // !(a >= b), unordered, quiet
         NGT_UQ   = 0x1A, // !(a >  b), unordered, quiet
         FALSE_OS = 0x1B, // always false, ordered, signaling
         NEQ_OS   = 0x1C, // a != b, ordered, signaling
         GE_OQ    = 0x1D, // a >= b, ordered, quiet
         GT_OQ    = 0x1E, // a >  b, ordered, quiet
         TRUE_US  = 0x1F, // always true, unordered, signaling
     };

 #if SIMD_ARCH >= SIMD_ARCH_AVX512
     // Integer comparison predicates. Every enumerator takes its value from the
     // matching _MM_CMPINT_* constant, so declaration order is irrelevant.
     enum class CompareTypeInt
     {
         // equality
         EQ = _MM_CMPINT_EQ, // a == b
         NE = _MM_CMPINT_NE, // a != b

         // ordering
         LT = _MM_CMPINT_LT, // a <  b
         LE = _MM_CMPINT_LE, // a <= b
         GT = _MM_CMPINT_GT, // a >  b
         GE = _MM_CMPINT_GE, // a >= b
     };
 #endif // SIMD_ARCH >= SIMD_ARCH_AVX512

     // Multiplier applied to an offset. Only the powers of two 1, 2, 4 and 8
     // are available.
     enum class ScaleFactor
     {
         SF_1 = 1 << 0, // x1: offset is used unscaled
         SF_2 = 1 << 1, // x2
         SF_4 = 1 << 2, // x4
         SF_8 = 1 << 3, // x8
     };

     // Rounding controls. The first group selects the rounding direction, the
     // second group selects exception behavior, and the remaining entries are
     // ready-made (direction | exception) combinations.
     enum class RoundMode
     {
         // --- rounding direction ---
         // NOTE(review): hardware round-to-nearest is believed to break ties
         // to even, so this is not exactly TRUNCATE(value + 0.5) - confirm
         // against the rounding-control documentation.
         TO_NEAREST_INT = 0x00, // Round to nearest integer
         TO_NEG_INF     = 0x01, // Round towards negative infinity
         TO_POS_INF     = 0x02, // Round towards positive infinity
         TO_ZERO        = 0x03, // Round towards zero (truncate)
         CUR_DIRECTION  = 0x04, // Use the direction currently set in MXCSR

         // --- exception control ---
         RAISE_EXC = 0x00, // Exceptions are raised
         NO_EXC    = 0x08, // Exceptions are suppressed

         // --- combined modes ---
         // Inside the braces of a scoped enum the enumerators already have the
         // underlying (int) type, so they can be OR-ed without casts.
         NINT        = TO_NEAREST_INT | RAISE_EXC,
         NINT_NOEXC  = TO_NEAREST_INT | NO_EXC,
         FLOOR       = TO_NEG_INF | RAISE_EXC,
         FLOOR_NOEXC = TO_NEG_INF | NO_EXC,
         CEIL        = TO_POS_INF | RAISE_EXC,
         CEIL_NOEXC  = TO_POS_INF | NO_EXC,
         TRUNC       = TO_ZERO | RAISE_EXC,
         TRUNC_NOEXC = TO_ZERO | NO_EXC,
         RINT        = CUR_DIRECTION | RAISE_EXC,
         NEARBYINT   = CUR_DIRECTION | NO_EXC,
     };

     struct Traits
     {
         using CompareType = SIMDImpl::CompareType;
         using ScaleFactor = SIMDImpl::ScaleFactor;
         using RoundMode   = SIMDImpl::RoundMode;
     };

     // Four-component attribute in SIMD SOA layout. The same storage can be
     // viewed as four Float (v), Integer (vi) or Double (vd) vectors, or as the
     // named Float components x, y, z and w.
     template <typename Float, typename Integer, typename Double>
     union Vec4
     {
         Float   v[4];
         Integer vi[4];
         Double  vd[4];
         struct
         {
             Float x;
             Float y;
             Float z;
             Float w;
         };

         // Component access through the Float view; idx is not range-checked.
         SIMDINLINE Float& SIMDCALL operator[](const int idx) { return v[idx]; }
         SIMDINLINE Float const& SIMDCALL operator[](const int idx) const { return v[idx]; }

         // Copy-assignment copies the four components through the Float view.
         SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const& rhs)
         {
             for (int comp = 0; comp < 4; ++comp)
             {
                 v[comp] = rhs.v[comp];
             }
             return *this;
         }
     };

     namespace SIMD128Impl
     {
         // Wrapper around a 16-byte aligned __m128 that converts implicitly to
         // and from the raw intrinsic type.
         union Float
         {
             SIMDALIGN(__m128, 16) v; // wrapped register value

             SIMDINLINE Float() = default;
             SIMDINLINE Float(__m128 value) : v(value) {}

             // Assign from a raw register value.
             SIMDINLINE Float& SIMDCALL operator=(__m128 value)
             {
                 this->v = value;
                 return *this;
             }

             // Copy-assign from another wrapper.
             SIMDINLINE Float& SIMDCALL operator=(Float const& other)
             {
                 this->v = other.v;
                 return *this;
             }

             // Implicit conversion back to the raw register type.
             SIMDINLINE SIMDCALL operator __m128() const { return this->v; }
         };

         // Wrapper around a 16-byte aligned __m128i that converts implicitly to
         // and from the raw intrinsic type.
         union Integer
         {
             SIMDALIGN(__m128i, 16) v; // wrapped register value

             SIMDINLINE Integer() = default;
             SIMDINLINE Integer(__m128i value) : v(value) {}

             // Assign from a raw register value.
             SIMDINLINE Integer& SIMDCALL operator=(__m128i value)
             {
                 this->v = value;
                 return *this;
             }

             // Copy-assign from another wrapper.
             SIMDINLINE Integer& SIMDCALL operator=(Integer const& other)
             {
                 this->v = other.v;
                 return *this;
             }

             // Implicit conversion back to the raw register type.
             SIMDINLINE SIMDCALL operator __m128i() const { return this->v; }
         };

         // Wrapper around a 16-byte aligned __m128d that converts implicitly to
         // and from the raw intrinsic type.
         union Double
         {
             SIMDALIGN(__m128d, 16) v; // wrapped register value

             SIMDINLINE Double() = default;
             SIMDINLINE Double(__m128d value) : v(value) {}

             // Assign from a raw register value.
             SIMDINLINE Double& SIMDCALL operator=(__m128d value)
             {
                 this->v = value;
                 return *this;
             }

             // Copy-assign from another wrapper.
             SIMDINLINE Double& SIMDCALL operator=(Double const& other)
             {
                 this->v = other.v;
                 return *this;
             }

             // Implicit conversion back to the raw register type.
             SIMDINLINE SIMDCALL operator __m128d() const { return this->v; }
         };

         // NOTE(review): presumably the number of 32-bit lanes in a 128-bit
         // register - confirm against users of SIMD_WIDTH.
         static const uint32_t SIMD_WIDTH = 4;

         // NOTE(review): presumably holds one mask bit per lane - confirm.
         using Mask = uint8_t;

         // Vec4 instantiated with this namespace's Float/Integer/Double wrappers.
         using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
     } // namespace SIMD128Impl

     namespace SIMD256Impl
     {
         // Wrapper around a 32-byte aligned __m256. The same storage is also
         // reachable as two 128-bit wrappers through v4.
         union Float
         {
             SIMDALIGN(__m256, 32) v;  // full 256-bit value
             SIMD128Impl::Float v4[2]; // same storage viewed as two 128-bit halves

             SIMDINLINE Float() = default;
             SIMDINLINE Float(__m256 value) : v(value) {}

             // Build from two 128-bit halves: in_lo becomes the low half and
             // in_hi (zero when omitted) is inserted at half index 1.
             SIMDINLINE Float(SIMD128Impl::Float const& in_lo,
                              SIMD128Impl::Float const& in_hi = _mm_setzero_ps()) :
                 v(_mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1))
             {
             }

             // Assign from a raw register value.
             SIMDINLINE Float& SIMDCALL operator=(__m256 value)
             {
                 this->v = value;
                 return *this;
             }

             // Copy-assign from another wrapper.
             SIMDINLINE Float& SIMDCALL operator=(Float const& other)
             {
                 this->v = other.v;
                 return *this;
             }

             // Implicit conversion back to the raw register type.
             SIMDINLINE SIMDCALL operator __m256() const { return this->v; }
         };

         // Wrapper around a 32-byte aligned __m256i. The same storage is also
         // reachable as two 128-bit wrappers through v4.
         union Integer
         {
             SIMDALIGN(__m256i, 32) v;   // full 256-bit value
             SIMD128Impl::Integer v4[2]; // same storage viewed as two 128-bit halves

             SIMDINLINE Integer() = default;
             SIMDINLINE Integer(__m256i value) : v(value) {}

             // Build from two 128-bit halves: in_lo becomes the low half and
             // in_hi (zero when omitted) is inserted at half index 1.
             SIMDINLINE Integer(SIMD128Impl::Integer const& in_lo,
                                SIMD128Impl::Integer const& in_hi = _mm_setzero_si128()) :
                 v(_mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1))
             {
             }

             // Assign from a raw register value.
             SIMDINLINE Integer& SIMDCALL operator=(__m256i value)
             {
                 this->v = value;
                 return *this;
             }

             // Copy-assign from another wrapper.
             SIMDINLINE Integer& SIMDCALL operator=(Integer const& other)
             {
                 this->v = other.v;
                 return *this;
             }

             // Implicit conversion back to the raw register type.
             SIMDINLINE SIMDCALL operator __m256i() const { return this->v; }
         };

         // Wrapper around a 32-byte aligned __m256d. The same storage is also
         // reachable as two 128-bit wrappers through v4.
         union Double
         {
             SIMDALIGN(__m256d, 32) v;  // full 256-bit value
             SIMD128Impl::Double v4[2]; // same storage viewed as two 128-bit halves

             SIMDINLINE Double() = default;
             SIMDINLINE Double(__m256d const& value) : v(value) {}

             // Build from two 128-bit halves: in_lo becomes the low half and
             // in_hi (zero when omitted) is inserted at half index 1.
             SIMDINLINE Double(SIMD128Impl::Double const& in_lo,
                               SIMD128Impl::Double const& in_hi = _mm_setzero_pd()) :
                 v(_mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1))
             {
             }

             // Assign from a raw register value.
             SIMDINLINE Double& SIMDCALL operator=(__m256d value)
             {
                 this->v = value;
                 return *this;
             }

             // Copy-assign from another wrapper.
             SIMDINLINE Double& SIMDCALL operator=(Double const& other)
             {
                 this->v = other.v;
                 return *this;
             }

             // Implicit conversion back to the raw register type.
             SIMDINLINE SIMDCALL operator __m256d() const { return this->v; }
         };

         // NOTE(review): presumably the number of 32-bit lanes in a 256-bit
         // register - confirm against users of SIMD_WIDTH.
         static const uint32_t SIMD_WIDTH = 8;

         // NOTE(review): presumably holds one mask bit per lane - confirm.
         using Mask = uint8_t;

         // Vec4 instantiated with this namespace's Float/Integer/Double wrappers.
         using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
     } // namespace SIMD256Impl

     namespace SIMD512Impl
     {
 #if !(defined(__AVX512F__) || defined(_MM_K0_REG))
         // Define AVX512 types if not included via immintrin.h.
         // All data members of these types are ONLY to be viewed
         // in a debugger.  Do NOT access them via code!

         // Placeholder for the 512-bit float vector type: 16 x float (64 bytes).
         union __m512
         {
         private:
             float m512_f32[16];
         };
         // Placeholder for the 512-bit double vector type: 8 x double (64 bytes).
         struct __m512d
         {
         private:
             double m512d_f64[8];
         };

         // Placeholder for the 512-bit integer vector type; every member views
         // the same 64 bytes at a different element width and signedness.
         union __m512i
         {
         private:
             int8_t   m512i_i8[64];
             int16_t  m512i_i16[32];
             int32_t  m512i_i32[16];
             int64_t  m512i_i64[8];
             uint8_t  m512i_u8[64];
             uint16_t m512i_u16[32];
             uint32_t m512i_u32[16];
             uint64_t m512i_u64[8];
         };

         // Placeholder for the 16-bit lane mask type.
         using __mmask16 = uint16_t;
 #endif

 // Alignment applied to the 512-bit members below: 64 bytes with the Intel
 // compiler or when targeting AVX512, otherwise 32 bytes. The macro is
 // #undef'd again at the end of this namespace.
 #if defined(__INTEL_COMPILER) || (SIMD_ARCH >= SIMD_ARCH_AVX512)
 #define SIMD_ALIGNMENT_BYTES 64
 #else
 #define SIMD_ALIGNMENT_BYTES 32
 #endif

         // Wrapper around a __m512 aligned to SIMD_ALIGNMENT_BYTES. The same
         // storage is also reachable as two 256-bit wrappers through v8.
         union Float
         {
             SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v; // full 512-bit value
             SIMD256Impl::Float v8[2]; // same storage viewed as two 256-bit halves

             SIMDINLINE Float() = default;
             SIMDINLINE Float(__m512 value) : v(value) {}

             // Build from two 256-bit halves; in_hi is zero when omitted.
             SIMDINLINE Float(SIMD256Impl::Float const& in_lo,
                              SIMD256Impl::Float const& in_hi = _mm256_setzero_ps())
             {
                 this->v8[0] = in_lo;
                 this->v8[1] = in_hi;
             }

             // Assign from a raw register value.
             SIMDINLINE Float& SIMDCALL operator=(__m512 value)
             {
                 this->v = value;
                 return *this;
             }

             // Copy-assign from another wrapper. Below the AVX512 level the
             // two 256-bit halves are copied instead of the 512-bit member.
             SIMDINLINE Float& SIMDCALL operator=(Float const& other)
             {
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
                 this->v = other.v;
 #else
                 this->v8[0] = other.v8[0];
                 this->v8[1] = other.v8[1];
 #endif
                 return *this;
             }

             // Implicit conversion back to the raw register type.
             SIMDINLINE SIMDCALL operator __m512() const { return this->v; }
         };

         // Wrapper around a __m512i aligned to SIMD_ALIGNMENT_BYTES. The same
         // storage is also reachable as two 256-bit wrappers through v8.
         union Integer
         {
             SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v; // full 512-bit value
             SIMD256Impl::Integer v8[2]; // same storage viewed as two 256-bit halves

             SIMDINLINE Integer() = default;
             SIMDINLINE Integer(__m512i value) : v(value) {}

             // Build from two 256-bit halves; in_hi is zero when omitted.
             SIMDINLINE Integer(SIMD256Impl::Integer const& in_lo,
                                SIMD256Impl::Integer const& in_hi = _mm256_setzero_si256())
             {
                 this->v8[0] = in_lo;
                 this->v8[1] = in_hi;
             }

             // Assign from a raw register value.
             SIMDINLINE Integer& SIMDCALL operator=(__m512i value)
             {
                 this->v = value;
                 return *this;
             }

             // Copy-assign from another wrapper. Below the AVX512 level the
             // two 256-bit halves are copied instead of the 512-bit member.
             SIMDINLINE Integer& SIMDCALL operator=(Integer const& other)
             {
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
                 this->v = other.v;
 #else
                 this->v8[0] = other.v8[0];
                 this->v8[1] = other.v8[1];
 #endif
                 return *this;
             }

             // Implicit conversion back to the raw register type.
             SIMDINLINE SIMDCALL operator __m512i() const { return this->v; }
         };

         // Wrapper around a __m512d aligned to SIMD_ALIGNMENT_BYTES. The same
         // storage is also reachable as two 256-bit wrappers through v8.
         union Double
         {
             SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v; // full 512-bit value
             SIMD256Impl::Double v8[2]; // same storage viewed as two 256-bit halves

             SIMDINLINE Double() = default;
             SIMDINLINE Double(__m512d value) : v(value) {}

             // Build from two 256-bit halves; in_hi is zero when omitted.
             SIMDINLINE Double(SIMD256Impl::Double const& in_lo,
                               SIMD256Impl::Double const& in_hi = _mm256_setzero_pd())
             {
                 this->v8[0] = in_lo;
                 this->v8[1] = in_hi;
             }

             // Assign from a raw register value.
             SIMDINLINE Double& SIMDCALL operator=(__m512d value)
             {
                 this->v = value;
                 return *this;
             }

             // Copy-assign from another wrapper. Below the AVX512 level the
             // two 256-bit halves are copied instead of the 512-bit member.
             SIMDINLINE Double& SIMDCALL operator=(Double const& other)
             {
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
                 this->v = other.v;
 #else
                 this->v8[0] = other.v8[0];
                 this->v8[1] = other.v8[1];
 #endif
                 return *this;
             }

             // Implicit conversion back to the raw register type.
             SIMDINLINE SIMDCALL operator __m512d() const { return this->v; }
         };

         // NOTE(review): presumably the number of 32-bit lanes in a 512-bit
         // register - confirm against users of SIMD_WIDTH.
         static const uint32_t SIMD_WIDTH = 16;

         // Lane mask type for this width.
         using Mask = __mmask16;

         // Vec4 instantiated with this namespace's wrappers, declared with
         // 64-byte alignment through SIMDALIGN.
         typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);

 #undef SIMD_ALIGNMENT_BYTES
     } // namespace SIMD512Impl
 } // namespace SIMDImpl
	/****************************************************************************
	* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	* IN THE SOFTWARE.
	****************************************************************************/
	#pragma once

	#if !defined(__cplusplus)
	#error C++ compilation required
	#endif

	#include <immintrin.h>
	#include <inttypes.h>
	#include <stdint.h>

	// Instruction-set levels. The values are ordered so that they can be compared
	// with >= in preprocessor conditionals (e.g. SIMD_ARCH >= SIMD_ARCH_AVX512).
	#define SIMD_ARCH_AVX 0
	#define SIMD_ARCH_AVX2 1
	#define SIMD_ARCH_AVX512 2

	// Default to the AVX level when SIMD_ARCH was not defined before this point.
	#if !defined(SIMD_ARCH)
	#define SIMD_ARCH SIMD_ARCH_AVX
	#endif

	// Compiler-specific decoration macros:
	//   SIMDCALL                 - calling convention (__vectorcall on MSVC, empty elsewhere)
	//   SIMDINLINE               - inlining keyword (__forceinline on MSVC, inline elsewhere)
	//   SIMDALIGN(type_, align_) - spells type_ with an alignment of align_ bytes; the
	//                              declarator name follows the macro invocation
	#if defined(_MSC_VER)
	#define SIMDCALL __vectorcall
	#define SIMDINLINE __forceinline
	#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
	#else
	#define SIMDCALL
	#define SIMDINLINE inline
	#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
	#endif

	// For documentation, please see the following include...
	// #include "simdlib_interface.hpp"

	namespace SIMDImpl
	{
	// Floating-point comparison predicates.
	// Suffix legend: O = ordered, U = unordered, S = signaling, Q = quiet
	// (non-signaling). Entries 0x10-0x1F repeat the predicates of 0x00-0x0F
	// with the signaling behavior flipped.
	// NOTE(review): the values presumably match the immediates of the AVX
	// compare intrinsics - confirm against the intrinsics reference.
	enum class CompareType
	{
	    // 0x00 - 0x07
	    EQ_OQ   = 0x00, // a == b, ordered, quiet
	    LT_OS   = 0x01, // a <  b, ordered, signaling
	    LE_OS   = 0x02, // a <= b, ordered, signaling
	    UNORD_Q = 0x03, // unordered, quiet
	    NEQ_UQ  = 0x04, // a != b, unordered, quiet
	    NLT_US  = 0x05, // !(a <  b), unordered, signaling
	    NLE_US  = 0x06, // !(a <= b), unordered, signaling
	    ORD_Q   = 0x07, // ordered, quiet

	    // 0x08 - 0x0F
	    EQ_UQ    = 0x08, // a == b, unordered, quiet
	    NGE_US   = 0x09, // !(a >= b), unordered, signaling
	    NGT_US   = 0x0A, // !(a >  b), unordered, signaling
	    FALSE_OQ = 0x0B, // always false, ordered, quiet
	    NEQ_OQ   = 0x0C, // a != b, ordered, quiet
	    GE_OS    = 0x0D, // a >= b, ordered, signaling
	    GT_OS    = 0x0E, // a >  b, ordered, signaling
	    TRUE_UQ  = 0x0F, // always true, unordered, quiet

	    // 0x10 - 0x17
	    EQ_OS   = 0x10, // a == b, ordered, signaling
	    LT_OQ   = 0x11, // a <  b, ordered, quiet
	    LE_OQ   = 0x12, // a <= b, ordered, quiet
	    UNORD_S = 0x13, // unordered, signaling
	    NEQ_US  = 0x14, // a != b, unordered, signaling
	    NLT_UQ  = 0x15, // !(a <  b), unordered, quiet
	    NLE_UQ  = 0x16, // !(a <= b), unordered, quiet
	    ORD_S   = 0x17, // ordered, signaling

	    // 0x18 - 0x1F
	    EQ_US    = 0x18, // a == b, unordered, signaling
	    NGE_UQ   = 0x19, // !(a >= b), unordered, quiet
	    NGT_UQ   = 0x1A, // !(a >  b), unordered, quiet
	    FALSE_OS = 0x1B, // always false, ordered, signaling
	    NEQ_OS   = 0x1C, // a != b, ordered, signaling
	    GE_OQ    = 0x1D, // a >= b, ordered, quiet
	    GT_OQ    = 0x1E, // a >  b, ordered, quiet
	    TRUE_US  = 0x1F, // always true, unordered, signaling
	};

	#if SIMD_ARCH >= SIMD_ARCH_AVX512
	// Integer comparison predicates. Every enumerator takes its value from the
	// matching _MM_CMPINT_* constant, so declaration order is irrelevant.
	enum class CompareTypeInt
	{
	    // equality
	    EQ = _MM_CMPINT_EQ, // a == b
	    NE = _MM_CMPINT_NE, // a != b

	    // ordering
	    LT = _MM_CMPINT_LT, // a <  b
	    LE = _MM_CMPINT_LE, // a <= b
	    GT = _MM_CMPINT_GT, // a >  b
	    GE = _MM_CMPINT_GE, // a >= b
	};
	#endif // SIMD_ARCH >= SIMD_ARCH_AVX512

	// Multiplier applied to an offset. Only the powers of two 1, 2, 4 and 8
	// are available.
	enum class ScaleFactor
	{
	    SF_1 = 1 << 0, // x1: offset is used unscaled
	    SF_2 = 1 << 1, // x2
	    SF_4 = 1 << 2, // x4
	    SF_8 = 1 << 3, // x8
	};

	// Rounding controls. The first group selects the rounding direction, the
	// second group selects exception behavior, and the remaining entries are
	// ready-made (direction | exception) combinations.
	//
	// Fix: the combined entries used the escaped sequence "\|", which is not a
	// C++ operator and does not compile; they now use bitwise OR ("|").
	enum class RoundMode
	{
	    // --- rounding direction ---
	    // NOTE(review): hardware round-to-nearest is believed to break ties to
	    // even, so this is not exactly TRUNCATE(value + 0.5) - confirm against
	    // the rounding-control documentation.
	    TO_NEAREST_INT = 0x00, // Round to nearest integer
	    TO_NEG_INF     = 0x01, // Round to negative infinity
	    TO_POS_INF     = 0x02, // Round to positive infinity
	    TO_ZERO        = 0x03, // Round to 0 a.k.a. truncate
	    CUR_DIRECTION  = 0x04, // Round in direction set in MXCSR register

	    // --- exception control ---
	    RAISE_EXC = 0x00, // Exceptions are raised
	    NO_EXC    = 0x08, // Suppress exceptions

	    // --- combined modes ---
	    NINT        = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
	    NINT_NOEXC  = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
	    FLOOR       = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
	    FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
	    CEIL        = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
	    CEIL_NOEXC  = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
	    TRUNC       = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
	    TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
	    RINT        = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
	    NEARBYINT   = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
	};

	struct Traits
	{
	using CompareType = SIMDImpl::CompareType;
	using ScaleFactor = SIMDImpl::ScaleFactor;
	using RoundMode = SIMDImpl::RoundMode;
	};

	// Four-component attribute in SIMD SOA layout. The same storage can be
	// viewed as four Float (v), Integer (vi) or Double (vd) vectors, or as the
	// named Float components x, y, z and w.
	template <typename Float, typename Integer, typename Double>
	union Vec4
	{
	    Float   v[4];
	    Integer vi[4];
	    Double  vd[4];
	    struct
	    {
	        Float x;
	        Float y;
	        Float z;
	        Float w;
	    };

	    // Component access through the Float view; idx is not range-checked.
	    SIMDINLINE Float& SIMDCALL operator[](const int idx) { return v[idx]; }
	    SIMDINLINE Float const& SIMDCALL operator[](const int idx) const { return v[idx]; }

	    // Copy-assignment copies the four components through the Float view.
	    SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const& rhs)
	    {
	        for (int comp = 0; comp < 4; ++comp)
	        {
	            v[comp] = rhs.v[comp];
	        }
	        return *this;
	    }
	};

	namespace SIMD128Impl
	{
	// Wrapper around a 16-byte aligned __m128 that converts implicitly to and
	// from the raw intrinsic type.
	union Float
	{
	    SIMDALIGN(__m128, 16) v; // wrapped register value

	    SIMDINLINE Float() = default;
	    SIMDINLINE Float(__m128 value) : v(value) {}

	    // Assign from a raw register value.
	    SIMDINLINE Float& SIMDCALL operator=(__m128 value)
	    {
	        this->v = value;
	        return *this;
	    }

	    // Copy-assign from another wrapper.
	    SIMDINLINE Float& SIMDCALL operator=(Float const& other)
	    {
	        this->v = other.v;
	        return *this;
	    }

	    // Implicit conversion back to the raw register type.
	    SIMDINLINE SIMDCALL operator __m128() const { return this->v; }
	};

	// Wrapper around a 16-byte aligned __m128i that converts implicitly to and
	// from the raw intrinsic type.
	union Integer
	{
	    SIMDALIGN(__m128i, 16) v; // wrapped register value

	    SIMDINLINE Integer() = default;
	    SIMDINLINE Integer(__m128i value) : v(value) {}

	    // Assign from a raw register value.
	    SIMDINLINE Integer& SIMDCALL operator=(__m128i value)
	    {
	        this->v = value;
	        return *this;
	    }

	    // Copy-assign from another wrapper.
	    SIMDINLINE Integer& SIMDCALL operator=(Integer const& other)
	    {
	        this->v = other.v;
	        return *this;
	    }

	    // Implicit conversion back to the raw register type.
	    SIMDINLINE SIMDCALL operator __m128i() const { return this->v; }
	};

	// Wrapper around a 16-byte aligned __m128d that converts implicitly to and
	// from the raw intrinsic type.
	union Double
	{
	    SIMDALIGN(__m128d, 16) v; // wrapped register value

	    SIMDINLINE Double() = default;
	    SIMDINLINE Double(__m128d value) : v(value) {}

	    // Assign from a raw register value.
	    SIMDINLINE Double& SIMDCALL operator=(__m128d value)
	    {
	        this->v = value;
	        return *this;
	    }

	    // Copy-assign from another wrapper.
	    SIMDINLINE Double& SIMDCALL operator=(Double const& other)
	    {
	        this->v = other.v;
	        return *this;
	    }

	    // Implicit conversion back to the raw register type.
	    SIMDINLINE SIMDCALL operator __m128d() const { return this->v; }
	};

	// NOTE(review): presumably the number of 32-bit lanes in a 128-bit
	// register - confirm against users of SIMD_WIDTH.
	static const uint32_t SIMD_WIDTH = 4;

	// NOTE(review): presumably holds one mask bit per lane - confirm.
	using Mask = uint8_t;

	// Vec4 instantiated with this namespace's Float/Integer/Double wrappers.
	using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
	} // namespace SIMD128Impl

	namespace SIMD256Impl
	{
	// Wrapper around a 32-byte aligned __m256. The same storage is also
	// reachable as two 128-bit wrappers through v4.
	union Float
	{
	    SIMDALIGN(__m256, 32) v;  // full 256-bit value
	    SIMD128Impl::Float v4[2]; // same storage viewed as two 128-bit halves

	    SIMDINLINE Float() = default;
	    SIMDINLINE Float(__m256 value) : v(value) {}

	    // Build from two 128-bit halves: in_lo becomes the low half and in_hi
	    // (zero when omitted) is inserted at half index 1.
	    SIMDINLINE Float(SIMD128Impl::Float const& in_lo,
	                     SIMD128Impl::Float const& in_hi = _mm_setzero_ps()) :
	        v(_mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1))
	    {
	    }

	    // Assign from a raw register value.
	    SIMDINLINE Float& SIMDCALL operator=(__m256 value)
	    {
	        this->v = value;
	        return *this;
	    }

	    // Copy-assign from another wrapper.
	    SIMDINLINE Float& SIMDCALL operator=(Float const& other)
	    {
	        this->v = other.v;
	        return *this;
	    }

	    // Implicit conversion back to the raw register type.
	    SIMDINLINE SIMDCALL operator __m256() const { return this->v; }
	};

	// Wrapper around a 32-byte aligned __m256i. The same storage is also
	// reachable as two 128-bit wrappers through v4.
	union Integer
	{
	    SIMDALIGN(__m256i, 32) v;   // full 256-bit value
	    SIMD128Impl::Integer v4[2]; // same storage viewed as two 128-bit halves

	    SIMDINLINE Integer() = default;
	    SIMDINLINE Integer(__m256i value) : v(value) {}

	    // Build from two 128-bit halves: in_lo becomes the low half and in_hi
	    // (zero when omitted) is inserted at half index 1.
	    SIMDINLINE Integer(SIMD128Impl::Integer const& in_lo,
	                       SIMD128Impl::Integer const& in_hi = _mm_setzero_si128()) :
	        v(_mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1))
	    {
	    }

	    // Assign from a raw register value.
	    SIMDINLINE Integer& SIMDCALL operator=(__m256i value)
	    {
	        this->v = value;
	        return *this;
	    }

	    // Copy-assign from another wrapper.
	    SIMDINLINE Integer& SIMDCALL operator=(Integer const& other)
	    {
	        this->v = other.v;
	        return *this;
	    }

	    // Implicit conversion back to the raw register type.
	    SIMDINLINE SIMDCALL operator __m256i() const { return this->v; }
	};

	// Wrapper around a 32-byte aligned __m256d. The same storage is also
	// reachable as two 128-bit wrappers through v4.
	union Double
	{
	    SIMDALIGN(__m256d, 32) v;  // full 256-bit value
	    SIMD128Impl::Double v4[2]; // same storage viewed as two 128-bit halves

	    SIMDINLINE Double() = default;
	    SIMDINLINE Double(__m256d const& value) : v(value) {}

	    // Build from two 128-bit halves: in_lo becomes the low half and in_hi
	    // (zero when omitted) is inserted at half index 1.
	    SIMDINLINE Double(SIMD128Impl::Double const& in_lo,
	                      SIMD128Impl::Double const& in_hi = _mm_setzero_pd()) :
	        v(_mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1))
	    {
	    }

	    // Assign from a raw register value.
	    SIMDINLINE Double& SIMDCALL operator=(__m256d value)
	    {
	        this->v = value;
	        return *this;
	    }

	    // Copy-assign from another wrapper.
	    SIMDINLINE Double& SIMDCALL operator=(Double const& other)
	    {
	        this->v = other.v;
	        return *this;
	    }

	    // Implicit conversion back to the raw register type.
	    SIMDINLINE SIMDCALL operator __m256d() const { return this->v; }
	};

	// NOTE(review): presumably the number of 32-bit lanes in a 256-bit
	// register - confirm against users of SIMD_WIDTH.
	static const uint32_t SIMD_WIDTH = 8;

	// NOTE(review): presumably holds one mask bit per lane - confirm.
	using Mask = uint8_t;

	// Vec4 instantiated with this namespace's Float/Integer/Double wrappers.
	using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
	} // namespace SIMD256Impl

	namespace SIMD512Impl
	{
	#if !(defined(__AVX512F__) || defined(_MM_K0_REG))
	// Define AVX512 types if not included via immintrin.h.
	// All data members of these types are ONLY to be viewed
	// in a debugger. Do NOT access them via code!
	//
	// Fix: the guard above used the escaped sequence "\|\|", which is not a
	// valid preprocessor operator; it now uses logical OR ("||").

	// Placeholder for the 512-bit float vector type: 16 x float (64 bytes).
	union __m512
	{
	private:
	    float m512_f32[16];
	};

	// Placeholder for the 512-bit double vector type: 8 x double (64 bytes).
	struct __m512d
	{
	private:
	    double m512d_f64[8];
	};

	// Placeholder for the 512-bit integer vector type; every member views the
	// same 64 bytes at a different element width and signedness.
	union __m512i
	{
	private:
	    int8_t   m512i_i8[64];
	    int16_t  m512i_i16[32];
	    int32_t  m512i_i32[16];
	    int64_t  m512i_i64[8];
	    uint8_t  m512i_u8[64];
	    uint16_t m512i_u16[32];
	    uint32_t m512i_u32[16];
	    uint64_t m512i_u64[8];
	};

	// Placeholder for the 16-bit lane mask type.
	using __mmask16 = uint16_t;
	#endif

	// Alignment applied to the 512-bit members below: 64 bytes with the Intel
	// compiler or when targeting AVX512, otherwise 32 bytes.
	// Fix: the condition used the escaped sequence "\|\|", which is not a valid
	// preprocessor operator; it now uses logical OR ("||").
	#if defined(__INTEL_COMPILER) || (SIMD_ARCH >= SIMD_ARCH_AVX512)
	#define SIMD_ALIGNMENT_BYTES 64
	#else
	#define SIMD_ALIGNMENT_BYTES 32
	#endif

	// Wrapper around a __m512 aligned to SIMD_ALIGNMENT_BYTES. The same
	// storage is also reachable as two 256-bit wrappers through v8.
	union Float
	{
	    SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v; // full 512-bit value
	    SIMD256Impl::Float v8[2]; // same storage viewed as two 256-bit halves

	    SIMDINLINE Float() = default;
	    SIMDINLINE Float(__m512 value) : v(value) {}

	    // Build from two 256-bit halves; in_hi is zero when omitted.
	    SIMDINLINE Float(SIMD256Impl::Float const& in_lo,
	                     SIMD256Impl::Float const& in_hi = _mm256_setzero_ps())
	    {
	        this->v8[0] = in_lo;
	        this->v8[1] = in_hi;
	    }

	    // Assign from a raw register value.
	    SIMDINLINE Float& SIMDCALL operator=(__m512 value)
	    {
	        this->v = value;
	        return *this;
	    }

	    // Copy-assign from another wrapper. Below the AVX512 level the two
	    // 256-bit halves are copied instead of the 512-bit member.
	    SIMDINLINE Float& SIMDCALL operator=(Float const& other)
	    {
	#if SIMD_ARCH >= SIMD_ARCH_AVX512
	        this->v = other.v;
	#else
	        this->v8[0] = other.v8[0];
	        this->v8[1] = other.v8[1];
	#endif
	        return *this;
	    }

	    // Implicit conversion back to the raw register type.
	    SIMDINLINE SIMDCALL operator __m512() const { return this->v; }
	};

	// Wrapper around a __m512i aligned to SIMD_ALIGNMENT_BYTES. The same
	// storage is also reachable as two 256-bit wrappers through v8.
	union Integer
	{
	    SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v; // full 512-bit value
	    SIMD256Impl::Integer v8[2]; // same storage viewed as two 256-bit halves

	    SIMDINLINE Integer() = default;
	    SIMDINLINE Integer(__m512i value) : v(value) {}

	    // Build from two 256-bit halves; in_hi is zero when omitted.
	    SIMDINLINE Integer(SIMD256Impl::Integer const& in_lo,
	                       SIMD256Impl::Integer const& in_hi = _mm256_setzero_si256())
	    {
	        this->v8[0] = in_lo;
	        this->v8[1] = in_hi;
	    }

	    // Assign from a raw register value.
	    SIMDINLINE Integer& SIMDCALL operator=(__m512i value)
	    {
	        this->v = value;
	        return *this;
	    }

	    // Copy-assign from another wrapper. Below the AVX512 level the two
	    // 256-bit halves are copied instead of the 512-bit member.
	    SIMDINLINE Integer& SIMDCALL operator=(Integer const& other)
	    {
	#if SIMD_ARCH >= SIMD_ARCH_AVX512
	        this->v = other.v;
	#else
	        this->v8[0] = other.v8[0];
	        this->v8[1] = other.v8[1];
	#endif
	        return *this;
	    }

	    // Implicit conversion back to the raw register type.
	    SIMDINLINE SIMDCALL operator __m512i() const { return this->v; }
	};

	// Wrapper around a __m512d aligned to SIMD_ALIGNMENT_BYTES. The same
	// storage is also reachable as two 256-bit wrappers through v8.
	union Double
	{
	    SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v; // full 512-bit value
	    SIMD256Impl::Double v8[2]; // same storage viewed as two 256-bit halves

	    SIMDINLINE Double() = default;
	    SIMDINLINE Double(__m512d value) : v(value) {}

	    // Build from two 256-bit halves; in_hi is zero when omitted.
	    SIMDINLINE Double(SIMD256Impl::Double const& in_lo,
	                      SIMD256Impl::Double const& in_hi = _mm256_setzero_pd())
	    {
	        this->v8[0] = in_lo;
	        this->v8[1] = in_hi;
	    }

	    // Assign from a raw register value.
	    SIMDINLINE Double& SIMDCALL operator=(__m512d value)
	    {
	        this->v = value;
	        return *this;
	    }

	    // Copy-assign from another wrapper. Below the AVX512 level the two
	    // 256-bit halves are copied instead of the 512-bit member.
	    SIMDINLINE Double& SIMDCALL operator=(Double const& other)
	    {
	#if SIMD_ARCH >= SIMD_ARCH_AVX512
	        this->v = other.v;
	#else
	        this->v8[0] = other.v8[0];
	        this->v8[1] = other.v8[1];
	#endif
	        return *this;
	    }

	    // Implicit conversion back to the raw register type.
	    SIMDINLINE SIMDCALL operator __m512d() const { return this->v; }
	};

	// NOTE(review): presumably the number of 32-bit lanes in a 512-bit
	// register - confirm against users of SIMD_WIDTH.
	static const uint32_t SIMD_WIDTH = 16;

	// Lane mask type for this width.
	using Mask = __mmask16;

	// Vec4 instantiated with this namespace's wrappers, declared with 64-byte
	// alignment through SIMDALIGN.
	typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);

	#undef SIMD_ALIGNMENT_BYTES
	} // namespace SIMD512Impl
	} // namespace SIMDImpl