blob: 944c3c23fd3b913e86f208f8b2821ef1b396ee1b [file] [log] [blame]
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if !defined(__cplusplus)
#error C++ compilation required
#endif
#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>
#define SIMD_ARCH_AVX 0
#define SIMD_ARCH_AVX2 1
#define SIMD_ARCH_AVX512 2
#if !defined(SIMD_ARCH)
#define SIMD_ARCH SIMD_ARCH_AVX
#endif
#if defined(_MSC_VER)
#define SIMDCALL __vectorcall
#define SIMDINLINE __forceinline
#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
#else
#define SIMDCALL
#define SIMDINLINE inline
#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
#endif
// For documentation, please see the following include...
// #include "simdlib_interface.hpp"
namespace SIMDImpl
{
enum class CompareType
{
EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
LT_OS = 0x01, // Less-than (ordered, signaling)
LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
UNORD_Q = 0x03, // Unordered (nonsignaling)
NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
NLT_US = 0x05, // Not-less-than (unordered, signaling)
NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
ORD_Q = 0x07, // Ordered (nonsignaling)
EQ_UQ = 0x08, // Equal (unordered, non-signaling)
NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
GT_OS = 0x0E, // Greater-than (ordered, signaling)
TRUE_UQ = 0x0F, // True (unordered, non-signaling)
EQ_OS = 0x10, // Equal (ordered, signaling)
LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
UNORD_S = 0x13, // Unordered (signaling)
NEQ_US = 0x14, // Not-equal (unordered, signaling)
NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
ORD_S = 0x17, // Ordered (signaling)
EQ_US = 0x18, // Equal (unordered, signaling)
NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
FALSE_OS = 0x1B, // False (ordered, signaling)
NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
TRUE_US = 0x1F, // True (unordered, signaling)
};
#if SIMD_ARCH >= SIMD_ARCH_AVX512
enum class CompareTypeInt
{
EQ = _MM_CMPINT_EQ, // Equal
LT = _MM_CMPINT_LT, // Less than
LE = _MM_CMPINT_LE, // Less than or Equal
NE = _MM_CMPINT_NE, // Not Equal
GE = _MM_CMPINT_GE, // Greater than or Equal
GT = _MM_CMPINT_GT, // Greater than
};
#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
enum class ScaleFactor
{
SF_1 = 1, // No scaling
SF_2 = 2, // Scale offset by 2
SF_4 = 4, // Scale offset by 4
SF_8 = 8, // Scale offset by 8
};
enum class RoundMode
{
TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5)
TO_NEG_INF = 0x01, // Round to negative infinity
TO_POS_INF = 0x02, // Round to positive infinity
TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
RAISE_EXC = 0x00, // Raise exception on overflow
NO_EXC = 0x08, // Suppress exceptions
NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
};
struct Traits
{
using CompareType = SIMDImpl::CompareType;
using ScaleFactor = SIMDImpl::ScaleFactor;
using RoundMode = SIMDImpl::RoundMode;
};
// Attribute, 4-dimensional attribute in SIMD SOA layout
template <typename Float, typename Integer, typename Double>
union Vec4
{
Float v[4];
Integer vi[4];
Double vd[4];
struct
{
Float x;
Float y;
Float z;
Float w;
};
SIMDINLINE Float& SIMDCALL operator[](const int i) { return v[i]; }
SIMDINLINE Float const& SIMDCALL operator[](const int i) const { return v[i]; }
SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const& in)
{
v[0] = in.v[0];
v[1] = in.v[1];
v[2] = in.v[2];
v[3] = in.v[3];
return *this;
}
};
namespace SIMD128Impl
{
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m128 in) : v(in) {}
SIMDINLINE Float& SIMDCALL operator=(__m128 in)
{
v = in;
return *this;
}
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m128() const { return v; }
SIMDALIGN(__m128, 16) v;
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m128i in) : v(in) {}
SIMDINLINE Integer& SIMDCALL operator=(__m128i in)
{
v = in;
return *this;
}
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m128i() const { return v; }
SIMDALIGN(__m128i, 16) v;
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m128d in) : v(in) {}
SIMDINLINE Double& SIMDCALL operator=(__m128d in)
{
v = in;
return *this;
}
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m128d() const { return v; }
SIMDALIGN(__m128d, 16) v;
};
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
using Mask = uint8_t;
static const uint32_t SIMD_WIDTH = 4;
} // namespace SIMD128Impl
namespace SIMD256Impl
{
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m256 in) : v(in) {}
SIMDINLINE Float(SIMD128Impl::Float const& in_lo,
SIMD128Impl::Float const& in_hi = _mm_setzero_ps())
{
v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
}
SIMDINLINE Float& SIMDCALL operator=(__m256 in)
{
v = in;
return *this;
}
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m256() const { return v; }
SIMDALIGN(__m256, 32) v;
SIMD128Impl::Float v4[2];
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m256i in) : v(in) {}
SIMDINLINE Integer(SIMD128Impl::Integer const& in_lo,
SIMD128Impl::Integer const& in_hi = _mm_setzero_si128())
{
v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
}
SIMDINLINE Integer& SIMDCALL operator=(__m256i in)
{
v = in;
return *this;
}
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m256i() const { return v; }
SIMDALIGN(__m256i, 32) v;
SIMD128Impl::Integer v4[2];
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m256d const& in) : v(in) {}
SIMDINLINE Double(SIMD128Impl::Double const& in_lo,
SIMD128Impl::Double const& in_hi = _mm_setzero_pd())
{
v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
}
SIMDINLINE Double& SIMDCALL operator=(__m256d in)
{
v = in;
return *this;
}
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m256d() const { return v; }
SIMDALIGN(__m256d, 32) v;
SIMD128Impl::Double v4[2];
};
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
using Mask = uint8_t;
static const uint32_t SIMD_WIDTH = 8;
} // namespace SIMD256Impl
namespace SIMD512Impl
{
#if !(defined(__AVX512F__) || defined(_MM_K0_REG))
// Define AVX512 types if not included via immintrin.h.
// All data members of these types are ONLY to viewed
// in a debugger. Do NOT access them via code!
union __m512
{
private:
float m512_f32[16];
};
struct __m512d
{
private:
double m512d_f64[8];
};
union __m512i
{
private:
int8_t m512i_i8[64];
int16_t m512i_i16[32];
int32_t m512i_i32[16];
int64_t m512i_i64[8];
uint8_t m512i_u8[64];
uint16_t m512i_u16[32];
uint32_t m512i_u32[16];
uint64_t m512i_u64[8];
};
using __mmask16 = uint16_t;
#endif
#if defined(__INTEL_COMPILER) || (SIMD_ARCH >= SIMD_ARCH_AVX512)
#define SIMD_ALIGNMENT_BYTES 64
#else
#define SIMD_ALIGNMENT_BYTES 32
#endif
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m512 in) : v(in) {}
SIMDINLINE Float(SIMD256Impl::Float const& in_lo,
SIMD256Impl::Float const& in_hi = _mm256_setzero_ps())
{
v8[0] = in_lo;
v8[1] = in_hi;
}
SIMDINLINE Float& SIMDCALL operator=(__m512 in)
{
v = in;
return *this;
}
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE SIMDCALL operator __m512() const { return v; }
SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Float v8[2];
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m512i in) : v(in) {}
SIMDINLINE Integer(SIMD256Impl::Integer const& in_lo,
SIMD256Impl::Integer const& in_hi = _mm256_setzero_si256())
{
v8[0] = in_lo;
v8[1] = in_hi;
}
SIMDINLINE Integer& SIMDCALL operator=(__m512i in)
{
v = in;
return *this;
}
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE SIMDCALL operator __m512i() const { return v; }
SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Integer v8[2];
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m512d in) : v(in) {}
SIMDINLINE Double(SIMD256Impl::Double const& in_lo,
SIMD256Impl::Double const& in_hi = _mm256_setzero_pd())
{
v8[0] = in_lo;
v8[1] = in_hi;
}
SIMDINLINE Double& SIMDCALL operator=(__m512d in)
{
v = in;
return *this;
}
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE SIMDCALL operator __m512d() const { return v; }
SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Double v8[2];
};
typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
using Mask = __mmask16;
static const uint32_t SIMD_WIDTH = 16;
#undef SIMD_ALIGNMENT_BYTES
} // namespace SIMD512Impl
} // namespace SIMDImpl