blob: c7a8c348071e60d30c346727acca6578c0d3ddf4 [file] [log] [blame]
//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina@intel.com
//*** Copyright (C) 2012-2018 Intel Corporation. All rights reserved.
//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//By downloading, copying, installing or using the software you agree to this license.
//If you do not agree to this license, do not download, install, copy or use the software.
// License Agreement
//Redistribution and use in source and binary forms, with or without modification,
//are permitted provided that the following conditions are met:
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//This software is provided by the copyright holders and contributors "as is" and
//any express or implied warranties, including, but not limited to, the implied
//warranties of merchantability and fitness for a particular purpose are disclaimed.
//In no event shall the Intel Corporation or contributors be liable for any direct,
//indirect, incidental, special, exemplary, or consequential damages
//(including, but not limited to, procurement of substitute goods or services;
//loss of use, data, or profits; or business interruption) however caused
//and on any theory of liability, whether in contract, strict liability,
//or tort (including negligence or otherwise) arising in any way out of
//the use of this software, even if advised of the possibility of such damage.
//*****************************************************************************************
// This file is intended to simplify ARM->IA32 porting
// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
// and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below
//MMX instruction set is not used due to non availability on x64 systems,
//performance overhead and the necessity to use the EMMS instruction (_mm_empty())for mmx-x87 floating point switching
//*****************************************************************************************
//!!!!!!!!!!!!!! To use this file just include it in your project that uses ARM NEON intinsics instead of "arm_neon.h" and compile it as usual
//!!!!!!!!!!!!!! but please pay attention at #define USE_SSE4 below - you might need to define it manualy for newest Intel Atom or any Intel Core platforms for greater performance.
#ifndef NEON2SSE_H
#define NEON2SSE_H
/*********************************************************************************************************************/
//!!!!!!!!!!!!!!
//if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions, when undefined - SIMD up to SSSE3 are used
//For older devices without SSE4 support it should be undefined, for newer devices - defined, probably manualy if your compiler doesn't set __SSE4_2__ predefine
#ifndef USE_SSE4
# if defined(__SSE4_2__)
# define USE_SSE4
# endif
#endif
/*********************************************************************************************************************/
#include <xmmintrin.h> //SSE
#include <emmintrin.h> //SSE2
#include <pmmintrin.h> //SSE3
#include <tmmintrin.h> //SSSE3
#ifdef USE_SSE4
# include <smmintrin.h> //SSE4.1
# include <nmmintrin.h> //SSE4.2
#endif
#include <math.h>
//*************** functions and data attributes, compiler dependent *********************************
//***********************************************************************************
#ifdef __GNUC__
# define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
# define _NEON2SSESTORAGE static
# define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
# define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
# ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
# if _GCC_VERSION < 40500
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
# else
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
# endif
# else
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
# endif
# if defined(__x86_64__)
# define _NEON2SSE_64BIT __x86_64__
# endif
#else
# define _NEON2SSESTORAGE static
# define _NEON2SSE_ALIGN_16 __declspec(align(16))
# define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
# if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
# define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
# if defined(_M_X64)
# define _NEON2SSE_64BIT _M_X64
# endif
# else
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
# endif
#endif
#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
# define _NEON2SSE_64BIT_SSE4
#endif
/*********************************************************************************************************************/
// data types conversion
/*********************************************************************************************************************/
#if defined(_MSC_VER) && (_MSC_VER < 1300)
typedef signed char int8_t;
typedef unsigned char uint8_t;
typedef signed short int16_t;
typedef unsigned short uint16_t;
typedef signed int int32_t;
typedef unsigned int uint32_t;
typedef signed long long int64_t;
typedef unsigned long long uint64_t;
#elif defined(_MSC_VER)
typedef signed __int8 int8_t;
typedef unsigned __int8 uint8_t;
typedef signed __int16 int16_t;
typedef unsigned __int16 uint16_t;
typedef signed __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef signed long long int64_t;
typedef unsigned long long uint64_t;
#else
# include <stdint.h>
# include <limits.h>
#endif
typedef union __m64_128 {
uint64_t m64_u64[1];
float m64_f32[2];
int8_t m64_i8[8];
int16_t m64_i16[4];
int32_t m64_i32[2];
int64_t m64_i64[1];
uint8_t m64_u8[8];
uint16_t m64_u16[4];
uint32_t m64_u32[2];
} __m64_128;
typedef __m64_128 int8x8_t;
typedef __m64_128 uint8x8_t;
typedef __m64_128 int16x4_t;
typedef __m64_128 uint16x4_t;
typedef __m64_128 int32x2_t;
typedef __m64_128 uint32x2_t;
typedef __m64_128 int64x1_t;
typedef __m64_128 uint64x1_t;
typedef __m64_128 poly8x8_t;
typedef __m64_128 poly16x4_t;
typedef __m64_128 float32x2_t;
typedef __m128 float32x4_t;
typedef __m128 float16x4_t; //not supported by IA, for compartibility
typedef __m128 float16x8_t; //not supported by IA, for compartibility
typedef __m64_128 float64x1_t;
typedef __m128d float64x2_t;
typedef __m128i int8x16_t;
typedef __m128i int16x8_t;
typedef __m128i int32x4_t;
typedef __m128i int64x2_t;
typedef __m128i uint8x16_t;
typedef __m128i uint16x8_t;
typedef __m128i uint32x4_t;
typedef __m128i uint64x2_t;
typedef __m128i poly8x16_t;
typedef __m128i poly16x8_t;
#if defined(_MSC_VER)
# define SINT_MIN (-2147483647 - 1) /* min signed int value */
# define SINT_MAX 2147483647 /* max signed int value */
#else
# define SINT_MIN INT_MIN /* min signed int value */
# define SINT_MAX INT_MAX /* max signed int value */
#endif
typedef float float32_t;
#if !defined(__clang__)
typedef float __fp16;
#endif
typedef double float64_t;
typedef uint8_t poly8_t;
typedef uint16_t poly16_t;
//MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type as functions arguments resulting in
//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need the special trick for functions that use these types
struct int8x16x2_t {
int8x16_t val[2];
};
struct int16x8x2_t {
int16x8_t val[2];
};
struct int32x4x2_t {
int32x4_t val[2];
};
struct int64x2x2_t {
int64x2_t val[2];
};
//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!!
struct int8x8x2_t {
int8x8_t val[2];
};
struct int16x4x2_t {
int16x4_t val[2];
};
struct int32x2x2_t {
int32x2_t val[2];
};
struct int64x1x2_t {
int64x1_t val[2];
};
typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
typedef struct int8x16x2_t uint8x16x2_t;
typedef struct int16x8x2_t uint16x8x2_t;
typedef struct int32x4x2_t uint32x4x2_t;
typedef struct int64x2x2_t uint64x2x2_t;
typedef struct int8x16x2_t poly8x16x2_t;
typedef struct int16x8x2_t poly16x8x2_t;
typedef struct int8x8x2_t uint8x8x2_t;
typedef struct int16x4x2_t uint16x4x2_t;
typedef struct int32x2x2_t uint32x2x2_t;
typedef struct int64x1x2_t uint64x1x2_t;
typedef struct int8x8x2_t poly8x8x2_t;
typedef struct int16x4x2_t poly16x4x2_t;
//float
struct float32x4x2_t {
float32x4_t val[2];
};
struct float16x8x2_t {
float16x8_t val[2];
};
struct float32x2x2_t {
float32x2_t val[2];
};
typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
typedef float16x8x2_t float16x4x2_t;
//4
struct int8x16x4_t {
int8x16_t val[4];
};
struct int16x8x4_t {
int16x8_t val[4];
};
struct int32x4x4_t {
int32x4_t val[4];
};
struct int64x2x4_t {
int64x2_t val[4];
};
struct int8x8x4_t {
int8x8_t val[4];
};
struct int16x4x4_t {
int16x4_t val[4];
};
struct int32x2x4_t {
int32x2_t val[4];
};
struct int64x1x4_t {
int64x1_t val[4];
};
typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
typedef struct int8x8x4_t uint8x8x4_t;
typedef struct int16x4x4_t uint16x4x4_t;
typedef struct int32x2x4_t uint32x2x4_t;
typedef struct int64x1x4_t uint64x1x4_t;
typedef struct int8x8x4_t poly8x8x4_t;
typedef struct int16x4x4_t poly16x4x4_t;
typedef struct int8x16x4_t uint8x16x4_t;
typedef struct int16x8x4_t uint16x8x4_t;
typedef struct int32x4x4_t uint32x4x4_t;
typedef struct int64x2x4_t uint64x2x4_t;
typedef struct int8x16x4_t poly8x16x4_t;
typedef struct int16x8x4_t poly16x8x4_t;
struct float32x4x4_t {
float32x4_t val[4];
};
struct float16x8x4_t {
float16x8_t val[4];
};
struct float32x2x4_t {
float32x2_t val[4];
};
typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
typedef float16x8x4_t float16x4x4_t;
//3
struct int16x8x3_t {
int16x8_t val[3];
};
struct int32x4x3_t {
int32x4_t val[3];
};
struct int64x2x3_t {
int64x2_t val[3];
};
struct int8x16x3_t {
int8x16_t val[3];
};
struct int16x4x3_t {
int16x4_t val[3];
};
struct int32x2x3_t {
int32x2_t val[3];
};
struct int64x1x3_t {
int64x1_t val[3];
};
struct int8x8x3_t {
int8x8_t val[3];
};
typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
typedef struct int8x16x3_t uint8x16x3_t;
typedef struct int16x8x3_t uint16x8x3_t;
typedef struct int32x4x3_t uint32x4x3_t;
typedef struct int64x2x3_t uint64x2x3_t;
typedef struct int8x16x3_t poly8x16x3_t;
typedef struct int16x8x3_t poly16x8x3_t;
typedef struct int8x8x3_t uint8x8x3_t;
typedef struct int16x4x3_t uint16x4x3_t;
typedef struct int32x2x3_t uint32x2x3_t;
typedef struct int64x1x3_t uint64x1x3_t;
typedef struct int8x8x3_t poly8x8x3_t;
typedef struct int16x4x3_t poly16x4x3_t;
//float
struct float32x4x3_t {
float32x4_t val[3];
};
struct float32x2x3_t {
float32x2_t val[3];
};
struct float16x8x3_t {
float16x8_t val[3];
};
typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
typedef float16x8x3_t float16x4x3_t;
//****************************************************************************
//****** Porting auxiliary macros ********************************************
//** floating point related macros **
#define _M128i(a) _mm_castps_si128(a)
#define _M128(a) _mm_castsi128_ps(a)
//here the most performance effective implementation is compiler and 32/64 bits build dependent
#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) )
# define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
# define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
# define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
#else
//for 32bit gcc and Microsoft compilers builds
# define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
# define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp)
# define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
#endif
#define _pM128(a) _mm_castsi128_ps(_pM128i(a))
#define return64(a) _M64(res64,a); return res64;
#define return64f(a) _M64f(res64,a); return res64;
#define _Ui64(a) (*(uint64_t*)&(a))
#define _UNSIGNED_T(a) u ## a
#define _SIGNBIT64 ((uint64_t)1 << 63)
#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6))
#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define __constrange(min,max) const
#define __transfersize(size)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
_NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
_NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
//*************************************************************************
//*************************************************************************
//********* Functions declarations as declared in original arm_neon.h *****
//*************************************************************************
//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
//Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i]
_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
//Vector rounding add high half: vraddhn
_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
//Multiplication
//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
//multiply lane
_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
//Vector multiply subtract long
_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
//Vector saturating doubling multiply high
_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
//Vector saturating rounding doubling multiply high
_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
//Vector saturating doubling multiply accumulate long
_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
//Vector saturating doubling multiply subtract long
_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
//Vector long multiply
_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
//Vector saturating doubling long multiply
_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
//Subtraction
//Vector subtract
_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
//Vector long subtract: vsub -> Vr[i]:=Va[i]+Vb[i]
_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
//Vector wide subtract: vsub -> Vr[i]:=Va[i]+Vb[i]
_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
//Vector saturating subtract
_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
//Vector halving subtract
_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
//Vector subtract high half
_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
//Vector rounding subtract high half
_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
//Comparison
//Vector compare equal
_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
//Vector compare greater-than or equal
_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare less-than or equal
_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare greater-than
_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare less-than
_NEON2SSESTORAGE uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare absolute greater-than or equal
_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute less-than or equal
_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute greater-than
_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
//Vector compare absolute less-than
_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
//Vector test bits
_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
//Absolute difference
//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
//Absolute difference - long
_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
//Absolute difference and accumulate - long
_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
//Max/Min
//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
_NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
_NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
_NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
_NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
//Pairwise addition
//Pairwise add
_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
//Long pairwise add
_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
//Long pairwise add and accumulate
_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
//Folding maximum vpmax -> takes maximum of adjacent pairs
_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
//Folding minimum vpmin -> takes minimum of adjacent pairs
_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
//Reciprocal/Sqrt
_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
//Shifts by signed variable
//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
//Vector saturating shift left: (negative values shift right)
_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
//Vector rounding shift left: (negative values shift right)
_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
//Vector saturating rounding shift left: (negative values shift right)
_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
//Shifts by a constant
//Vector shift right by constant
_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
_NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
_NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
_NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
_NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
_NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
//Vector shift left by constant
_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
_NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
_NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
_NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
_NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
_NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
_NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
_NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
_NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
_NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
_NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
//Vector rounding shift right by constant
_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
//Vector shift right by constant and accumulate
_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
//Vector rounding shift right by constant and accumulate
_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
//Vector saturating shift left by constant
_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
//Vector signed->unsigned saturating shift left by constant
_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
//Vector narrowing shift right by constant
_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
//Vector signed->unsigned narrowing saturating shift right by constant
_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
//Vector signed->unsigned rounding narrowing saturating shift right by constant
_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
//Vector narrowing saturating shift right by constant
_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
//Vector rounding narrowing shift right by constant
_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
//Vector rounding narrowing saturating shift right by constant
_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
//Vector widening shift left by constant
_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
//Shifts with insert
//Vector shift right and insert
_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
_NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
_NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
_NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
_NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
_NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
_NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
_NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
_NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
_NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
_NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
_NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
_NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
//Vector shift left and insert
_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
_NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
_NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
_NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
_NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
_NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
_NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
_NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
_NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
_NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
_NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
_NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
_NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
//Load a single vector from memory
_NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
_NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
_NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
_NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
_NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
_NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
_NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
_NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
_NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
//Load a single lane from memory
_NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
_NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE float16x4_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
//Load all lanes of vector with same value from memory
_NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
//Store a single vector or lane. Stores all lanes or a single lane of a vector.
//Store a single vector into memory
_NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
_NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
_NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
_NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
_NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
_NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
_NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
_NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
//Store a lane of a vector into memory
//Loads of an N-element structure
//Load N-element structure from memory
_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
_NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
_NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
_NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
//Load all lanes of N-element structure with same value from memory
_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
//Load a single lane of N-element structure from memory
//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
_NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
//Store N-element structure to memory
_NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0]
//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
//Store a single lane of N-element structure to memory
_NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
_NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
_NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
_NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
_NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
_NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
_NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
_NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
_NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
_NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
_NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
_NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
_NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
_NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
//Initialize a vector from a literal bit pattern.
_NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
//Set all lanes to same value
//Load all lanes of vector to the same literal value
_NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
_NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
_NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
_NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
_NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
_NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
_NEON2SSESTORAGE uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
_NEON2SSESTORAGE uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
_NEON2SSESTORAGE uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
_NEON2SSESTORAGE int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
_NEON2SSESTORAGE int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
_NEON2SSESTORAGE int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
_NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
_NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
_NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
_NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
_NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
_NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
_NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
_NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
_NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
_NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
_NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
_NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
_NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
_NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
_NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
_NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
_NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
_NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
_NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
//Load all lanes of the vector to the value of a lane of a vector
_NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
_NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
_NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
_NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
_NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
_NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
_NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
_NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
_NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
_NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
_NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
_NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
_NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
_NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
_NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
_NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
_NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
_NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
_NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector.
_NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
_NEON2SSESTORAGE int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
_NEON2SSESTORAGE int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
_NEON2SSESTORAGE int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
_NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
_NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
_NEON2SSESTORAGE uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
_NEON2SSESTORAGE uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
_NEON2SSESTORAGE uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
_NEON2SSESTORAGE uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
_NEON2SSESTORAGE poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
_NEON2SSESTORAGE poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
//Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors
_NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
_NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
_NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
_NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
_NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
_NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
_NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
_NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
_NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
_NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
_NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
_NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
_NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
_NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
_NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
_NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
_NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
//Converting vectors. These intrinsics are used to convert vectors.
//Convert from float
_NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
_NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
_NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
_NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
_NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
_NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
_NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
_NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
_NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
//Convert to float
_NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
_NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
_NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
_NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
_NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
_NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
//Convert between floats
_NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
_NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
//Vector narrow integer
_NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
_NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
_NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
_NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
_NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
_NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
//Vector long move
_NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
_NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
_NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
_NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
_NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
_NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
//Vector saturating narrow integer
_NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
_NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
_NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
_NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
_NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
_NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
//Vector saturating narrow integer signed->unsigned
_NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
_NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
_NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
//Table look up
_NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
_NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
_NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
//Extended table look up intrinsics
_NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
_NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
_NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
_NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
_NEON2SSESTORAGE int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
_NEON2SSESTORAGE poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
_NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
_NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
_NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
_NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
_NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
_NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
//Operations with a scalar value
//Vector multiply accumulate with scalar
_NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
_NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
_NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
_NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
_NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
_NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
_NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
//Vector widening multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
//Vector widening saturating doubling multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
//Vector multiply subtract with scalar
_NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
_NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
_NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
_NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
_NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
_NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
_NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
//Vector widening multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
//Vector widening saturating doubling multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
//Vector multiply by scalar
_NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
_NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
_NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
_NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
_NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
_NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
//Vector long multiply with scalar
_NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
_NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
//Vector long multiply by scalar
_NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
_NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
//Vector saturating doubling long multiply with scalar
_NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
//Vector saturating doubling long multiply by scalar
_NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
//Vector saturating doubling multiply high with scalar
_NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
//Vector saturating doubling multiply high by scalar
_NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
//Vector saturating rounding doubling multiply high with scalar
_NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
//Vector rounding saturating doubling multiply high by scalar
_NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
//Vector multiply accumulate with scalar
_NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
_NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
_NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
_NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
_NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
_NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
_NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
//Vector widening multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
//Vector widening saturating doubling multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
//Vector multiply subtract with scalar
_NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
_NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
_NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
_NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
_NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
_NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
_NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
//Vector widening multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
//Vector widening saturating doubling multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
//Vector extract
_NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
_NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
_NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
_NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
_NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
_NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
_NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
_NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
_NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
_NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
_NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
_NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
_NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
_NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
_NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
_NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0
//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
_NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
_NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
_NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
_NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
_NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
_NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
_NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
_NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
_NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
_NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
_NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
_NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
_NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
_NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
_NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
_NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
_NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
_NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
_NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
_NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
_NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
_NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
_NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
_NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
_NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
_NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
_NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
_NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
_NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
_NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
_NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
_NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
_NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
_NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
_NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
_NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
//Other single operand arithmetic
//Absolute: Vd[i] = |Va[i]|
_NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
_NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
_NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
_NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
_NEON2SSESTORAGE int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
_NEON2SSESTORAGE int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
_NEON2SSESTORAGE int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
_NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
_NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
_NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
//Saturating absolute: Vd[i] = sat(|Va[i]|)
_NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
_NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
_NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
_NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
_NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
_NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
//Negate: Vd[i] = - Va[i]
_NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0
_NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0
_NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0
_NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0
_NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
_NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
_NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
_NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
//Saturating Negate: sat(Vd[i] = - Va[i])
_NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0
_NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0
_NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0
_NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
_NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
_NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
//Count leading sign bits
_NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
_NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
_NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
_NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
_NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
_NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
//Count leading zeros
_NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
_NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
_NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
_NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
_NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
_NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
_NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
_NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
_NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
_NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
_NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
_NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
//Count number of set bits
_NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
_NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
_NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
_NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
_NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
_NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
//Reciprocal estimate
_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
_NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
//Reciprocal square root estimate
_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
_NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
//Logical operations
//Bitwise not
_NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
_NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
_NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
_NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
_NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
_NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
_NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
_NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
_NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
_NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
_NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
_NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
_NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
_NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
//Bitwise and
_NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
_NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
_NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
_NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
_NEON2SSESTORAGE int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
_NEON2SSESTORAGE int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
_NEON2SSESTORAGE int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
_NEON2SSESTORAGE int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
//Bitwise or
_NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
_NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
_NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
_NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
_NEON2SSESTORAGE int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
_NEON2SSESTORAGE int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
_NEON2SSESTORAGE int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
_NEON2SSESTORAGE int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
//Bitwise exclusive or (EOR or XOR)
_NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
_NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
_NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
_NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
_NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
_NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
_NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
_NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
_NEON2SSESTORAGE int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
_NEON2SSESTORAGE int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
_NEON2SSESTORAGE int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
_NEON2SSESTORAGE int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
_NEON2SSESTORAGE uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
_NEON2SSESTORAGE uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
_NEON2SSESTORAGE uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
_NEON2SSESTORAGE uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
//Bit Clear
_NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
_NEON2SSESTORAGE int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
_NEON2SSESTORAGE int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
_NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
_NEON2SSESTORAGE int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
_NEON2SSESTORAGE int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
_NEON2SSESTORAGE int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
_NEON2SSESTORAGE int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
//Bitwise OR complement
_NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
_NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
_NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
_NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
_NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
_NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
_NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
_NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
//Bitwise Select
_NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
_NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
_NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
_NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
_NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
_NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
_NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
_NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
//Transposition operations
//Transpose elements
_NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
_NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
_NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
_NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
_NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
_NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
_NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
_NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
_NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
_NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
_NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
_NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
_NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
_NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
_NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
_NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
_NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
_NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
//Interleave elements
_NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
_NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
_NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
_NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
_NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
_NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
_NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
_NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
_NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
_NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
_NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
_NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
_NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
_NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
_NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
_NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
_NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
_NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
//De-Interleave elements
_NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
_NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
_NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
_NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
_NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
_NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
_NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
_NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
_NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
_NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
_NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
_NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
_NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
_NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
_NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
_NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
_NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
_NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0
_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0
//Sqrt
_NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0
_NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics.
// we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal
//
#if ( defined (__INTEL_COMPILER) || defined (__GNUC__) && !defined(__llvm__) )
# define _MM_ALIGNR_EPI8 _mm_alignr_epi8
# define _MM_EXTRACT_EPI16 (int16_t) _mm_extract_epi16
# define _MM_INSERT_EPI16 _mm_insert_epi16
# ifdef USE_SSE4
# define _MM_EXTRACT_EPI8 _mm_extract_epi8
# define _MM_EXTRACT_EPI32 _mm_extract_epi32
# define _MM_EXTRACT_PS _mm_extract_ps
# define _MM_INSERT_EPI8 _mm_insert_epi8
# define _MM_INSERT_EPI32 _mm_insert_epi32
# define _MM_INSERT_PS _mm_insert_ps
# ifdef _NEON2SSE_64BIT
# define _MM_INSERT_EPI64 _mm_insert_epi64
# define _MM_EXTRACT_EPI64 _mm_extract_epi64
# endif
# endif //SSE4
#else
# define _NEON2SSE_COMMA ,
# define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
switch(LANE) \
{ \
case 0: return NAME(a b, 0); \
case 1: return NAME(a b, 1); \
case 2: return NAME(a b, 2); \
case 3: return NAME(a b, 3); \
case 4: return NAME(a b, 4); \
case 5: return NAME(a b, 5); \
case 6: return NAME(a b, 6); \
case 7: return NAME(a b, 7); \
case 8: return NAME(a b, 8); \
case 9: return NAME(a b, 9); \
case 10: return NAME(a b, 10); \
case 11: return NAME(a b, 11); \
case 12: return NAME(a b, 12); \
case 13: return NAME(a b, 13); \
case 14: return NAME(a b, 14); \
case 15: return NAME(a b, 15); \
default: return NAME(a b, 0); \
}
# define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
switch(LANE) \
{ \
case 0: return NAME(vec p,0); \
case 1: return NAME(vec p,1); \
case 2: return NAME(vec p,2); \
case 3: return NAME(vec p,3); \
case 4: return NAME(vec p,4); \
case 5: return NAME(vec p,5); \
case 6: return NAME(vec p,6); \
case 7: return NAME(vec p,7); \
default: return NAME(vec p,0); \
}
# define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
switch(LANE) \
{ \
case case0: return NAME(vec p,case0); \
case case1: return NAME(vec p,case1); \
case case2: return NAME(vec p,case2); \
case case3: return NAME(vec p,case3); \
default: return NAME(vec p,case0); \
}
_NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
{
_NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
}
_NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
{
_NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
}
_NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
{
_NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
}
#ifdef USE_SSE4
_NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
{
_NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
}
_NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
{
_NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
}
_NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
{
_NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
}
_NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
{
_NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
}
_NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
{
_NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
}
#ifdef _NEON2SSE_64BIT
//the special case of functions available only for SSE4 and 64-bit build.
_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
{
switch(LANE) {
case 0:
return _mm_insert_epi64(vec, p, 0);
case 1:
return _mm_insert_epi64(vec, p, 1);
default:
return _mm_insert_epi64(vec, p, 0);
}
}
_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
{
if (LANE ==0) return _mm_extract_epi64(val, 0);
else return _mm_extract_epi64(val, 1);
}
#endif
_NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
{
_NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
}
#endif //USE_SSE4
#endif //#ifdef NDEBUG
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices
// or for some specific commonly used operations implementation missing in SSE
#ifdef USE_SSE4
# define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16
# define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
# define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64
# define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16
# define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
# define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64
# define _MM_MAX_EPI8 _mm_max_epi8
# define _MM_MAX_EPI32 _mm_max_epi32
# define _MM_MAX_EPU16 _mm_max_epu16
# define _MM_MAX_EPU32 _mm_max_epu32
# define _MM_MIN_EPI8 _mm_min_epi8
# define _MM_MIN_EPI32 _mm_min_epi32
# define _MM_MIN_EPU16 _mm_min_epu16
# define _MM_MIN_EPU32 _mm_min_epu32
# define _MM_BLENDV_EPI8 _mm_blendv_epi8
# define _MM_PACKUS_EPI32 _mm_packus_epi32
# define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
# define _MM_MULLO_EPI32 _mm_mullo_epi32
# define _MM_MUL_EPI32 _mm_mul_epi32
# define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
#else //no SSE4 !!!!!!
_NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
{
__m128i zero = _mm_setzero_si128();
return _mm_unpacklo_epi8(a, zero);
}
_NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
{
__m128i zero = _mm_setzero_si128();
return _mm_unpacklo_epi16(a, zero);
}
_NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
{
__m128i zero = _mm_setzero_si128();
return _mm_unpacklo_epi32(a, zero);
}
_NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
{
__m128i zero = _mm_setzero_si128();
__m128i sign = _mm_cmpgt_epi8(zero, a);
return _mm_unpacklo_epi8(a, sign);
}
_NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
{
__m128i zero = _mm_setzero_si128();
__m128i sign = _mm_cmpgt_epi16(zero, a);
return _mm_unpacklo_epi16(a, sign);
}
_NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
{
__m128i zero = _mm_setzero_si128();
__m128i sign = _mm_cmpgt_epi32(zero, a);
return _mm_unpacklo_epi32(a, sign);
}
_NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
{
_NEON2SSE_ALIGN_16 int32_t tmp[4];
_mm_store_si128((__m128i*)tmp, vec);
return tmp[LANE];
}
_NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
{
_NEON2SSE_ALIGN_16 int8_t tmp[16];
_mm_store_si128((__m128i*)tmp, vec);
return (int)tmp[LANE];
}
_NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
{
_NEON2SSE_ALIGN_16 int32_t tmp[4];
_mm_store_si128((__m128i*)tmp, _M128i(vec));
return tmp[LANE];
}
_NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
{
_NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
_NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
__m128i vec_masked, p_masked;
pvec[LANE] = p;
mask[LANE] = 0x0;
vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
return _mm_or_si128(vec_masked, p_masked);
}
_NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
{
_NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
_NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
__m128i vec_masked, p_masked;
pvec[LANE] = (int8_t)p;
mask[LANE] = 0x0;
vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
return _mm_or_si128(vec_masked, p_masked);
}
_NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
{
_NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
__m128 tmp, vec_masked, p_masked;
mask[LANE >> 4] = 0x0; //here the LANE is not actural lane, need to deal with it
vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
tmp = _mm_or_ps(vec_masked, p_masked);
return tmp;
}
_NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
{
__m128i cmp, resa, resb;
cmp = _mm_cmpgt_epi8 (a, b);
resa = _mm_and_si128 (cmp, a);
resb = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(resa, resb);
}
_NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
{
__m128i cmp, resa, resb;
cmp = _mm_cmpgt_epi32(a, b);
resa = _mm_and_si128 (cmp, a);
resb = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(resa, resb);
}
_NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
{
__m128i c8000, b_s, a_s, cmp;
c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
b_s = _mm_sub_epi16 (b, c8000);
a_s = _mm_sub_epi16 (a, c8000);
cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
a_s = _mm_and_si128 (cmp,a);
b_s = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(a_s, b_s);
}
_NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
{
__m128i c80000000, b_s, a_s, cmp;
c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
b_s = _mm_sub_epi32 (b, c80000000);
a_s = _mm_sub_epi32 (a, c80000000);
cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
a_s = _mm_and_si128 (cmp,a);
b_s = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(a_s, b_s);
}
_NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
{
__m128i cmp, resa, resb;
cmp = _mm_cmpgt_epi8 (b, a);
resa = _mm_and_si128 (cmp, a);
resb = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(resa, resb);
}
_NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
{
__m128i cmp, resa, resb;
cmp = _mm_cmpgt_epi32(b, a);
resa = _mm_and_si128 (cmp, a);
resb = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(resa, resb);
}
_NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
{
__m128i c8000, b_s, a_s, cmp;
c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
b_s = _mm_sub_epi16 (b, c8000);
a_s = _mm_sub_epi16 (a, c8000);
cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
a_s = _mm_and_si128 (cmp,a);
b_s = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(a_s, b_s);
}
_NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
{
__m128i c80000000, b_s, a_s, cmp;
c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
b_s = _mm_sub_epi32 (b, c80000000);
a_s = _mm_sub_epi32 (a, c80000000);
cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
a_s = _mm_and_si128 (cmp,a);
b_s = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(a_s, b_s);
}
_NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8 !!!!! - please see below
{
//it assumes mask is either 0xff or 0 always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters.
__m128i a_masked, b_masked;
b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
a_masked = _mm_andnot_si128 (mask,a);
return _mm_or_si128(a_masked, b_masked);
}
_NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
{
__m128i a16, b16, res, reshi,cmp, zero;
zero = _mm_setzero_si128();
a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
res = _mm_unpacklo_epi64(a16, b16); //result without saturation
reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff
}
_NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
{
__m128i a16, res, reshi,cmp, zero;
zero = _mm_setzero_si128();
a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff
}
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
{
_NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
int64_t res64;
int i;
_mm_store_si128((__m128i*)atmp, a);
_mm_store_si128((__m128i*)btmp, b);
for (i = 0; i<4; i++) {
res64 = atmp[i] * btmp[i];
res[i] = (int)(res64 & 0xffffffff);
}
return _mm_load_si128((__m128i*)res);
}
_NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
{
__m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg;
sign = _mm_xor_si128 (a, b);
sign = _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive
sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //promote sign bit to 3 and 1st data lanes
zero = _mm_setzero_si128();
a_neg = _mm_abs_epi32 (a); //negate a and b
b_neg = _mm_abs_epi32 (b); //negate a and b
mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
mul_us_neg = _mm_sub_epi64(zero, mul_us);
mul_us_neg = _mm_and_si128(sign, mul_us_neg);
mul_us = _mm_andnot_si128(sign, mul_us);
return _mm_or_si128 (mul_us, mul_us_neg);
}
_NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
{
__m128i res;
res = _mm_cmpeq_epi32 (a, b);
return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
}
#endif //SSE4
//the special case of functions working only for 32 bits, no SSE4
_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
{
_NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
_NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
__m128i vec_masked, p_masked;
pvec[LANE] = p;
mask[LANE] = 0x0;
vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
return _mm_or_si128(vec_masked, p_masked);
}
_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
{
_NEON2SSE_ALIGN_16 int64_t tmp[2];
_mm_store_si128((__m128i*)tmp, val);
return tmp[LANE];
}
#ifndef _NEON2SSE_64BIT_SSE4
# define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
# define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
#endif
_NEON2SSESTORAGE int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints
_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a)
{
//Overflow happens only if a and sum have the opposite signs
__m128i c7fffffff, res, res_sat, res_xor_a;
c7fffffff = _mm_set1_epi32(0x7fffffff);
res = _mm_slli_epi32 (a, 1); // res = a*2
res_sat = _mm_srli_epi32(a, 31);
res_sat = _mm_add_epi32(res_sat, c7fffffff);
res_xor_a = _mm_xor_si128(res, a);
res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
res_sat = _mm_and_si128(res_xor_a, res_sat);
res = _mm_andnot_si128(res_xor_a, res);
return _mm_or_si128(res, res_sat);
}
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
//*************************************************************************
//*************************************************************************
//***************** Functions redefinition\implementatin starts here *****
//*************************************************************************
//*************************************************************************
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
/*If the unified intrinsics solutions is necessary please define your SSE intrinsics wrap here like in the following sample:
#ifdef ARM
#define vector_addq_s32 _mm_add_epi32
#else //if we have IA
#define vector_addq_s32 vadd_s32
#endif
********************************************************************************************
Functions below are organised in the following way:
Each NEON intrinsic function has one of the following options:
1. its x86 full equivalent SSE intrinsic - in this case x86 version just follows the NEON one under the corresponding #define statement
2. x86 implementation using more than one x86 intrinsics. In this case it is shaped as inlined C function with return statement
3. the reference to the NEON function returning the same result and implemented in x86 as above. In this case it is shaped as matching NEON function definition
4. for about 5% of functions due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance
the serial implementation is provided along with the corresponding compiler warning. If these functions are on your app critical path
- please consider such functions removal from your code.
*/
//***********************************************************************
//************************ Vector add *****************************
//***********************************************************************
_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b)
{
int64x1_t res64;
res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
return res64;
}
_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
{
__m128 res;
__m64_128 res64;
res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
_M64f(res64, res);
return res64;
}
_NEON2SSESTORAGE uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
#define vadd_u8 vadd_s8
_NEON2SSESTORAGE uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
#define vadd_u16 vadd_s16
_NEON2SSESTORAGE uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
#define vadd_u32 vadd_s32
_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b)
{
uint64x1_t res64;
res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
return res64;
}
_NEON2SSESTORAGE int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
#define vaddq_s8 _mm_add_epi8
_NEON2SSESTORAGE int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
#define vaddq_s16 _mm_add_epi16
_NEON2SSESTORAGE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
#define vaddq_s32 _mm_add_epi32
_NEON2SSESTORAGE int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
#define vaddq_s64 _mm_add_epi64
_NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
#define vaddq_f32 _mm_add_ps
_NEON2SSESTORAGE uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
#define vaddq_u8 _mm_add_epi8
_NEON2SSESTORAGE uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
#define vaddq_u16 _mm_add_epi16
_NEON2SSESTORAGE uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
#define vaddq_u32 _mm_add_epi32
_NEON2SSESTORAGE uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
#define vaddq_u64 _mm_add_epi64
//**************************** Vector long add *****************************:
//***********************************************************************
//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
{
__m128i a16, b16;
a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
return _mm_add_epi16 (a16, b16);
}
_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
{
__m128i a32, b32;
a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
return _mm_add_epi32 (a32, b32);
}
_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
{
//may be not optimal
__m128i a64, b64;
a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
return _mm_add_epi64 ( a64, b64);
}
_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
{
__m128i a16, b16;
a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
return _mm_add_epi16 (a16, b16);
}
_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0
{
__m128i a32, b32;
a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
return _mm_add_epi32 (a32, b32);
}
_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
{
//may be not optimal
__m128i a64, b64;
a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
return _mm_add_epi64 (a64, b64);
}
//*************** Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
//*************** *********************************************************************
_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
{
__m128i b16;
b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
return _mm_add_epi16 (a, b16);
}
_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
{
__m128i b32;
b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
return _mm_add_epi32 (a, b32);
}
_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
{
__m128i b64;
b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
return _mm_add_epi64 (a, b64);
}
_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
{
__m128i b16;
b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
return _mm_add_epi16 (a, b16);
}
_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0
_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0
{
__m128i b32;
b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
return _mm_add_epi32 (a, b32);
}
_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
{
__m128i b64;
b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
return _mm_add_epi64 (a, b64);
}
//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated *******************************
//*************************************************************************************************************************
_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.w d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b)
{
uint32x2_t res64;
return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
{
//need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
__m128i tmp1, tmp2;
tmp1 = _mm_and_si128(a,b);
tmp2 = _mm_xor_si128(a,b);
tmp2 = vshrq_n_s8(tmp2,1);
return _mm_add_epi8(tmp1,tmp2);
}
_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
{
//need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
__m128i tmp1, tmp2;
tmp1 = _mm_and_si128(a,b);
tmp2 = _mm_xor_si128(a,b);
tmp2 = _mm_srai_epi16(tmp2,1);
return _mm_add_epi16(tmp1,tmp2);
}
_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
{
//need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
__m128i tmp1, tmp2;
tmp1 = _mm_and_si128(a,b);
tmp2 = _mm_xor_si128(a,b);
tmp2 = _mm_srai_epi32(tmp2,1);
return _mm_add_epi32(tmp1,tmp2);
}
_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
{
__m128i c1, sum, res;
c1 = _mm_set1_epi8(1);
sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
res = _mm_xor_si128(a, b); //for rounding compensation
res = _mm_and_si128(res,c1); //for rounding compensation
return _mm_sub_epi8 (sum, res); //actual rounding compensation
}
_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
{
__m128i sum, res;
sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
res = _mm_xor_si128(a, b); //for rounding compensation
res = _mm_slli_epi16 (res,15); //shift left then back right to
res = _mm_srli_epi16 (res,15); //get 1 or zero
return _mm_sub_epi16 (sum, res); //actual rounding compensation
}
_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
{
//need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
__m128i tmp1, tmp2;
tmp1 = _mm_and_si128(a,b);
tmp2 = _mm_xor_si128(a,b);
tmp2 = _mm_srli_epi32(tmp2,1);
return _mm_add_epi32(tmp1,tmp2);
}
//************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1 ***************************
//*****************************************************************************************************************************
_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
}
_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
}
_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b)
{
uint32x2_t res64;
return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
{
//no signed average in x86 SIMD, go to unsigned
__m128i c128, au, bu, sum;
c128 = _mm_set1_epi8((int8_t)0x80); //-128
au = _mm_sub_epi8(a, c128); //add 128
bu = _mm_sub_epi8(b, c128); //add 128
sum = _mm_avg_epu8(au, bu);
return _mm_add_epi8 (sum, c128); //sub 128
}
_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
{
//no signed average in x86 SIMD, go to unsigned
__m128i cx8000, au, bu, sum;
cx8000 = _mm_set1_epi16((int16_t)0x8000); // - 32768
au = _mm_sub_epi16(a, cx8000); //add 32768
bu = _mm_sub_epi16(b, cx8000); //add 32768
sum = _mm_avg_epu16(au, bu);
return _mm_add_epi16 (sum, cx8000); //sub 32768
}
_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b)
{
//need to avoid overflow
__m128i a2, b2, res, sum;
a2 = _mm_srai_epi32(a,1); //a2=a/2;
b2 = _mm_srai_epi32(b,1); // b2=b/2;
res = _mm_or_si128(a,b); //for rounding
res = _mm_slli_epi32 (res,31); //shift left then back right to
res = _mm_srli_epi32 (res,31); //get 1 or zero
sum = _mm_add_epi32(a2,b2);
return _mm_add_epi32(sum,res);
}
_NEON2SSESTORAGE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
_NEON2SSESTORAGE uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
{
//need to avoid overflow
__m128i a2, b2, res, sum;
a2 = _mm_srli_epi32(a,1); //a2=a/2;
b2 = _mm_srli_epi32(b,1); // b2=b/2;
res = _mm_or_si128(a,b); //for rounding
res = _mm_slli_epi32 (res,31); //shift left then back right to
res = _mm_srli_epi32 (res,31); //get 1 or zero
sum = _mm_add_epi32(a2,b2);
return _mm_add_epi32(sum,res);
}
//****************** VQADD: Vector saturating add ************************
//************************************************************************
_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
int64x1_t res;
uint64_t a64, b64;
a64 = a.m64_u64[0];
b64 = b.m64_u64[0];
res.m64_u64[0] = a64 + b64;
a64 = (a64 >> 63) + (~_SIGNBIT64);
if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
res.m64_u64[0] = a64;
}
return res;
}
_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b)
{
uint32x2_t res64;
return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
_NEON2SSE_ALIGN_16 uint64_t a64, b64;
uint64x1_t res;
a64 = a.m64_u64[0];
b64 = b.m64_u64[0];
res.m64_u64[0] = a64 + b64;
if (res.m64_u64[0] < a64) {
res.m64_u64[0] = ~(uint64_t)0;
}
return res;
}
_NEON2SSESTORAGE int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
#define vqaddq_s8 _mm_adds_epi8
_NEON2SSESTORAGE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
#define vqaddq_s16 _mm_adds_epi16
_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
{
//no corresponding x86 SIMD soulution, special tricks are necessary. Overflow happens only if a and b have the same sign and sum has the opposite sign
__m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
c7fffffff = _mm_set1_epi32(0x7fffffff);
res = _mm_add_epi32(a, b);
res_sat = _mm_srli_epi32(a, 31);
res_sat = _mm_add_epi32(res_sat, c7fffffff);
res_xor_a = _mm_xor_si128(res, a);
b_xor_a_ = _mm_xor_si128(b, a);
res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
res_sat = _mm_and_si128(res_xor_a, res_sat);
res = _mm_andnot_si128(res_xor_a, res);
return _mm_or_si128(res, res_sat);
}
_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
_NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
_mm_store_si128((__m128i*)atmp, a);
_mm_store_si128((__m128i*)btmp, b);
res[0] = atmp[0] + btmp[0];
res[1] = atmp[1] + btmp[1];
atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
res[0] = atmp[0];
}
if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
res[1] = atmp[1];
}
return _mm_load_si128((__m128i*)res);
}
_NEON2SSESTORAGE uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
#define vqaddq_u8 _mm_adds_epu8
_NEON2SSESTORAGE uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
#define vqaddq_u16 _mm_adds_epu16
_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
{
__m128i c80000000, cmp, subsum, suba, sum;
c80000000 = _mm_set1_epi32 (0x80000000);
sum = _mm_add_epi32 (a, b);
subsum = _mm_sub_epi32 (sum, c80000000);
suba = _mm_sub_epi32 (a, c80000000);
cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
return _mm_or_si128 (sum, cmp); //saturation
}
_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
{
__m128i c80000000, sum, cmp, suba, subsum;
c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
sum = _mm_add_epi64 (a, b);
subsum = _mm_sub_epi64 (sum, c80000000);
suba = _mm_sub_epi64 (a, c80000000);
cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
return _mm_or_si128 (sum, cmp); //saturation
}
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
_NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
_mm_store_si128((__m128i*)atmp, a);
_mm_store_si128((__m128i*)btmp, b);
res[0] = atmp[0] + btmp[0];
res[1] = atmp[1] + btmp[1];
if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
return _mm_load_si128((__m128i*)(res));
}
#endif
//******************* Vector add high half (truncated) ******************
//************************************************************************
_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
{
int8x8_t res64;
__m128i sum;
sum = _mm_add_epi16 (a, b);
sum = _mm_srai_epi16 (sum, 8);
sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
return64(sum);
}
_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
{
int16x4_t res64;
__m128i sum;
sum = _mm_add_epi32 (a, b);
sum = _mm_srai_epi32(sum, 16);
sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
return64(sum);
}
_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b)
{
int32x2_t res64;
__m128i sum;
sum = _mm_add_epi64 (a, b);
sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6));
return64(sum);
}
_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
{
uint8x8_t res64;
__m128i sum;
sum = _mm_add_epi16 (a, b);
sum = _mm_srli_epi16 (sum, 8);
sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
return64(sum);
}
_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
{
uint16x4_t res64;
__m128i sum;
sum = _mm_add_epi32 (a, b);
sum = _mm_srli_epi32 (sum, 16);
#ifdef USE_SSE4
sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
#else
sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits
#endif
return64(sum);
}
_NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
#define vaddhn_u64 vaddhn_s64
//*********** Vector rounding add high half: vraddhn_<type> ******************.
//***************************************************************************
_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
{
int8x8_t res64;
__m128i sum, mask1;
sum = _mm_add_epi16 (a, b);
mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
sum = _mm_srai_epi16 (sum, 8); //get high half
sum = _mm_add_epi16 (sum, mask1); //actual rounding
sum = _mm_packs_epi16 (sum, sum);
return64(sum);
}
_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
{
//SIMD may be not optimal, serial may be faster
int16x4_t res64;
__m128i sum, mask1;
sum = _mm_add_epi32 (a, b);
mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
sum = _mm_srai_epi32 (sum, 16); //get high half
sum = _mm_add_epi32 (sum, mask1); //actual rounding
sum = _mm_packs_epi32 (sum, sum);
return64(sum);
}
_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
{
//SIMD may be not optimal, serial may be faster
int32x2_t res64;
__m128i sum, mask1;
sum = _mm_add_epi64 (a, b);
mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6));
return64(sum);
}
_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
{
uint8x8_t res64;
__m128i sum, mask1;
sum = _mm_add_epi16 (a, b);
mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
sum = _mm_srai_epi16 (sum, 8); //get high half
sum = _mm_add_epi16 (sum, mask1); //actual rounding
sum = _mm_packus_epi16 (sum, sum);
return64(sum);
}
_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
{
//SIMD may be not optimal, serial may be faster
uint16x4_t res64;
__m128i sum, mask1;
sum = _mm_add_epi32 (a, b);
mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
sum = _mm_srai_epi32 (sum, 16); //get high half
sum = _mm_add_epi32 (sum, mask1); //actual rounding
sum = _MM_PACKUS1_EPI32 (sum);
return64(sum);
}
_NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
#define vraddhn_u64 vraddhn_s64
//**********************************************************************************
//********* Multiplication *************************************
//**************************************************************************************
//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
//As we don't go to wider result functions are equal to "multiply low" in x86
_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
{
// no 8 bit simd multiply, need to go to 16 bits in SSE
int8x8_t res64;
__m128i a128, b128, res;
a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
res = _mm_mullo_epi16 (a128, b128);
res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
return64(res);
}
_NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
#define vmul_s16 vmul_u16
_NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
#define vmul_s32 vmul_u32
_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
{
float32x4_t tmp;
__m64_128 res64;
tmp = _mm_mul_ps(_pM128(a),_pM128(b));
_M64f(res64, tmp); //use low 64 bits
return res64;
}
_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
{
// no 8 bit simd multiply, need to go to 16 bits in SSE
uint8x8_t res64;
__m128i mask, a128, b128, res;
mask = _mm_set1_epi16(0xff);
a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
res = _mm_mullo_epi16 (a128, b128);
res = _mm_and_si128(res, mask); //to avoid saturation
res = _mm_packus_epi16 (res,res); //use only low 64 bits
return64(res);
}
_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
uint32x2_t res;
res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
return res;
}
_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
{
//may be optimized
poly8x8_t res64;
__m128i a64, b64, c1, res, tmp, bmasked;
int i;
a64 = _pM128i(a);
b64 = _pM128i(b);
c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
c1 = vshrq_n_u8(c1,7); //0x1
bmasked = _mm_and_si128(b64, c1); //0x1
res = vmulq_u8(a64, bmasked);
for(i = 1; i<8; i++) {
c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
bmasked = _mm_and_si128(b64, c1); //0x1
tmp = vmulq_u8(a64, bmasked);
res = _mm_xor_si128(res, tmp);
}
return64 (res);
}
_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
{
// no 8 bit simd multiply, need to go to 16 bits
//solution may be not optimal
__m128i a16, b16, r16_1, r16_2;
a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
r16_1 = _mm_mullo_epi16 (a16, b16);
//swap hi and low part of a and b to process the remaining data
a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 __m128i r16_2
r16_2 = _mm_mullo_epi16 (a16, b16);
r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
return _mm_unpacklo_epi64(r16_1, r16_2);
}
_NEON2SSESTORAGE int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
#define vmulq_s16 _mm_mullo_epi16
_NEON2SSESTORAGE int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
_NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
#define vmulq_f32 _mm_mul_ps
_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
{
// no 8 bit simd multiply, need to go to 16 bits
//solution may be not optimal
__m128i maskff, a16, b16, r16_1, r16_2;
maskff = _mm_set1_epi16(0xff);
a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
r16_1 = _mm_mullo_epi16 (a16, b16);
r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
//swap hi and low part of a and b to process the remaining data
a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
r16_2 = _mm_mullo_epi16 (a16, b16);
r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
return _mm_packus_epi16 (r16_1, r16_2);
}
_NEON2SSESTORAGE uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
#define vmulq_u16 _mm_mullo_epi16
_NEON2SSESTORAGE uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
{
//may be optimized
__m128i c1, res, tmp, bmasked;
int i;
c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
c1 = vshrq_n_u8(c1,7); //0x1
bmasked = _mm_and_si128(b, c1); //0x1
res = vmulq_u8(a, bmasked);
for(i = 1; i<8; i++) {
c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
bmasked = _mm_and_si128(b, c1); //0x1
tmp = vmulq_u8(a, bmasked);
res = _mm_xor_si128(res, tmp);
}
return res;
}
//************************* Vector long multiply ***********************************
//****************************************************************************
_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
{
//no 8 bit simd multiply, need to go to 16 bits
__m128i a16, b16;
a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
}
_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
{
#ifdef USE_SSE4
__m128i a16, b16;
a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
#else
__m128i low, hi, a128,b128;
a128 = _pM128i(a);
b128 = _pM128i(b);
low = _mm_mullo_epi16(a128,b128);
hi = _mm_mulhi_epi16(a128,b128);
return _mm_unpacklo_epi16(low,hi);
#endif
}
_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
{
__m128i ab, ba, a128, b128;
a128 = _pM128i(a);
b128 = _pM128i(b);
ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
return _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
}
_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
{
//no 8 bit simd multiply, need to go to 16 bits
__m128i a16, b16;
a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
}
_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
{
#ifdef USE_SSE4
__m128i a16, b16;
a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
#else
__m128i a128,b128,low, hi;
a128 = _pM128i(a);
b128 = _pM128i(b);
low = _mm_mullo_epi16(a128,b128);
hi = _mm_mulhi_epu16(a128,b128);
return _mm_unpacklo_epi16(low,hi);
#endif
}
_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
{
///may be not optimal compared with serial implementation
__m128i ab, ba, a128, b128;
a128 = _pM128i(a);
b128 = _pM128i(b);
ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
return _mm_mul_epu32 (ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
}
_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
{
//may be optimized
__m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
int i;
a128 = _pM128i(a);
b128 = _pM128i(b);
c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
c1 = vshrq_n_u8(c1,7); //0x1
bmasked = _mm_and_si128(b128, c1); //0x1
a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
for(i = 1; i<8; i++) {
c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
bmasked = _mm_and_si128(b128, c1); //0x1
bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
res = _mm_xor_si128(res, tmp);
}
return res;
}
//****************Vector saturating doubling long multiply **************************
//*****************************************************************
_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
{
//the serial soulution may be faster due to saturation
__m128i res;
res = vmull_s16(a, b);
return vqd_s32(res);
}
_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
{
//the serial soulution may be faster due to saturation
__m128i res;
res = vmull_s32(a,b);
return vqaddq_s64(res,res); //slow serial function!!!!
}
//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************
//******************************************************************************************
_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
{
// no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
int8x8_t res64;
__m128i b128, c128, res;
b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
res = _mm_mullo_epi16 (c128, b128);
res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
return64(res);
}
_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c)
{
int16x4_t res64;
return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
}
_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
{
int32x2_t res64;
__m128i res;
res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
return64(res);
}
_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
//fma is coming soon, but right now:
__m128 res;
__m64_128 res64;
res = _mm_mul_ps (_pM128(c), _pM128(b));
res = _mm_add_ps (_pM128(a), res);
_M64f(res64, res);
return res64;
}
_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
{
// no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
uint8x8_t res64;
__m128i mask, b128, c128, res;
mask = _mm_set1_epi16(0xff);
b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
res = _mm_mullo_epi16 (c128, b128);
res = _mm_and_si128(res, mask); //to avoid saturation
res = _mm_packus_epi16 (res, res);
res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
return64(res);
}
_NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
#define vmla_u16 vmla_s16
_NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
#define vmla_u32 vmla_s32
_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
{
//solution may be not optimal
// no 8 bit simd multiply, need to go to 16 bits
__m128i b16, c16, r16_1, a_2,r16_2;
b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
r16_1 = _mm_mullo_epi16 (b16, c16);
r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
r16_1 = _mm_add_epi8 (r16_1, a);
//swap hi and low part of a, b and c to process the remaining data
a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
r16_2 = _mm_mullo_epi16 (b16, c16);
r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
r16_2 = _mm_add_epi8(r16_2, a_2);
return _mm_unpacklo_epi64(r16_1,r16_2);
}
_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
{
__m128i res;
res = _mm_mullo_epi16 (c, b);
return _mm_add_epi16 (res, a);
}
_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
{
__m128i res;
res = _MM_MULLO_EPI32 (c, b); //SSE4.1
return _mm_add_epi32 (res, a);
}
_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
{
//fma is coming soon, but right now:
__m128 res;
res = _mm_mul_ps (c, b);
return _mm_add_ps (a, res);
}
_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
{
//solution may be not optimal
// no 8 bit simd multiply, need to go to 16 bits
__m128i b16, c16, r16_1, a_2, r16_2;
b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
r16_1 = _mm_mullo_epi16 (b16, c16);
r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
r16_1 = _mm_add_epi8 (r16_1, a);
//swap hi and low part of a, b and c to process the remaining data
a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
r16_2 = _mm_mullo_epi16 (b16, c16);
r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
r16_2 = _mm_add_epi8(r16_2, a_2);
return _mm_unpacklo_epi64(r16_1,r16_2);
}
_NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
#define vmlaq_u16 vmlaq_s16
_NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
#define vmlaq_u32 vmlaq_s32
//********************** Vector widening multiply accumulate (long multiply accumulate):
// vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] **************
//********************************************************************************************
_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
{
int16x8_t res;
res = vmull_s8(b, c);
return _mm_add_epi16 (res, a);
}
_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
{
//may be not optimal compared with serial implementation
int32x4_t res;
res = vmull_s16(b, c);
return _mm_add_epi32 (res, a);
}
_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
{
//may be not optimal compared with serial implementation
int64x2_t res;
res = vmull_s32( b, c);
return _mm_add_epi64 (res, a);
}
_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
{
uint16x8_t res;
res = vmull_u8(b, c);
return _mm_add_epi16 (res, a);
}
_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
{
//may be not optimal compared with serial implementation
uint32x4_t res;
res = vmull_u16(b, c);
return _mm_add_epi32 (res, a);
}
_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
{
//may be not optimal compared with serial implementation
int64x2_t res;
res = vmull_u32( b,c);
return _mm_add_epi64 (res, a);
}
//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
//********************************************************************************************
_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
{
// no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
int8x8_t res64;
__m128i res;
res64 = vmul_s8(b,c);
res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
return64(res);
}
_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c)
{
int16x4_t res64;
return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
}
_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
{
int32x2_t res64;
__m128i res;
res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
return64(res);
}
_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
__m128 res;
__m64_128 res64;
res = _mm_mul_ps (_pM128(c), _pM128(b));
res = _mm_sub_ps (_pM128(a), res);
_M64f(res64, res);
return res64;
}
_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
// no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
uint8x8_t res64;
__m128i res;
res64 = vmul_u8(b,c);
res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
return64(res);
}
_NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
#define vmls_u16 vmls_s16
_NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
#define vmls_u32 vmls_s32
_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
{
//solution may be not optimal
// no 8 bit simd multiply, need to go to 16 bits
__m128i b16, c16, r16_1, a_2, r16_2;
b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
r16_1 = _mm_mullo_epi16 (b16, c16);
r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
r16_1 = _mm_sub_epi8 (a, r16_1);
//swap hi and low part of a, b, c to process the remaining data
a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
r16_2 = _mm_mullo_epi16 (b16, c16);
r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
r16_2 = _mm_sub_epi8 (a_2, r16_2);
return _mm_unpacklo_epi64(r16_1,r16_2);
}
_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
{
__m128i res;
res = _mm_mullo_epi16 (c, b);
return _mm_sub_epi16 (a, res);
}
_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
{
__m128i res;
res = _MM_MULLO_EPI32 (c, b); //SSE4.1
return _mm_sub_epi32 (a, res);
}
_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
{
__m128 res;
res = _mm_mul_ps (c, b);
return _mm_sub_ps (a, res);
}
_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
{
//solution may be not optimal
// no 8 bit simd multiply, need to go to 16 bits
__m128i b16, c16, r16_1, a_2, r16_2;
b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
r16_1 = _mm_mullo_epi16 (b16, c16);
r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
r16_1 = _mm_sub_epi8 (a, r16_1);
//swap hi and low part of a, b and c to process the remaining data
a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
r16_2 = _mm_mullo_epi16 (b16, c16);
r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
r16_2 = _mm_sub_epi8(a_2, r16_2);
return _mm_unpacklo_epi64(r16_1,r16_2);
}
_NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
#define vmlsq_u16 vmlsq_s16
_NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
#define vmlsq_u32 vmlsq_s32
//******************** Vector multiply subtract long (widening multiply subtract) ************************************
//*************************************************************************************************************
_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
{
int16x8_t res;
res = vmull_s8(b, c);
return _mm_sub_epi16 (a, res);
}
_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
{
//may be not optimal compared with serial implementation
int32x4_t res;
res = vmull_s16(b, c);
return _mm_sub_epi32 (a, res);
}
_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
{
//may be not optimal compared with serial implementation
int64x2_t res;
res = vmull_s32( b,c);
return _mm_sub_epi64 (a, res);
}
_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
{
uint16x8_t res;
res = vmull_u8(b, c);
return _mm_sub_epi16 (a, res);
}
_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
{
//may be not optimal compared with serial implementation
uint32x4_t res;
res = vmull_u16(b, c);
return _mm_sub_epi32 (a, res);
}
_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
{
//may be not optimal compared with serial implementation
int64x2_t res;
res = vmull_u32( b,c);
return _mm_sub_epi64 (a, res);
}
//****** Vector saturating doubling multiply high **********************
//*************************************************************************
_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
int16x4_t res;
int32_t a32, b32, i;
for (i = 0; i<4; i++) {
a32 = (int32_t) a.m64_i16[i];
b32 = (int32_t) b.m64_i16[i];
a32 = (a32 * b32) >> 15;
res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
}
return res;
}
_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
{
//may be not optimal compared with a serial solution
int32x2_t res64;
__m128i mask;
_NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
int64x2_t mul;
mul = vmull_s32(a,b);
mul = _mm_slli_epi64(mul,1); //double the result
//at this point start treating 2 64-bit numbers as 4 32-bit
mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
return64(mul);
}
_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
{
__m128i res, res_lo, mask;
_NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
res = _mm_mulhi_epi16 (a, b);
res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
res_lo = _mm_mullo_epi16 (a, b);
res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
res = _mm_add_epi16(res, res_lo); //combine results
mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
return _mm_xor_si128 (res, mask); //res saturated for 0x8000
}
_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
// no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
__m128i ab, ba, mask, mul, mul1;
_NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
mul = _mm_slli_epi64(mul,1); //double the result
ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
mul1 = _mm_slli_epi64(mul1,1); //double the result
mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
mul = _mm_unpacklo_epi64(mul, mul1);
mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
}
//********* Vector saturating rounding doubling multiply high ****************
//****************************************************************************
//If use _mm_mulhrs_xx functions the result may differ from NEON one a little due to different rounding rules and order
_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
//may be not optimal compared with a serial solution
int32x2_t res64;
_NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
__m128i res_sat, mask, mask1;
int64x2_t mul;
mul = vmull_s32(a,b);
res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
//at this point start treating 2 64-bit numbers as 4 32-bit
mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
return64(mul);
}
_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
{
__m128i mask, res;
_NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
res = _mm_mulhrs_epi16 (a, b);
mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
return _mm_xor_si128 (res, mask); //res saturated for 0x8000
}
_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
// no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
__m128i ab, ba, mask, mul, mul1, mask1;
_NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
mul = _mm_add_epi32 (mul, mask1); //actual rounding
ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
//at this point start treating 2 64-bit numbers as 4 32-bit
mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
mul = _mm_unpacklo_epi64(mul, mul1);
mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
}
//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
//*************************************************************************************************************************
_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
{
//not optimal SIMD soulution, serial may be faster
__m128i res32;
res32 = vmull_s16(b, c);
res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
return vqaddq_s32(res32, a); //saturation
}
_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
{
__m128i res64;
res64 = vmull_s32(b,c);
res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1);
return vqaddq_s64(res64, a); //saturation
}
//************************************************************************************
//****************** Vector subtract ***********************************************
//************************************************************************************
_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b)
{
int64x1_t res64;
res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
return res64;
}
_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
{
float32x2_t res;
res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
return res;
}
_NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
#define vsub_u8 vsub_s8
_NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
#define vsub_u16 vsub_s16
_NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
#define vsub_u32 vsub_s32
_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b)
{
int64x1_t res64;
res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
return res64;
}
_NEON2SSESTORAGE int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
#define vsubq_s8 _mm_sub_epi8
_NEON2SSESTORAGE int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
#define vsubq_s16 _mm_sub_epi16
_NEON2SSESTORAGE int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
#define vsubq_s32 _mm_sub_epi32
_NEON2SSESTORAGE int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
#define vsubq_s64 _mm_sub_epi64
_NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
#define vsubq_f32 _mm_sub_ps
_NEON2SSESTORAGE uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
#define vsubq_u8 _mm_sub_epi8
_NEON2SSESTORAGE uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
#define vsubq_u16 _mm_sub_epi16
_NEON2SSESTORAGE uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
#define vsubq_u32 _mm_sub_epi32
_NEON2SSESTORAGE uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
#define vsubq_u64 _mm_sub_epi64
//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
//***********************************************************************************
//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
{
__m128i a16, b16;
a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
return _mm_sub_epi16 (a16, b16);
}
_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
{
__m128i a32, b32;
a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
return _mm_sub_epi32 (a32, b32);
}
_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
{
//may be not optimal
__m128i a64, b64;
a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
return _mm_sub_epi64 (a64, b64);
}
_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
{
__m128i a16, b16;
a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
return _mm_sub_epi16 (a16, b16);
}
_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
{
__m128i a32, b32;
a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
return _mm_sub_epi32 (a32, b32);
}
_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
{
//may be not optimal
__m128i a64, b64;
a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
return _mm_sub_epi64 (a64, b64);
}
//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
//*****************************************************************************************************
_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
{
__m128i b16;
b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
return _mm_sub_epi16 (a, b16);
}
_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
{
__m128i b32;
b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
return _mm_sub_epi32 (a, b32);
}
_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
{
__m128i b64;
b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
return _mm_sub_epi64 (a, b64);
}
_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
{
__m128i b16;
b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
return _mm_sub_epi16 (a, b16);
}
_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
{
__m128i b32;
b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
return _mm_sub_epi32 (a, b32);
}
_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
{
__m128i b64;
b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
return _mm_sub_epi64 (a, b64);
}
//************************Vector saturating subtract *********************************
//*************************************************************************************
_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution
{
uint64x1_t res;
uint64_t a64,b64;
a64 = a.m64_u64[0];
b64 = b.m64_u64[0];
res.m64_u64[0] = a64 - b64;
a64 = (a64 >> 63) + (~_SIGNBIT64);
if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
res.m64_u64[0] = a64;
}
return res;
}
_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b)
{
uint32x2_t res64;
return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
uint64x1_t res;
uint64_t a64, b64;
a64 = _Ui64(a);
b64 = _Ui64(b);
if (a64 > b64) {
res.m64_u64[0] = a64 - b64;
} else {
res.m64_u64[0] = 0;
}
return res;
}
_NEON2SSESTORAGE int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
#define vqsubq_s8 _mm_subs_epi8
_NEON2SSESTORAGE int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
#define vqsubq_s16 _mm_subs_epi16
_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
{
//no corresponding x86 SIMD soulution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a
__m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
c7fffffff = _mm_set1_epi32(0x7fffffff);
res = _mm_sub_epi32(a, b);
res_sat = _mm_srli_epi32(a, 31);
res_sat = _mm_add_epi32(res_sat, c7fffffff);
res_xor_a = _mm_xor_si128(res, a);
b_xor_a = _mm_xor_si128(b, a);
res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
res_sat = _mm_and_si128(res_xor_a, res_sat);
res = _mm_andnot_si128(res_xor_a, res);
return _mm_or_si128(res, res_sat);
}
_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution
{
_NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
_NEON2SSE_ALIGN_16 uint64_t res[2];
_mm_store_si128((__m128i*)atmp, a);
_mm_store_si128((__m128i*)btmp, b);
res[0] = atmp[0] - btmp[0];
res[1] = atmp[1] - btmp[1];
if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
}
if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
}
return _mm_load_si128((__m128i*)res);
}
_NEON2SSESTORAGE uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
#define vqsubq_u8 _mm_subs_epu8
_NEON2SSESTORAGE uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
#define vqsubq_u16 _mm_subs_epu16
_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
{
__m128i min, mask, sub;
min = _MM_MIN_EPU32(a, b); //SSE4.1
mask = _mm_cmpeq_epi32 (min, b);
sub = _mm_sub_epi32 (a, b);
return _mm_and_si128 ( sub, mask);
}
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
{
__m128i c80000000, subb, suba, cmp, sub;
c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
sub = _mm_sub_epi64 (a, b);
suba = _mm_sub_epi64 (a, c80000000);
subb = _mm_sub_epi64 (b, c80000000);
cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
return _mm_and_si128 (sub, cmp); //saturation
}
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
_NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
_mm_store_si128((__m128i*)atmp, a);
_mm_store_si128((__m128i*)btmp, b);
res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0;
res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0;
return _mm_load_si128((__m128i*)(res));
}
#endif
//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ******************************************************
//****************************************************************
_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
{
//no 8 bit shift available, internal overflow is possible, so let's go to 16 bit,
int8x8_t res64;
__m128i r16;
int8x8_t r;
r = vsub_s8 (a, b);
r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
r16 = _mm_srai_epi16 (r16, 1); //SSE2
r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits
return64(r16);
}
_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b)
{
uint32x2_t res64;
return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
{
// //need to deal with the possibility of internal overflow
__m128i c128, au,bu;
c128 = _mm_set1_epi8((int8_t)128);
au = _mm_add_epi8( a, c128);
bu = _mm_add_epi8( b, c128);
return vhsubq_u8(au,bu);
}
_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
{
//need to deal with the possibility of internal overflow
__m128i c8000, au,bu;
c8000 = _mm_set1_epi16((int16_t)0x8000);
au = _mm_add_epi16( a, c8000);
bu = _mm_add_epi16( b, c8000);
return vhsubq_u16(au,bu);
}
_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
{
//need to deal with the possibility of internal overflow
__m128i a2, b2,r, b_1;
a2 = _mm_srai_epi32 (a,1);
b2 = _mm_srai_epi32 (b,1);
r = _mm_sub_epi32 (a2, b2);
b_1 = _mm_andnot_si128(a, b); //!a and b
b_1 = _mm_slli_epi32 (b_1,31);
b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
return _mm_sub_epi32(r,b_1);
}
_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
{
__m128i avg;
avg = _mm_avg_epu8 (a, b);
return _mm_sub_epi8(a, avg);
}
_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
{
__m128i avg;
avg = _mm_avg_epu16 (a, b);
return _mm_sub_epi16(a, avg);
}
_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
{
//need to deal with the possibility of internal overflow
__m128i a2, b2,r, b_1;
a2 = _mm_srli_epi32 (a,1);
b2 = _mm_srli_epi32 (b,1);
r = _mm_sub_epi32 (a2, b2);
b_1 = _mm_andnot_si128(a, b); //!a and b
b_1 = _mm_slli_epi32 (b_1,31);
b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
return _mm_sub_epi32(r,b_1);
}
//******* Vector subtract high half (truncated) ** ************
//************************************************************
_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
{
int8x8_t res64;
__m128i sum, sum8;
sum = _mm_sub_epi16 (a, b);
sum8 = _mm_srai_epi16 (sum, 8);
sum8 = _mm_packs_epi16(sum8,sum8);
return64(sum8);
}
_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
{
int16x4_t res64;
__m128i sum, sum16;
sum = _mm_sub_epi32 (a, b);
sum16 = _mm_srai_epi32 (sum, 16);
sum16 = _mm_packs_epi32(sum16,sum16);
return64(sum16);
}
_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
{
int32x2_t res64;
__m128i sub;
sub = _mm_sub_epi64 (a, b);
sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
return64(sub);
}
_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
{
uint8x8_t res64;
__m128i sum, sum8;
sum = _mm_sub_epi16 (a, b);
sum8 = _mm_srli_epi16 (sum, 8);
sum8 = _mm_packus_epi16(sum8,sum8);
return64(sum8);
}
_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
{
uint16x4_t res64;
__m128i sum, sum16;
sum = _mm_sub_epi32 (a, b);
sum16 = _mm_srli_epi32 (sum, 16);
#ifdef USE_SSE4
sum16 = _MM_PACKUS1_EPI32(sum16);
#else
sum16 = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits
#endif
return64(sum16);
}
_NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
#define vsubhn_u64 vsubhn_s64
//************ Vector rounding subtract high half *********************
//*********************************************************************
_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
{
int8x8_t res64;
__m128i sub, mask1;
sub = _mm_sub_epi16 (a, b);
mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
sub = _mm_srai_epi16 (sub, 8); //get high half
sub = _mm_add_epi16 (sub, mask1); //actual rounding
sub = _mm_packs_epi16 (sub, sub);
return64(sub);
}
_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
{
//SIMD may be not optimal, serial may be faster
int16x4_t res64;
__m128i sub, mask1;
sub = _mm_sub_epi32 (a, b);
mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
sub = _mm_srai_epi32 (sub, 16); //get high half
sub = _mm_add_epi32 (sub, mask1); //actual rounding
sub = _mm_packs_epi32 (sub, sub);
return64(sub);
}
_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
{
//SIMD may be not optimal, serial may be faster
int32x2_t res64;
__m128i sub, mask1;
sub = _mm_sub_epi64 (a, b);
mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
return64(sub);
}
_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
{
uint8x8_t res64;
__m128i sub, mask1;
sub = _mm_sub_epi16 (a, b);
mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
sub = _mm_srai_epi16 (sub, 8); //get high half
sub = _mm_add_epi16 (sub, mask1); //actual rounding
sub = _mm_packus_epi16 (sub, sub);
return64(sub);
}
_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
{
//SIMD may be not optimal, serial may be faster
uint16x4_t res64;
__m128i sub, mask1;
sub = _mm_sub_epi32 (a, b);
mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
sub = _mm_srai_epi32 (sub, 16); //get high half
sub = _mm_add_epi32 (sub, mask1); //actual rounding
#ifdef USE_SSE4
sub = _MM_PACKUS1_EPI32 (sub);
#else
sub = _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits
#endif
return64(sub);
}
_NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
#define vrsubhn_u64 vrsubhn_s64
//*********** Vector saturating doubling multiply subtract long ********************
//************************************************************************************
_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
{
//not optimal SIMD soulution, serial may be faster
__m128i res32, mask;
int32x4_t res;
_NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
res = vmull_s16(b, c);
res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000
return vqsubq_s32(a, res32); //saturation
}
_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
{
__m128i res64, mask;
int64x2_t res;
_NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
res = vmull_s32(b, c);
res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
res64 = _mm_xor_si128 (res64, mask); //res32 saturated for 0x80000000
return vqsubq_s64(a, res64); //saturation
}
//****************** COMPARISON ***************************************
//******************* Vector compare equal *************************************
//****************************************************************************
_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
{
uint32x2_t res64;
__m128 res;
res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
return64f(res);
}
_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
{
uint32x2_t res64;
return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
#define vceq_p8 vceq_u8
_NEON2SSESTORAGE uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
#define vceqq_s8 _mm_cmpeq_epi8
_NEON2SSESTORAGE uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
#define vceqq_s16 _mm_cmpeq_epi16
_NEON2SSESTORAGE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
#define vceqq_s32 _mm_cmpeq_epi32
_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
{
__m128 res;
res = _mm_cmpeq_ps(a,b);
return _M128i(res);
}
_NEON2SSESTORAGE uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
#define vceqq_u8 _mm_cmpeq_epi8
_NEON2SSESTORAGE uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
#define vceqq_u16 _mm_cmpeq_epi16
_NEON2SSESTORAGE uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
#define vceqq_u32 _mm_cmpeq_epi32
_NEON2SSESTORAGE uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
#define vceqq_p8 _mm_cmpeq_epi8
//******************Vector compare greater-than or equal*************************
//*******************************************************************************
//in IA SIMD no greater-than-or-equal comparison for integers,
// there is greater-than available only, so we need the following tricks
_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
{
uint32x2_t res64;
__m128 res;
res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries
return64f(res);
}
_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b)
{
//serial solution looks faster
uint32x2_t res64;
return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
{
__m128i m1, m2;
m1 = _mm_cmpgt_epi8 ( a, b);
m2 = _mm_cmpeq_epi8 ( a, b);
return _mm_or_si128 ( m1, m2);
}
_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
{
__m128i m1, m2;
m1 = _mm_cmpgt_epi16 ( a, b);
m2 = _mm_cmpeq_epi16 ( a, b);
return _mm_or_si128 ( m1,m2);
}
_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
{
__m128i m1, m2;
m1 = _mm_cmpgt_epi32 (a, b);
m2 = _mm_cmpeq_epi32 (a, b);
return _mm_or_si128 (m1, m2);
}
_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
{
__m128 res;
res = _mm_cmpge_ps(a,b); //use only 2 first entries
return *(__m128i*)&res;
}
_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
{
//no unsigned chars comparison, only signed available,so need the trick
__m128i cmp;
cmp = _mm_max_epu8(a, b);
return _mm_cmpeq_epi8(cmp, a); //a>=b
}
_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
{
//no unsigned shorts comparison, only signed available,so need the trick
#ifdef USE_SSE4
__m128i cmp;
cmp = _mm_max_epu16(a, b);
return _mm_cmpeq_epi16(cmp, a); //a>=b
#else
__m128i as, mask;
__m128i zero = _mm_setzero_si128();
__m128i cffff = _mm_set1_epi16(0xffff);
as = _mm_subs_epu16(b,a);
mask = _mm_cmpgt_epi16(as, zero);
return _mm_xor_si128 ( mask, cffff);
#endif
}
_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
{
//no unsigned ints comparison, only signed available,so need the trick
#ifdef USE_SSE4
__m128i cmp;
cmp = _mm_max_epu32(a, b);
return _mm_cmpeq_epi32(cmp, a); //a>=b
#else
//serial solution may be faster
__m128i c80000000, as, bs, m1, m2;
c80000000 = _mm_set1_epi32 (0x80000000);
as = _mm_sub_epi32(a,c80000000);
bs = _mm_sub_epi32(b,c80000000);
m1 = _mm_cmpgt_epi32 (as, bs);
m2 = _mm_cmpeq_epi32 (as, bs);
return _mm_or_si128 ( m1, m2);
#endif
}
//**********************Vector compare less-than or equal******************************
//***************************************************************************************
//in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks
_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(vcleq_s8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(vcleq_s16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(vcleq_s32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
{
uint32x2_t res64;
__m128 res;
res = _mm_cmple_ps(_pM128(a),_pM128(b));
return64f(res);
}
_NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
#define vcle_u8(a,b) vcge_u8(b,a)
_NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
#define vcle_u16(a,b) vcge_u16(b,a)
_NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
#define vcle_u32(a,b) vcge_u32(b,a)
_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
{
__m128i c1, res;
c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
res = _mm_cmpgt_epi8 ( a, b);
return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal
}
_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
{
__m128i c1, res;
c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
res = _mm_cmpgt_epi16 ( a, b);
return _mm_andnot_si128 (res, c1);
}
_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
{
__m128i c1, res;
c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
res = _mm_cmpgt_epi32 ( a, b);
return _mm_andnot_si128 (res, c1);
}
_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
{
__m128 res;
res = _mm_cmple_ps(a,b);
return *(__m128i*)&res;
}
_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
{
//no unsigned chars comparison in SSE, only signed available,so need the trick
__m128i cmp;
cmp = _mm_min_epu8(a, b);
return _mm_cmpeq_epi8(cmp, a); //a<=b
}
#else
# define vcleq_u8(a,b) vcgeq_u8(b,a)
#endif
_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
{
//no unsigned shorts comparison in SSE, only signed available,so need the trick
__m128i cmp;
cmp = _mm_min_epu16(a, b);
return _mm_cmpeq_epi16(cmp, a); //a<=b
}
#else
# define vcleq_u16(a,b) vcgeq_u16(b,a)
#endif
_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
{
//no unsigned chars comparison in SSE, only signed available,so need the trick
__m128i cmp;
cmp = _mm_min_epu32(a, b);
return _mm_cmpeq_epi32(cmp, a); //a<=b
}
#else
//solution may be not optimal compared with the serial one
# define vcleq_u32(a,b) vcgeq_u32(b,a)
#endif
//****** Vector compare greater-than ******************************************
//**************************************************************************
_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
{
uint32x2_t res64;
__m128 res;
res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries
return64f(res);
}
_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b)
{
uint32x2_t res64;
return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
#define vcgtq_s8 _mm_cmpgt_epi8
_NEON2SSESTORAGE uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
#define vcgtq_s16 _mm_cmpgt_epi16
_NEON2SSESTORAGE uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
#define vcgtq_s32 _mm_cmpgt_epi32
_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
{
__m128 res;
res = _mm_cmpgt_ps(a,b); //use only 2 first entries
return *(__m128i*)&res;
}
_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
{
//no unsigned chars comparison, only signed available,so need the trick
__m128i as;
__m128i zero = _mm_setzero_si128();
as = _mm_subs_epu8(a, b);
return _mm_cmpgt_epi8(as, zero);
}
_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
{
//no unsigned short comparison, only signed available,so need the trick
__m128i as;
__m128i zero = _mm_setzero_si128();
as = _mm_subs_epu16(a, b);
return _mm_cmpgt_epi16(as, zero);
}
_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
{
//no unsigned int comparison, only signed available,so need the trick
__m128i c80000000, as, bs;
c80000000 = _mm_set1_epi32 (0x80000000);
as = _mm_sub_epi32(a,c80000000);
bs = _mm_sub_epi32(b,c80000000);
return _mm_cmpgt_epi32 ( as, bs);
}
//********************* Vector compare less-than **************************
//*************************************************************************
_NEON2SSESTORAGE uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
_NEON2SSESTORAGE uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
_NEON2SSESTORAGE uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!!
_NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
_NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
_NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
_NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
_NEON2SSESTORAGE uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
_NEON2SSESTORAGE uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
_NEON2SSESTORAGE uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
_NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
_NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
_NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
_NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
//*****************Vector compare absolute greater-than or equal ************
//***************************************************************************
_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
{
uint32x2_t res64;
__m128i c7fffffff;
__m128 a0, b0;
c7fffffff = _mm_set1_epi32 (0x7fffffff);
a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
a0 = _mm_cmpge_ps ( a0, b0);
return64f(a0);
}
_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
{
__m128i c7fffffff;
__m128 a0, b0;
c7fffffff = _mm_set1_epi32 (0x7fffffff);
a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
a0 = _mm_cmpge_ps ( a0, b0);
return (*(__m128i*)&a0);
}
//********Vector compare absolute less-than or equal ******************
//********************************************************************
_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
{
uint32x2_t res64;
__m128i c7fffffff;
__m128 a0, b0;
c7fffffff = _mm_set1_epi32 (0x7fffffff);
a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
a0 = _mm_cmple_ps (a0, b0);
return64f(a0);
}
_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
{
__m128i c7fffffff;
__m128 a0, b0;
c7fffffff = _mm_set1_epi32 (0x7fffffff);
a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
a0 = _mm_cmple_ps (a0, b0);
return (*(__m128i*)&a0);
}
//******** Vector compare absolute greater-than ******************
//******************************************************************
_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
{
uint32x2_t res64;
__m128i c7fffffff;
__m128 a0, b0;
c7fffffff = _mm_set1_epi32 (0x7fffffff);
a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
a0 = _mm_cmpgt_ps (a0, b0);
return64f(a0);
}
_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
{
__m128i c7fffffff;
__m128 a0, b0;
c7fffffff = _mm_set1_epi32 (0x7fffffff);
a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
a0 = _mm_cmpgt_ps (a0, b0);
return (*(__m128i*)&a0);
}
//***************Vector compare absolute less-than ***********************
//*************************************************************************
_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
{
uint32x2_t res64;
__m128i c7fffffff;
__m128 a0, b0;
c7fffffff = _mm_set1_epi32 (0x7fffffff);
a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
a0 = _mm_cmplt_ps (a0, b0);
return64f(a0);
}
_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
{
__m128i c7fffffff;
__m128 a0, b0;
c7fffffff = _mm_set1_epi32 (0x7fffffff);
a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
a0 = _mm_cmplt_ps (a0, b0);
return (*(__m128i*)&a0);
}
//*************************Vector test bits************************************
//*****************************************************************************
/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
with the corresponding element of a second vector. If the result is not zero, the
corresponding element in the destination vector is set to all ones. Otherwise, it is set to
all zeros. */
_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(vtstq_s8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(vtstq_s16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
return64(vtstq_s32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
#define vtst_u8 vtst_s8
_NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
#define vtst_u16 vtst_s16
_NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
#define vtst_u32 vtst_s32
_NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
#define vtst_p8 vtst_u8
_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
{
__m128i zero, one, res;
zero = _mm_setzero_si128 ();
one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
res = _mm_and_si128 (a, b);
res = _mm_cmpeq_epi8 (res, zero);
return _mm_xor_si128(res, one); //invert result
}
_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
{
__m128i zero, one, res;
zero = _mm_setzero_si128 ();
one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
res = _mm_and_si128 (a, b);
res = _mm_cmpeq_epi16 (res, zero);
return _mm_xor_si128(res, one); //invert result
}
_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
{
__m128i zero, one, res;
zero = _mm_setzero_si128 ();
one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
res = _mm_and_si128 (a, b);
res = _mm_cmpeq_epi32 (res, zero);
return _mm_xor_si128(res, one); //invert result
}
_NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
#define vtstq_u8 vtstq_s8
_NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
#define vtstq_u16 vtstq_s16
_NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
#define vtstq_u32 vtstq_s32
_NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
#define vtstq_p8 vtstq_u8
//****************** Absolute difference ********************
//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
//************************************************************
_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
return64(vabdq_s8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(vabdq_s16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b)
{//need to deal with an intermediate overflow
int32x2_t res;
res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] - b.m64_i32[0]: b.m64_i32[0] - a.m64_i32[0];
res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] - b.m64_i32[1]: b.m64_i32[1] - a.m64_i32[1];
return res;
}
_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64(vabdq_u8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(vabdq_u16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b)
{
uint32x2_t res64;
return64(vabdq_u32(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
{
float32x4_t res;
__m64_128 res64;
res = vabdq_f32(_pM128(a), _pM128(b));
_M64f(res64, res);
return res64;
}
_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
{ //need to deal with an intermediate overflow
__m128i cmp, difab, difba;
cmp = vcgtq_s8(a,b);
difab = _mm_sub_epi8(a,b);
difba = _mm_sub_epi8(b,a);
difab = _mm_and_si128(cmp, difab);
difba = _mm_andnot_si128(cmp, difba);
return _mm_or_si128(difab, difba);
}
_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
{//need to deal with an intermediate overflow
__m128i cmp, difab, difba;
cmp = vcgtq_s16(a,b);
difab = _mm_sub_epi16(a,b);
difba = _mm_sub_epi16 (b,a);
difab = _mm_and_si128(cmp, difab);
difba = _mm_andnot_si128(cmp, difba);
return _mm_or_si128(difab, difba);
}
_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
{//need to deal with an intermediate overflow
__m128i cmp, difab, difba;
cmp = vcgtq_s32(a,b);
difab = _mm_sub_epi32(a,b);
difba = _mm_sub_epi32(b,a);
difab = _mm_and_si128(cmp, difab);
difba = _mm_andnot_si128(cmp, difba);
return _mm_or_si128(difab, difba);
}
_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
{
__m128i difab, difba;
difab = _mm_subs_epu8(a,b);
difba = _mm_subs_epu8 (b,a);
return _mm_or_si128(difab, difba);
}
_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
{
__m128i difab, difba;
difab = _mm_subs_epu16(a,b);
difba = _mm_subs_epu16 (b,a);
return _mm_or_si128(difab, difba);
}
_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
{
__m128i cmp, difab, difba;
cmp = vcgtq_u32(a,b);
difab = _mm_sub_epi32(a,b);
difba = _mm_sub_epi32 (b,a);
difab = _mm_and_si128(cmp, difab);
difba = _mm_andnot_si128(cmp, difba);
return _mm_or_si128(difab, difba);
}
_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
{
__m128i c1;
__m128 res;
c1 = _mm_set1_epi32(0x7fffffff);
res = _mm_sub_ps (a, b);
return _mm_and_ps (res, *(__m128*)&c1);
}
//************ Absolute difference - long **************************
//********************************************************************
_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
{
__m128i a16, b16;
a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
return vabdq_s16(a16, b16);
}
_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
{
__m128i a32, b32;
a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
return vabdq_s32(a32, b32);
}
_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
{
//no optimal SIMD solution, serial looks faster
_NEON2SSE_ALIGN_16 int64_t res[2];
if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
return _mm_load_si128((__m128i*)res);
}
_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
{
__m128i res;
res = vsubl_u8(a,b);
return _mm_abs_epi16(res);
}
_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
{
__m128i res;
res = vsubl_u16(a,b);
return _mm_abs_epi32(res);
}
_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
_NEON2SSE_ALIGN_16 uint64_t res[2];
if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
return _mm_load_si128((__m128i*)res);
}
//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
//*********************************************************************************************
_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c)
{
int8x8_t res64;
return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
}
_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c)
{
int16x4_t res64;
return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
}
_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c)
{
int32x2_t res64;
return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
}
_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
int8x8_t res64;
return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c)));
}
_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c)
{
int16x4_t res64;
return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c)));
}
_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c)
{
uint32x2_t res64;
return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
}
_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
{
int8x16_t sub;
sub = vabdq_s8(b, c);
return vaddq_s8( a, sub);
}
_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
{
int16x8_t sub;
sub = vabdq_s16(b, c);
return vaddq_s16( a, sub);
}
_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
{
int32x4_t sub;
sub = vabdq_s32(b, c);
return vaddq_s32( a, sub);
}
_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
{
uint8x16_t sub;
sub = vabdq_u8(b, c);
return vaddq_u8( a, sub);
}
_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
{
uint16x8_t sub;
sub = vabdq_u16(b, c);
return vaddq_u16( a, sub);
}
_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
{
uint32x4_t sub;
sub = vabdq_u32(b, c);
return vaddq_u32( a, sub);
}
//************** Absolute difference and accumulate - long ********************************
//*************************************************************************************
_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
{
__m128i b16, c16, res;
b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
return _mm_add_epi16 (a, res);
}
_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
{
__m128i b32, c32, res;
b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
return _mm_add_epi32 (a, res);
}
_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
{
__m128i res;
res = vabdl_s32(b,c);
return _mm_add_epi64(a, res);
}
_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
{
__m128i b16, c16, res;
b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
return _mm_add_epi16 (a, res);
}
_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
{
__m128i b32, c32, res;
b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
return _mm_add_epi32 (a, res);
}
_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
{
__m128i res;
res = vabdl_u32(b,c);
return _mm_add_epi64(a, res);
}
//***********************************************************************************
//**************** Maximum and minimum operations **********************************
//***********************************************************************************
//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] *******
//***********************************************************************************
_NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
__m128i res;
res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
return64(res);
}
_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
__m128i res;
res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
return64(res);
}
_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b)
{
uint32x2_t res64;
__m128i res;
res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
return64(res);
}
_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
{
//serial solution looks faster than SIMD one
float32x2_t res;
res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
return res;
}
_NEON2SSESTORAGE int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
_NEON2SSESTORAGE int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
#define vmaxq_s16 _mm_max_epi16
_NEON2SSESTORAGE int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
_NEON2SSESTORAGE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
#define vmaxq_u8 _mm_max_epu8
_NEON2SSESTORAGE uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
_NEON2SSESTORAGE uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
_NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
#define vmaxq_f32 _mm_max_ps
_NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
#define vmaxq_f64 _mm_max_pd
//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
//***********************************************************************************************************
_NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b)
{
int8x8_t res64;
__m128i res;
res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
return64(res);
}
_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
__m128i res;
res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
return64(res);
}
_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
{
uint8x8_t res64;
return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
{
uint16x4_t res64;
return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b)
{
uint32x2_t res64;
__m128i res;
res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
return64(res);
}
_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
{
//serial solution looks faster than SIMD one
float32x2_t res;
res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
return res;
}
_NEON2SSESTORAGE int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
_NEON2SSESTORAGE int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
#define vminq_s16 _mm_min_epi16
_NEON2SSESTORAGE int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
_NEON2SSESTORAGE uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
#define vminq_u8 _mm_min_epu8
_NEON2SSESTORAGE uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
_NEON2SSESTORAGE uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
_NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
#define vminq_f32 _mm_min_ps
_NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
#define vminq_f64 _mm_min_pd
//************* Pairwise addition operations. **************************************
//************************************************************************************
//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
{
//no 8 bit hadd in IA32, need to go to 16 bit and then pack
int8x8_t res64;
__m128i a16, b16, res;
a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
res = _mm_hadd_epi16 (a16, b16);
res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
return64(res);
}
_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b)
{
int16x4_t res64;
__m128i hadd128;
hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
return64(hadd128);
}
_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b)
{
int32x2_t res64;
__m128i hadd128;
hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
return64(hadd128);
}
_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
{
// no 8 bit hadd in IA32, need to go to 16 bit and then pack
uint8x8_t res64;
// no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work
__m128i mask8, a16, b16, res;
mask8 = _mm_set1_epi16(0xff);
a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
res = _mm_hadd_epi16 (a16, b16);
res = _mm_and_si128(res, mask8); //to avoid saturation
res = _mm_packus_epi16 (res,res); //use low 64 bits
return64(res);
}
_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
{
// solution may be not optimal, serial execution may be faster
// no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
uint16x4_t res64;
__m128i c32767, cfffe, as, bs, res;
c32767 = _mm_set1_epi16 (32767);
cfffe = _mm_set1_epi16 ((int16_t)0xfffe);
as = _mm_sub_epi16 (_pM128i(a), c32767);
bs = _mm_sub_epi16 (_pM128i(b), c32767);
res = _mm_hadd_epi16 (as, bs);
res = _mm_add_epi16 (res, cfffe);
res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
return64(res);
}
_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
{
//hadd doesn't work for unsigned values
uint32x2_t res64;
__m128i ab, ab_sh, res;
ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
res = _mm_add_epi32(ab, ab_sh);
res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
return64(res);
}
_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
{
__m128 hadd128;
__m64_128 res64;
hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
_M64f(res64, hadd128);
return res64;
}
//************************** Long pairwise add **********************************
//*********************************************************************************
//Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width,
// and places the final results in the destination vector.
_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
{
//no 8 bit hadd in IA32, need to go to 16 bit anyway
__m128i a16;
int16x4_t res64;
a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
return64(a16);
}
_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
{
// solution may be not optimal, serial execution may be faster
int32x2_t res64;
__m128i r32_1;
r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
return64(r32_1);
}
_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
{
int64x1_t res;
res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
return res;
}
_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
{
// no 8 bit hadd in IA32, need to go to 16 bit
// no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work
uint16x4_t res64;
__m128i a16;
a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
return64(a16);
}
_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
//serial solution looks faster than a SIMD one
uint32x2_t res;
res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
return res;
}
_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
{
uint64x1_t res;
res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
return res;
}
_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
{
//no 8 bit hadd in IA32, need to go to 16 bit
__m128i r16_1, r16_2;
r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
//swap hi and low part of r to process the remaining data
r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
return _mm_hadd_epi16 (r16_1, r16_2);
}
_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
{
//no 8 bit hadd in IA32, need to go to 16 bit
__m128i r32_1, r32_2;
r32_1 = _MM_CVTEPI16_EPI32(a);
//swap hi and low part of r to process the remaining data
r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
return _mm_hadd_epi32 (r32_1, r32_2);
}
_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
{
_NEON2SSE_ALIGN_16 int32_t atmp[4];
_NEON2SSE_ALIGN_16 int64_t res[2];
_mm_store_si128((__m128i*)atmp, a);
res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
return _mm_load_si128((__m128i*)res);
}
_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
{
//no 8 bit hadd in IA32, need to go to 16 bit
__m128i r16_1, r16_2;
r16_1 = _MM_CVTEPU8_EPI16(a);
//swap hi and low part of r to process the remaining data
r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
return _mm_hadd_epi16 (r16_1, r16_2);
}
_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
//serial solution looks faster than a SIMD one
_NEON2SSE_ALIGN_16 uint16_t atmp[8];
_NEON2SSE_ALIGN_16 uint32_t res[4];
_mm_store_si128((__m128i*)atmp, a);
res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
return _mm_load_si128((__m128i*)res);
}
_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
_NEON2SSE_ALIGN_16 uint32_t atmp[4];
_NEON2SSE_ALIGN_16 uint64_t res[2];
_mm_store_si128((__m128i*)atmp, a);
res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
return _mm_load_si128((__m128i*)res);
}
//************************ Long pairwise add and accumulate **************************
//****************************************************************************************
//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
// and accumulates the values of the results into the elements of the destination (wide) vector
_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b)
{
int16x4_t res64;
return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b)
{
int32x2_t res64;
return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
{
int64x1_t res;
res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
return res;
}
_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b)
{
uint16x4_t res64;
return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 d0,d0
_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b)
{
uint32x2_t res64;
return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
}
_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
{
uint64x1_t res;
res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
return res;
}
_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
{
int16x8_t pad;
pad = vpaddlq_s8(b);
return _mm_add_epi16 (a, pad);
}
_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
{
int32x4_t pad;
pad = vpaddlq_s16(b);
return _mm_add_epi32(a, pad);
}
_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
{
int64x2_t pad;
pad = vpaddlq_s32(b);
return _mm_add_epi64 (a, pad);
}
_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
{
uint16x8_t pad;
pad = vpaddlq_u8(b);
return _mm_add_epi16 (a, pad);
}
_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
uint32x4_t pad;
pad = vpaddlq_u16(b);
return _mm_add_epi32(a, pad);
} //no optimal SIMD solution, serial is faster
_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
//no optimal SIMD solution, serial is faster
uint64x2_t pad;
pad = vpaddlq_u32(b);
return _mm_add_epi64(a, pad);
} //no optimal SIMD solution, serial is faster
//********** Folding maximum *************************************
//*******************************************************************
//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
//and copies the larger of each pair into the corresponding element in the destination
// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
{
int8x8_t res64;
__m128i ab, ab1, max;
_NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
_NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding
max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
return64(max); //we need 64 bits only
}
_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
{
//solution may be not optimal compared with the serial one
int16x4_t res64;
__m128i ab, ab1, max;
_NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
max = _mm_max_epi16 (ab, ab1);
max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
return64(max);
}
_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
//serial solution looks faster than SIMD one
int32x2_t res;
res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
return res;
}
_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
{
uint8x8_t res64;
__m128i ab, ab1, max;
_NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
_NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding
max = _mm_max_epu8 (ab, ab1); // SSE4.1
max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
return64(max);
}
_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
{
//solution may be not optimal compared with the serial one
uint16x4_t res64;
__m128i ab, ab1, max;
_NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
max = _MM_MAX_EPU16 (ab, ab1);
max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
return64(max);
}
_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
//serial solution looks faster than SIMD one
uint32x2_t res;
res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
return res;
}
_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
//serial solution looks faster than SIMD one
float32x2_t res;
res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
return res;
}
// ***************** Folding minimum ****************************
// **************************************************************
//vpmin -> takes minimum of adjacent pairs
_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
{
int8x8_t res64;
__m128i ab, ab1, min;
_NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
_NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical min finding
min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1
min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
return64(min);
}
_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
{
//solution may be not optimal compared with the serial one
int16x4_t res64;
__m128i ab, ab1, min;
_NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
min = _mm_min_epi16 (ab, ab1);
min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
return64(min);
}
_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
//serial solution looks faster than SIMD one
int32x2_t res;
res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
return res;
}
_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
{
uint8x8_t res64;
__m128i ab, ab1, min;
_NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
_NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding
min = _mm_min_epu8 (ab, ab1); // SSE4.1
min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
return64(min);
}
_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
{
//solution may be not optimal compared with the serial one
uint16x4_t res64;
__m128i ab, ab1, min;
_NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
min = _MM_MIN_EPU16 (ab, ab1);
min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
return64(min);
}
_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
//serial solution looks faster than SIMD one
uint32x2_t res;
res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
return res;
}
_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
//serial solution looks faster than SIMD one
float32x2_t res;
res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
return res;
}
//***************************************************************
//*********** Reciprocal/Sqrt ************************************
//***************************************************************
//****************** Reciprocal estimate *******************************
//the ARM NEON and x86 SIMD results may be slightly different
_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
{
float32x4_t res;
__m64_128 res64;
res = _mm_rcp_ps(_pM128(a));
_M64f(res64, res);
return res64;
}
_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
//Input is fixed point number!!! No reciprocal for ints in IA32 available
uint32x2_t res;
float resf, r;
int i, q, s;
for (i =0; i<2; i++){
if((a.m64_u32[i] & 0x80000000) == 0) {
res.m64_u32[i] = 0xffffffff;
}else{
resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
r = (float)(1.0 / (((float)q + 0.5) / 512.0)); /* reciprocal r */
s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
r = (float)s / 256.0;
res.m64_u32[i] = r * (uint32_t)(1 << 31);
}
}
return res;
}
_NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
#define vrecpeq_f32 _mm_rcp_ps
_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
//Input is fixed point number!!!
//We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
_NEON2SSE_ALIGN_16 uint32_t atmp[4];
_NEON2SSE_ALIGN_16 uint32_t res[4];
_NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
float resf, r;
int i, q, s;
__m128i res128, mask, zero;
_mm_store_si128((__m128i*)atmp, a);
zero = _mm_setzero_si128();
for (i =0; i<4; i++){
resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
r = (float)s / 256.0;
res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
}
res128 = _mm_load_si128((__m128i*)res);
mask = _mm_and_si128(a, *(__m128i*)c80000000);
mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff
return _mm_or_si128(res128, mask);
}
//**********Reciprocal square root estimate ****************
//**********************************************************
//no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster
//but the particular implementation for vrsqrte_u32 may vary for various ARM compilers
////the ARM NEON and x86 SIMD results may be slightly different
_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
{
float32x4_t res;
__m64_128 res64;
res = _mm_rsqrt_ps(_pM128(a));
_M64f(res64, res);
return res64;
}
_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
//Input is fixed point number!!!
//We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
uint32x2_t res;
__m128 tmp;
float r, resf, coeff;
int i,q0, s;
for (i =0; i<2; i++){
if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
res.m64_u32[i] = 0xffffffff;
}else{
resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
r = ((float)q0 + 0.5) / coeff;
tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
_mm_store_ss(&r, tmp);
s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
r = (float)(s / 256.0);
res.m64_u32[i] = r * (((uint32_t)1) << 31);
}
}
return res;
}
_NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
#define vrsqrteq_f32 _mm_rsqrt_ps
_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
//Input is fixed point number!!!
//We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
_NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
_NEON2SSE_ALIGN_16 static const uint32_t c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
__m128 tmp;
__m128i res128, mask, zero;
float r, resf, coeff;
int i,q0, s;
_mm_store_si128((__m128i*)atmp, a);
zero = _mm_setzero_si128();
for (i =0; i<4; i++){
resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
coeff = (float)((resf < 0.5)? 512.0 : 256.0); /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
r = ((float)q0 + 0.5) / coeff;
tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
_mm_store_ss(&r, tmp);
s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
r = (float)s / 256.0;
res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
}
res128 = _mm_load_si128((__m128i*)res);
mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff
return _mm_or_si128(res128, mask);
}
//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
//******************************************************************************************
//******VRECPS (Vector Reciprocal Step) ***************************************************
//multiplies the elements of one vector by the corresponding elements of another vector,
//subtracts each of the results from 2, and places the final results into the elements of the destination vector.
_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
{
float32x4_t res;
__m64_128 res64;
res = vrecpsq_f32(_pM128(a), _pM128(b));
_M64f(res64, res);
return res64;
}
_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
{
__m128 f2, mul;
f2 = _mm_set1_ps(2.);
mul = _mm_mul_ps(a,b);
return _mm_sub_ps(f2,mul);
}
//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
//multiplies the elements of one vector by the corresponding elements of another vector,
//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
{
float32x2_t res;
res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
return res;
}
_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
{
__m128 f3, f05, mul;
f3 = _mm_set1_ps(3.);
f05 = _mm_set1_ps(0.5);
mul = _mm_mul_ps(a,b);
f3 = _mm_sub_ps(f3,mul);
return _mm_mul_ps (f3, f05);
}
//********************************************************************************************
//***************************** Shifts by signed variable ***********************************
//********************************************************************************************
//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
//********************************************************************************************
//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
//helper macro. It matches ARM implementation for big shifts
#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
_NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
_mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
for (i = 0; i<LEN; i++) { \
if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
return _mm_load_si128((__m128i*)res);
#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
for (i = 0; i<LEN; i++) { \
if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
return res;
_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT_64(8, i, 8)
}
_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT_64(16, i, 4)
}
_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT_64(32, i, 2)
}
_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT_64(64, i, 1)
}
_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT_64(8, u, 8)
}
_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT_64(16, u, 4)
}
_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT_64(32, u, 2)
}
_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro need to have the special processing for large numbers
{
SERIAL_SHIFT_64(64, u, 1)
}
_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(int8_t, int8_t, 16, 16)
}
_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(int16_t, int16_t, 8, 8)
}
_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(int32_t, int32_t, 4, 4)
}
_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(int64_t, int64_t, 2, 2)
}
_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
}
_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
}
_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
}
_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
}
//*********** Vector saturating shift left: (negative values shift right) **********************
//********************************************************************************************
//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
_NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
_mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
for (i = 0; i<LEN; i++) { \
if (atmp[i] ==0) res[i] = 0; \
else{ \
if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
else{ \
if (btmp[i]>lanesize_1) { \
res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
}else{ \
limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
else res[i] = atmp[i] << btmp[i]; }}}} \
return _mm_load_si128((__m128i*)res);
#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
_NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
TYPE lanesize = (sizeof(TYPE) << 3); \
_mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
for (i = 0; i<LEN; i++) { \
if (atmp[i] ==0) {res[i] = 0; \
}else{ \
if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
else{ \
if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
else{ \
limit = (TYPE) 1 << (lanesize - btmp[i]); \
res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
return _mm_load_si128((__m128i*)res);
#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
for (i = 0; i<LEN; i++) { \
if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
else{ \
if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
else{ \
if (b.m64_i ## TYPE[i]>lanesize_1) { \
res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
}else{ \
limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
return res;
#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
for (i = 0; i<LEN; i++) { \
if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
}else{ \
if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
else{ \
if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
else{ \
limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
return res;
_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
}
_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
}
_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
}
_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
}
_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
}
_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
}
_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
}
_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
}
_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
}
_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
}
_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
}
_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
}
_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
}
_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
}
_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
}
_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
}
//******** Vector rounding shift left: (negative values shift right) **********
//****************************************************************************
//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
//rounding makes sense for right shifts only.
#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
_NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
_mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
for (i = 0; i<LEN; i++) { \
if( btmp[i] >= 0) { \
if(btmp[i] >= lanesize) res[i] = 0; \
else res[i] = (atmp[i] << btmp[i]); \
}else{ \
res[i] = (btmp[i] < -lanesize) ? res[i] = 0 : \
(btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
(atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
return _mm_load_si128((__m128i*)res);
#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
for (i = 0; i<LEN; i++) { \
if( b.m64_i ## TYPE[i] >= 0) { \
if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
}else{ \
res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? res.m64_ ## SIGN ## TYPE[i] = 0 : \
(b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
(a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \
return res;
_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT_64(8,i,8)
}
_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT_64(16,i,4)
}
_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT_64(32,i,2)
}
_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT_64(64,i,1)
}
_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT_64(8,u,8)
}
_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT_64(16,u,4)
}
_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT_64(32,u,2)
}
_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT_64(64,u,1)
}
_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
}
_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
}
_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
}
_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
}
_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
}
_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
}
_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
}
_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
}
//********** Vector saturating rounding shift left: (negative values shift right) ****************
//*************************************************************************************************
//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
//Saturation happens for left shifts only while rounding makes sense for right shifts only.
#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
_NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
_mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
for (i = 0; i<LEN; i++) { \
if (atmp[i] ==0) res[i] = 0; \
else{ \
if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
else{ \
if (btmp[i]>lanesize_1) { \
res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
}else{ \
limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
else res[i] = atmp[i] << btmp[i]; }}}} \
return _mm_load_si128((__m128i*)res);
#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
_NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
int lanesize = (sizeof(TYPE) << 3); \
_mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
for (i = 0; i<LEN; i++) { \
if (atmp[i] ==0) {res[i] = 0; \
}else{ \
if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
else{ \
if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
else{ \
limit = (TYPE) 1 << (lanesize - btmp[i]); \
res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
return _mm_load_si128((__m128i*)res);
#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
__m64_128 res; int ## TYPE ## _t limit; int i; \
int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
for (i = 0; i<LEN; i++) { \
if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
else{ \
if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
else{ \
if (b.m64_i ## TYPE[i]>lanesize_1) { \
res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
}else{ \
limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
return res;
#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
__m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
for (i = 0; i<LEN; i++) { \
if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
}else{ \
if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
else{ \
if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
else{ \
limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
return res;
_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
}
_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
}
_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
}
_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
}
_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
}
_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
}
_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
}
_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
}
_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
}
_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
}
_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
}
_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
}
_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
}
_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
}
_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
}
_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
}
// *********************************************************************************
// ***************************** Shifts by a constant *****************************
// *********************************************************************************
//**************** Vector shift right by constant*************************************
//************************************************************************************
_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
{
//no 8 bit shift available, go to 16 bit
int8x8_t res64;
__m128i r;
r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
r = _mm_srai_epi16 (r, b); //SSE2
r = _mm_packs_epi16 (r,r); //we need 64 bits only
return64(r);
}
_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b)
{
int16x4_t res64;
return64(_mm_srai_epi16(_pM128i(a), b));
}
_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b)
{
int32x2_t res64;
return64(_mm_srai_epi32(_pM128i(a), b));
}
_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
//no arithmetic shift for 64bit values, serial solution used
int64x1_t res;
if(b>=64) res.m64_i64[0] = 0;
else res.m64_i64[0] = (*(int64_t*)&a) >> b;
return res;
}
_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
{
//no 8 bit shift available, go to 16 bit
uint8x8_t res64;
__m128i r;
r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
r = _mm_packus_epi16 (r,r); //we need 64 bits only
return64(r);
}
_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
{
uint16x4_t res64;
return64(_mm_srli_epi16(_pM128i(a), b));
}
_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
{
uint32x2_t res64;
return64(_mm_srli_epi32(_pM128i(a), b));
}
_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
{
uint64x1_t res64;
return64(_mm_srli_epi64(_pM128i(a), b));
}
_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
{
//no 8 bit shift available, go to 16 bit trick
__m128i zero, mask0, a_sign, r, a_sign_mask;
_NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff};
zero = _mm_setzero_si128();
mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
r = _mm_srai_epi16 (a, b);
a_sign_mask = _mm_and_si128 (mask0, a_sign);
r = _mm_andnot_si128 (mask0, r);
return _mm_or_si128 (r, a_sign_mask);
}
_NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
#define vshrq_n_s16 _mm_srai_epi16
_NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
#define vshrq_n_s32 _mm_srai_epi32
_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
{
//SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD
__m128i c1, signmask,a0, res64;
_NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
signmask = _mm_slli_epi64 (c1, (64 - b));
a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
a0 = _MM_CMPEQ_EPI64 (a, a0);
signmask = _mm_and_si128(a0, signmask);
res64 = _mm_srli_epi64 (a, b);
return _mm_or_si128(res64, signmask);
}
_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
{
//no 8 bit shift available, need the special trick
__m128i mask0, r;
_NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00};
mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
r = _mm_srli_epi16 ( a, b);
return _mm_and_si128 (r, mask0);
}
_NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
#define vshrq_n_u16 _mm_srli_epi16
_NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
#define vshrq_n_u32 _mm_srli_epi32
_NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
#define vshrq_n_u64 _mm_srli_epi64
//*************************** Vector shift left by constant *************************
//*********************************************************************************
_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
{
//no 8 bit shift available, go to 16 bit
int8x8_t res64;
__m128i r;
r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
r = _mm_slli_epi16 (r, b); //SSE2
r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
return64(r);
}
_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b)
{
int16x4_t res64;
return64(_mm_slli_epi16(_pM128i(a), b));
}
_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b)
{
int32x2_t res64;
return64(_mm_slli_epi32(_pM128i(a), b));
}
_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b)
{
int64x1_t res64;
return64(_mm_slli_epi64(_pM128i(a), b));
}
_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
{
//no 8 bit shift available, go to 16 bit
uint8x8_t res64;
__m128i mask8;
__m128i r;
mask8 = _mm_set1_epi16(0xff);
r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
r = _mm_slli_epi16 (r, b); //SSE2
r = _mm_and_si128(r, mask8); //to avoid saturation
r = _mm_packus_epi16 (r,r); //we need 64 bits only
return64(r);
}
_NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
#define vshl_n_u16 vshl_n_s16
_NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
#define vshl_n_u32 vshl_n_s32
_NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
#define vshl_n_u64 vshl_n_s64
_NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
#define vshlq_n_s8 vshlq_n_u8
_NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
#define vshlq_n_s16 _mm_slli_epi16
_NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
#define vshlq_n_s32 _mm_slli_epi32
_NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
#define vshlq_n_s64 _mm_slli_epi64
_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
{
//no 8 bit shift available, need the special trick
__m128i mask0, r;
_NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff};
mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
r = _mm_slli_epi16 ( a, b);
return _mm_and_si128 (r, mask0);
}
_NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
#define vshlq_n_u16 vshlq_n_s16
_NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
#define vshlq_n_u32 vshlq_n_s32
_NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
#define vshlq_n_u64 vshlq_n_s64
//************* Vector rounding shift right by constant ******************
//*************************************************************************
//No corresponding x86 intrinsics exist, need to do some tricks
_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
{
//no 8 bit shift available, go to 16 bit
int8x8_t res64;
__m128i r, maskb;
r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
r = _mm_srai_epi16 (r, b);
r = _mm_add_epi16 (r, maskb); //actual rounding
r = _mm_packs_epi16 (r,r); ////we need 64 bits only
return64(r);
}
_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b)
{
int16x4_t res64;
return64(vrshrq_n_s16(_pM128i(a), b));
}
_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b)
{
int32x2_t res64;
return64(vrshrq_n_s32(_pM128i(a), b));
}
_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
//serial solution is faster
int64x1_t res;
int64_t a_i64 = *( int64_t*)&a;
if(b==64) {
res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63;
} else {
int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
}
return res;
}
_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
{
//no 8 bit shift available, go to 16 bit, solution may be not optimal compared with the serial one
uint8x8_t res64;
__m128i r, maskb;
r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
r = _mm_srli_epi16 (r, b);
r = _mm_add_epi16 (r, maskb); //actual rounding
r = _mm_packus_epi16 (r,r); ////we need 64 bits only
return64(r);
}
_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
{
uint16x4_t res64;
return64(vrshrq_n_u16(_pM128i(a), b));
}
_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
{
uint32x2_t res64;
return64(vrshrq_n_u32(_pM128i(a), b));
}
_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
{
uint64x1_t res64;
return64(vrshrq_n_u64(_pM128i(a), b));
}
_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
{
//no 8 bit shift available, go to 16 bit trick
__m128i r, mask1, maskb;
_NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
r = vshrq_n_s8 (a, b);
mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
return _mm_add_epi8(r, maskb); //actual rounding
}
_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
{
__m128i maskb, r;
maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
maskb = _mm_srli_epi16(maskb, 15); //1 or 0
r = _mm_srai_epi16 (a, b);
return _mm_add_epi16 (r, maskb); //actual rounding
}
_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
{
__m128i maskb, r;
maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
maskb = _mm_srli_epi32 (maskb,31); //1 or 0
r = _mm_srai_epi32(a, b);
return _mm_add_epi32 (r, maskb); //actual rounding
}
_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
{
//solution may be not optimal compared with a serial one
__m128i maskb;
int64x2_t r;
maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
maskb = _mm_srli_epi64 (maskb,63); //1 or 0
r = vshrq_n_s64(a, b);
return _mm_add_epi64 (r, maskb); //actual rounding
}
_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
{
//no 8 bit shift available, go to 16 bit trick
__m128i r, mask1, maskb;
_NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
r = vshrq_n_u8 (a, b);
mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
return _mm_add_epi8(r, maskb); //actual rounding
}
_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
{
__m128i maskb, r;
maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
maskb = _mm_srli_epi16(maskb, 15); //1 or 0
r = _mm_srli_epi16 (a, b);
return _mm_add_epi16 (r, maskb); //actual rounding
}
_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
{
__m128i maskb, r;
maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
maskb = _mm_srli_epi32 (maskb,31); //1 or 0
r = _mm_srli_epi32(a, b);
return _mm_add_epi32 (r, maskb); //actual rounding
}
_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
{
//solution may be not optimal compared with a serial one
__m128i maskb, r;
maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
maskb = _mm_srli_epi64 (maskb,63); //1 or 0
r = _mm_srli_epi64(a, b);
return _mm_add_epi64 (r, maskb); //actual rounding
}
//************* Vector shift right by constant and accumulate *********
//*********************************************************************
_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
{
int8x8_t shift;
shift = vshr_n_s8(b, c);
return vadd_s8( a, shift);
}
_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
{
int16x4_t shift;
shift = vshr_n_s16( b, c);
return vadd_s16(a, shift);
}
_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
{
//may be not optimal compared with the serial execution
int32x2_t shift;
shift = vshr_n_s32(b, c);
return vadd_s32( a, shift);
}
_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
{
//may be not optimal compared with a serial solution
int64x1_t shift;
shift = vshr_n_s64(b, c);
return vadd_s64( a, shift);
}
_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
{
uint8x8_t shift;
shift = vshr_n_u8(b, c);
return vadd_u8(a, shift);
}
_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
{
uint16x4_t shift;
shift = vshr_n_u16(b, c);
return vadd_u16(a,shift);
}
_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
{
//may be not optimal compared with the serial execution
uint32x2_t shift;
shift = vshr_n_u32(b, c);
return vadd_u32( a, shift);
}
_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
{
//may be not optimal compared with the serial execution
uint64x1_t shift;
shift = vshr_n_u64(b, c);
return vadd_u64(a, shift);
}
_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
{
int8x16_t shift;
shift = vshrq_n_s8(b, c);
return vaddq_s8(a, shift);
}
_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
{
int16x8_t shift;
shift = vshrq_n_s16(b, c);
return vaddq_s16(a, shift);
}
_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
{
int32x4_t shift;
shift = vshrq_n_s32(b, c);
return vaddq_s32(a, shift);
}
_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
{
int64x2_t shift;
shift = vshrq_n_s64(b, c);
return vaddq_s64( a, shift);
}
_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
{
uint8x16_t shift;
shift = vshrq_n_u8(b, c);
return vaddq_u8(a, shift);
}
_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
{
uint16x8_t shift;
shift = vshrq_n_u16(b, c);
return vaddq_u16(a, shift);
}
_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
{
uint32x4_t shift;
shift = vshrq_n_u32(b, c);
return vaddq_u32(a, shift);
}
_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
{
uint64x2_t shift;
shift = vshrq_n_u64(b, c);
return vaddq_u64(a, shift);
}
//************* Vector rounding shift right by constant and accumulate ****************************
//************************************************************************************************
_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
{
int8x8_t shift;
shift = vrshr_n_s8(b, c);
return vadd_s8( a, shift);
}
_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
{
int16x4_t shift;
shift = vrshr_n_s16( b, c);
return vadd_s16(a, shift);
}
_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
{
//may be not optimal compared with the serial execution
int32x2_t shift;
shift = vrshr_n_s32(b, c);
return vadd_s32( a, shift);
}
_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
{
int64x1_t shift;
shift = vrshr_n_s64(b, c);
return vadd_s64( a, shift);
}
_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
{
uint8x8_t shift;
shift = vrshr_n_u8(b, c);
return vadd_u8(a, shift);
}
_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
{
uint16x4_t shift;
shift = vrshr_n_u16(b, c);
return vadd_u16(a,shift);
}
_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
{
//may be not optimal compared with the serial execution
uint32x2_t shift;
shift = vrshr_n_u32(b, c);
return vadd_u32( a, shift);
}
_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
{
//may be not optimal compared with the serial execution
uint64x1_t shift;
shift = vrshr_n_u64(b, c);
return vadd_u64( a, shift);
}
_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
{
int8x16_t shift;
shift = vrshrq_n_s8(b, c);
return vaddq_s8(a, shift);
}
_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
{
int16x8_t shift;
shift = vrshrq_n_s16(b, c);
return vaddq_s16(a, shift);
}
_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
{
int32x4_t shift;
shift = vrshrq_n_s32(b, c);
return vaddq_s32(a, shift);
}
_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
{
int64x2_t shift;
shift = vrshrq_n_s64(b, c);
return vaddq_s64(a, shift);
}
_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
{
uint8x16_t shift;
shift = vrshrq_n_u8(b, c);
return vaddq_u8(a, shift);
}
_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
{
uint16x8_t shift;
shift = vrshrq_n_u16(b, c);
return vaddq_u16(a, shift);
}
_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
{
uint32x4_t shift;
shift = vrshrq_n_u32(b, c);
return vaddq_u32(a, shift);
}
_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
{
uint64x2_t shift;
shift = vrshrq_n_u64(b, c);
return vaddq_u64(a, shift);
}
//**********************Vector saturating shift left by constant *****************************
//********************************************************************************************
//we don't check const ranges assuming they are met
_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
{
//no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
int8x8_t res64;
__m128i a128, r128;
a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
r128 = _mm_slli_epi16 (a128, b);
r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
return64(r128);
}
_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
{
// go to 32 bit to get the auto saturation (in packs function)
int16x4_t res64;
__m128i a128, r128;
a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
r128 = _mm_slli_epi32 (a128, b); //shift_res
r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
return64(r128);
}
_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b)
{
//serial execution may be faster
int32x2_t res64;
return64(vqshlq_n_s32 (_pM128i(a), b));
}
_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
// no effective SIMD solution here
int64x1_t res;
int64_t bmask;
int64_t a_i64 = *( int64_t*)&a;
bmask = ( int64_t)1 << (63 - b); //positive
if (a_i64 >= bmask) {
res.m64_i64[0] = ~(_SIGNBIT64);
} else {
res.m64_i64[0] = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
}
return res;
}
_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
{
//no 8 bit shift available in IA32 SIMD, go to 16 bit
uint8x8_t res64;
__m128i a128, r128;
a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
r128 = _mm_slli_epi16 (a128, b); //shift_res
r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
return64(r128);
}
_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
{
// go to 32 bit to get the auto saturation (in packus function)
uint16x4_t res64;
__m128i a128, r128;
a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
r128 = _mm_slli_epi32 (a128, b); //shift_res
r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16
return64(r128);
}
_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b)
{
uint32x2_t res64;
return64(vqshlq_n_u32(_pM128i(a), b));
}
_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
// no effective SIMD solution here
uint64x1_t res;
uint64_t bmask;
uint64_t a_i64 = *(uint64_t*)&a;
bmask = ( uint64_t)1 << (64 - b);
res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
return res;
}
_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
{
// go to 16 bit to get the auto saturation (in packs function)
__m128i a128, r128_1, r128_2;
a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
r128_1 = _mm_slli_epi16 (a128, b);
//swap hi and low part of a128 to process the remaining data
a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
a128 = _MM_CVTEPI8_EPI16 (a128);
r128_2 = _mm_slli_epi16 (a128, b);
return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
}
_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
{
// manual saturation solution looks LESS optimal than 32 bits conversion one
// go to 32 bit to get the auto saturation (in packs function)
__m128i a128, r128_1, r128_2;
a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
r128_1 = _mm_slli_epi32 (a128, b); //shift_res
//swap hi and low part of a128 to process the remaining data
a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
a128 = _MM_CVTEPI16_EPI32 (a128);
r128_2 = _mm_slli_epi32 (a128, b);
return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
}
_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
{
// no 64 bit saturation option available, special tricks necessary
__m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers (32-b+1) zeros and b-1 ones
saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise
c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
shift_res = _mm_slli_epi32 (a, b);
shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
//result with positive numbers saturated
shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
//treat negative numbers
maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros
saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise
c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
}
_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
// no effective SIMD solution here
_NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
int64_t bmask;
int i;
bmask = ( int64_t)1 << (63 - b); //positive
_mm_store_si128((__m128i*)atmp, a);
for (i = 0; i<2; i++) {
if (atmp[i] >= bmask) {
res[i] = ~(_SIGNBIT64);
} else {
res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
}
}
return _mm_load_si128((__m128i*)res);
}
_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
{
// go to 16 bit to get the auto saturation (in packs function)
__m128i a128, r128_1, r128_2;
a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
r128_1 = _mm_slli_epi16 (a128, b);
//swap hi and low part of a128 to process the remaining data
a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
a128 = _MM_CVTEPU8_EPI16 (a128);
r128_2 = _mm_slli_epi16 (a128, b);
return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
}
_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
{
// manual saturation solution looks more optimal than 32 bits conversion one
__m128i cb, c8000, a_signed, saturation_mask, shift_res;
cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
c8000 = _mm_set1_epi16 ((int16_t)0x8000);
//no unsigned shorts comparison in SSE, only signed available, so need the trick
a_signed = _mm_sub_epi16(a, c8000); //go to signed
saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
shift_res = _mm_slli_epi16 (a, b);
return _mm_or_si128 (shift_res, saturation_mask);
}
_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
{
// manual saturation solution, no 64 bit saturation option, the serial version may be faster
__m128i cb, c80000000, a_signed, saturation_mask, shift_res;
cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
c80000000 = _mm_set1_epi32 (0x80000000);
//no unsigned ints comparison in SSE, only signed available, so need the trick
a_signed = _mm_sub_epi32(a, c80000000); //go to signed
saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
shift_res = _mm_slli_epi32 (a, b);
return _mm_or_si128 (shift_res, saturation_mask);
}
_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
// no effective SIMD solution here
_NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
uint64_t bmask;
int i;
bmask = ( uint64_t)1 << (64 - b);
_mm_store_si128((__m128i*)atmp, a);
for (i = 0; i<2; i++) {
res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
}
return _mm_load_si128((__m128i*)res);
}
//**************Vector signed->unsigned saturating shift left by constant *************
//*************************************************************************************
_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
{
//no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
uint8x8_t res64;
__m128i a128, r128;
a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
r128 = _mm_slli_epi16 (a128, b);
r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
return64(r128);
}
_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
{
uint16x4_t res64;
__m128i a128, r128;
a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
r128 = _mm_slli_epi32 (a128, b); //shift_res
r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only
return64(r128);
}
_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
_NEON2SSE_INLINE int32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b)
{
int32x2_t res64;
return64( vqshluq_n_s32(_pM128i(a), b));
}
_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
{
uint64x1_t res;
uint64_t limit;
if (a.m64_i64[0]<=0) {
res.m64_u64[0] = 0;
} else {
limit = (uint64_t) 1 << (64 - b);
res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? res.m64_u64[0] = ~((uint64_t)0) : a.m64_i64[0] << b;
}
return res;
}
_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
{
__m128i a128, r128_1, r128_2;
a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
r128_1 = _mm_slli_epi16 (a128, b);
//swap hi and low part of a128 to process the remaining data
a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
a128 = _MM_CVTEPI8_EPI16 (a128);
r128_2 = _mm_slli_epi16 (a128, b);
return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
}
_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
{
// manual saturation solution looks LESS optimal than 32 bits conversion one
__m128i a128, r128_1, r128_2;
a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
r128_1 = _mm_slli_epi32 (a128, b); //shift_res
//swap hi and low part of a128 to process the remaining data
a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
a128 = _MM_CVTEPI16_EPI32 (a128);
r128_2 = _mm_slli_epi32 (a128, b);
return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16
}
_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
{
//solution may be not optimal compared with the serial one
__m128i zero, maskA, maskGT0, a0, a_masked, a_shift;
zero = _mm_setzero_si128();
maskA = _mm_cmpeq_epi32(a, a);
maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
//saturate negative numbers to zero
maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers)
a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now
//saturate positive to 0xffffffff
a_masked = _mm_and_si128 (a0, maskA);
a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
a_shift = _mm_slli_epi32 (a0, b);
return _mm_or_si128 (a_shift, a_masked); //actual saturation
}
_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
// no effective SIMD solution here, serial execution looks faster
_NEON2SSE_ALIGN_16 int64_t atmp[2];
_NEON2SSE_ALIGN_16 uint64_t res[2];
uint64_t limit;
int i;
_mm_store_si128((__m128i*)atmp, a);
for (i = 0; i<2; i++) {
if (atmp[i]<=0) {
res[i] = 0;
} else {
limit = (uint64_t) 1 << (64 - b);
res[i] = ( ((uint64_t)atmp[i]) >= limit) ? res[i] = ~((uint64_t)0) : atmp[i] << b;
}
}
return _mm_load_si128((__m128i*)res);
}
//************** Vector narrowing shift right by constant **************
//**********************************************************************
_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
{
int8x8_t res64;
__m128i r16;
r16 = vshrq_n_s16(a,b);
r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
return64(r16);
}
_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
{
int16x4_t res64;
__m128i r32;
r32 = vshrq_n_s32(a,b);
r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
return64(r32);
}
_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
{
int32x2_t res64;
__m128i r64;
r64 = vshrq_n_s64(a,b);
r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
return64(r64);
}
_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
{
uint8x8_t res64;
__m128i mask, r16;
mask = _mm_set1_epi16(0xff);
r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
r16 = _mm_and_si128(r16, mask); //to avoid saturation
r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
return64(r16);
}
_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
{
uint16x4_t res64;
__m128i mask, r32;
mask = _mm_set1_epi32(0xffff);
r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
r32 = _mm_and_si128(r32, mask); //to avoid saturation
r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
return64(r32);
}
_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
{
uint32x2_t res64;
__m128i r64;
r64 = vshrq_n_u64(a,b);
r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
return64(r64);
}
//************** Vector signed->unsigned narrowing saturating shift right by constant ********
//*********************************************************************************************
_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
{
uint8x8_t res64;
__m128i r16;
r16 = vshrq_n_s16(a,b);
r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
return64(r16);
}
_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
{
uint16x4_t res64;
__m128i r32;
r32 = vshrq_n_s32(a,b);
r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only
return64(r32);
}
_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
{
_NEON2SSE_ALIGN_16 int64_t atmp[2];
uint32x2_t res;
int64_t res64;
_mm_store_si128((__m128i*)atmp, a);
if (atmp[0] < 0) {
res.m64_u32[0] = 0;
} else {
res64 = (atmp[0] >> b);
res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
}
if (atmp[1] < 0) {
res.m64_u32[1] = 0;
} else {
res64 = (atmp[1] >> b);
res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
}
return res;
}
//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
{
//solution may be not optimal compared with the serial one
__m128i r16;
uint8x8_t res64;
r16 = vrshrq_n_s16(a,b);
r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
return64(r16);
}
_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
{
//solution may be not optimal compared with the serial one
__m128i r32;
uint16x4_t res64;
r32 = vrshrq_n_s32(a,b);
r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
return64(r32);
}
_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
{
_NEON2SSE_ALIGN_16 int64_t atmp[2];
uint32x2_t res;
int64_t res64;
_mm_store_si128((__m128i*)atmp, a);
if (atmp[0] < 0) {
res.m64_u32[0] = 0;
} else {
res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
}
if (atmp[1] < 0) {
res.m64_u32[1] = 0;
} else {
res64 = (atmp[1] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
}
return res;
}
//***** Vector narrowing saturating shift right by constant ******
//*****************************************************************
_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
{
int8x8_t res64;
__m128i r16;
r16 = vshrq_n_s16(a,b);
r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
return64(r16);
}
_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
{
int16x4_t res64;
__m128i r32;
r32 = vshrq_n_s32(a,b);
r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
return64(r32);
}
_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
//no optimal SIMD solution found
_NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
int32x2_t res;
_mm_store_si128((__m128i*)atmp, a);
res64[0] = (atmp[0] >> b);
res64[1] = (atmp[1] >> b);
if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
res.m64_i32[0] = (int32_t)res64[0];
res.m64_i32[1] = (int32_t)res64[1];
return res;
}
_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
{
uint8x8_t res64;
__m128i r16;
r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
return64(r16);
}
_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
{
uint16x4_t res64;
__m128i r32;
r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
return64(r32);
}
_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
{
//serial solution may be faster
uint32x2_t res64;
__m128i r64, res_hi, zero;
zero = _mm_setzero_si128();
r64 = vshrq_n_u64(a,b);
res_hi = _mm_srli_epi64(r64, 32);
res_hi = _mm_cmpgt_epi32(res_hi, zero);
r64 = _mm_or_si128(r64, res_hi);
r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
return64(r64);
}
//********* Vector rounding narrowing shift right by constant *************************
//****************************************************************************************
_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
{
int8x8_t res64;
__m128i r16;
r16 = vrshrq_n_s16(a,b);
r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
return64(r16);
}
_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
{
int16x4_t res64;
__m128i r32;
r32 = vrshrq_n_s32(a,b);
r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
return64(r32);
}
_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
{
int32x2_t res64;
__m128i r64;
r64 = vrshrq_n_s64(a,b);
r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
return64(r64);
}
_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
{
uint8x8_t res64;
__m128i mask, r16;
mask = _mm_set1_epi16(0xff);
r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
r16 = _mm_and_si128(r16, mask); //to avoid saturation
r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
return64(r16);
}
_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
{
uint16x4_t res64;
__m128i mask, r32;
mask = _mm_set1_epi32(0xffff);
r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
r32 = _mm_and_si128(r32, mask); //to avoid saturation
r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
return64(r32);
}
_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
{
uint32x2_t res64;
__m128i r64;
r64 = vrshrq_n_u64(a,b);
r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
return64(r64);
}
//************* Vector rounding narrowing saturating shift right by constant ************
//****************************************************************************************
_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
{
int8x8_t res64;
__m128i r16;
r16 = vrshrq_n_s16(a,b);
r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
return64(r16);
}
_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
{
int16x4_t res64;
__m128i r32;
r32 = vrshrq_n_s32(a,b);
r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
return64(r32);
}
_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
//no optimal SIMD solution found
_NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
int32x2_t res;
_mm_store_si128((__m128i*)atmp, a);
maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
res.m64_i32[0] = (int32_t)res64[0];
res.m64_i32[1] = (int32_t)res64[1];
return res;
}
_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
{
uint8x8_t res64;
__m128i r16;
r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
return64(r16);
}
_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
{
uint16x4_t res64;
__m128i r32;
r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
return64(r32);
}
_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
{
//serial solution may be faster
uint32x2_t res64;
__m128i r64, res_hi, zero;
zero = _mm_setzero_si128();
r64 = vrshrq_n_u64(a,b);
res_hi = _mm_srli_epi64(r64, 32);
res_hi = _mm_cmpgt_epi32(res_hi, zero);
r64 = _mm_or_si128(r64, res_hi);
r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
return64(r64);
}
//************** Vector widening shift left by constant ****************
//************************************************************************
_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
{
__m128i r;
r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
return _mm_slli_epi16 (r, b);
}
_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
{
__m128i r;
r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
return _mm_slli_epi32 (r, b);
}
_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
{
__m128i r;
r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
return _mm_slli_epi64 (r, b);
}
_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
{
//no uint8 to uint16 conversion available, manual conversion used
__m128i zero, r;
zero = _mm_setzero_si128 ();
r = _mm_unpacklo_epi8(_pM128i(a), zero);
return _mm_slli_epi16 (r, b);
}
_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
{
//no uint16 to uint32 conversion available, manual conversion used
__m128i zero, r;
zero = _mm_setzero_si128 ();
r = _mm_unpacklo_epi16(_pM128i(a), zero);
return _mm_slli_epi32 (r, b);
}
_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
{
//no uint32 to uint64 conversion available, manual conversion used
__m128i zero, r;
zero = _mm_setzero_si128 ();
r = _mm_unpacklo_epi32(_pM128i(a), zero);
return _mm_slli_epi64 (r, b);
}
//************************************************************************************
//**************************** Shifts with insert ************************************
//************************************************************************************
//takes each element in a vector, shifts them by an immediate value,
//and inserts the results in the destination vector. Bits shifted out of the each element are lost.
//**************** Vector shift right and insert ************************************
//Actually the "c" left bits from "a" are the only bits remained from "a" after the shift.
//All other bits are taken from b shifted.
_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c)
{
int8x8_t res64;
return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
}
_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c)
{
int16x4_t res64;
return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
}
_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c)
{
int32x2_t res64;
return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
}
_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
{
int64x1_t res;
if (c ==64)
res = a;
else{
res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros
}
return res;
}
_NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
#define vsri_n_u8 vsri_n_s8
_NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
#define vsri_n_u16 vsri_n_s16
_NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
#define vsri_n_u32 vsri_n_s32
_NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
#define vsri_n_u64 vsri_n_s64
_NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
#define vsri_n_p8 vsri_n_u8
_NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
#define vsri_n_p16 vsri_n_u16
_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
{
__m128i maskA, a_masked;
uint8x16_t b_shift;
_NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
a_masked = _mm_and_si128 (a, maskA);
b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
}
_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
{
//to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a
uint16x8_t b_shift;
uint16x8_t a_c;
b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
a_c = vshrq_n_u16( a, (16 - c));
a_c = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a
return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
}
_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
{
//to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a
uint32x4_t b_shift;
uint32x4_t a_c;
b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
a_c = vshrq_n_u32( a, (32 - c));
a_c = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a
return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
}
_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
{
//serial solution may be faster
uint64x2_t b_shift;
uint64x2_t a_c;
b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
a_c = _mm_srli_epi64(a, (64 - c));
a_c = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a
return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
}
_NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
#define vsriq_n_u8 vsriq_n_s8
_NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
#define vsriq_n_u16 vsriq_n_s16
_NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
#define vsriq_n_u32 vsriq_n_s32
_NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
#define vsriq_n_u64 vsriq_n_s64
_NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
#define vsriq_n_p8 vsriq_n_u8
_NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
#define vsriq_n_p16 vsriq_n_u16
//***** Vector shift left and insert *********************************************
//*********************************************************************************
//Actually the "c" right bits from "a" are the only bits remained from "a" after the shift.
//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted".
_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c)
{
int8x8_t res64;
return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
}
_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c)
{
int16x4_t res64;
return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
}
_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c)
{
int32x2_t res64;
return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
}
_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
{
int64x1_t res;
res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
return res;
}
_NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
#define vsli_n_u8 vsli_n_s8
_NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
#define vsli_n_u16 vsli_n_s16
_NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
#define vsli_n_u32 vsli_n_s32
_NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
#define vsli_n_u64 vsli_n_s64
_NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
#define vsli_n_p8 vsli_n_u8
_NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
#define vsli_n_p16 vsli_n_u16
_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
{
__m128i maskA, a_masked;
int8x16_t b_shift;
_NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
b_shift = vshlq_n_s8( b, c);
a_masked = _mm_and_si128 (a, maskA);
return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
}
_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
{
//to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
int16x8_t b_shift;
int16x8_t a_c;
b_shift = vshlq_n_s16( b, c);
a_c = vshlq_n_s16( a, (16 - c));
a_c = _mm_srli_epi16(a_c, (16 - c));
return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
}
_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
{
//solution may be not optimal compared with the serial one
//to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
int32x4_t b_shift;
int32x4_t a_c;
b_shift = vshlq_n_s32( b, c);
a_c = vshlq_n_s32( a, (32 - c));
a_c = _mm_srli_epi32(a_c, (32 - c));
return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
}
_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
{
//solution may be not optimal compared with the serial one
//to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
int64x2_t b_shift;
int64x2_t a_c;
b_shift = vshlq_n_s64( b, c);
a_c = vshlq_n_s64( a, (64 - c));
a_c = _mm_srli_epi64(a_c, (64 - c));
return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
}
_NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
#define vsliq_n_u8 vsliq_n_s8
_NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
#define vsliq_n_u16 vsliq_n_s16
_NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
#define vsliq_n_u32 vsliq_n_s32
_NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
#define vsliq_n_u64 vsliq_n_s64
_NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
#define vsliq_n_p8 vsliq_n_u8
_NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
#define vsliq_n_p16 vsliq_n_u16
// ***********************************************************************************************
// ****************** Loads and stores of a single vector ***************************************
// ***********************************************************************************************
//Performs loads and stores of a single vector of some type.
//******************************* Loads ********************************************************
// ***********************************************************************************************
//We assume ptr is NOT aligned in general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr);.
//also for SSE3 supporting systems the __m128i _mm_lddqu_si128 (__m128i const* p) usage for unaligned access may be advantageous.
// it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access
//If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead;
#define LOAD_SI128(ptr) \
( ((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr))
_NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
#define vld1q_u8 LOAD_SI128
_NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
#define vld1q_u16 LOAD_SI128
_NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
#define vld1q_u32 LOAD_SI128
_NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld1q_u64 LOAD_SI128
_NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
#define vld1q_s8 LOAD_SI128
_NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
#define vld1q_s16 LOAD_SI128
_NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
#define vld1q_s32 LOAD_SI128
_NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld1q_s64 LOAD_SI128
_NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
__m128 f2;
f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
}*/
_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
{
if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16 bits aligned
return _mm_load_ps(ptr);
else
return _mm_loadu_ps(ptr);
}
_NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
#define vld1q_p8 LOAD_SI128
_NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
#define vld1q_p16 LOAD_SI128
_NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
_NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
#define vld1_u16 vld1_u8
_NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
#define vld1_u32 vld1_u8
_NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
#define vld1_u64 vld1_u8
_NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
#define vld1_s8 vld1_u8
_NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
#define vld1_s16 vld1_u16
_NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
#define vld1_s32 vld1_u32
_NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
#define vld1_s64 vld1_u64
_NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
{
float32x2_t res;
res.m64_f32[0] = *(ptr);
res.m64_f32[1] = *(ptr + 1);
return res;
}
_NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
#define vld1_p8 vld1_u8
_NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
#define vld1_p16 vld1_u16
_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr)
{
if ((((uintptr_t)(ptr)) & 15) == 0) //16 bits aligned
return _mm_load_pd(ptr);
else
return _mm_loadu_pd(ptr);
}
//***********************************************************************************************************
//******* Lane load functions - insert the data at vector's given position (lane) *************************
//***********************************************************************************************************
_NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
_NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
_NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
_NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p;
_NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
_NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
_NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
_NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16
_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
{
//we need to deal with ptr 16bit NOT aligned case
__m128 p;
p = _mm_set1_ps(*(ptr));
return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane));
}
_NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
_NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
_NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
{
uint8x8_t res;
res = vec;
res.m64_u8[lane] = *(ptr);
return res;
}
_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
{
uint16x4_t res;
res = vec;
res.m64_u16[lane] = *(ptr);
return res;
}
_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
{
uint32x2_t res;
res = vec;
res.m64_u32[lane] = *(ptr);
return res;
}
_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
{
uint64x1_t res;
res.m64_u64[0] = *(ptr);
return res;
}
_NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
_NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
_NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
_NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16
_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
{
float32x2_t res;
res = vec;
res.m64_f32[lane] = *(ptr);
return res;
}
_NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
_NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1_lane_p8 vld1_lane_u8
_NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1_lane_p16 vld1_lane_s16
// ****************** Load single value ( set all lanes of vector with same value from memory)**********************
// ******************************************************************************************************************
_NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
_NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
_NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
{
_NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
return LOAD_SI128(val);
}
_NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
_NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
_NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
_NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
_NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
//current IA SIMD doesn't support float16, need to go to 32 bits
_NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
_NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
_NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
{
uint8x8_t res;
int i;
for(i = 0; i<8; i++) {
res.m64_u8[i] = *(ptr);
}
return res;
}
_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
{
uint16x4_t res;
int i;
for(i = 0; i<4; i++) {
res.m64_u16[i] = *(ptr);
}
return res;
}
_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
{
uint32x2_t res;
res.m64_u32[0] = *(ptr);
res.m64_u32[1] = *(ptr);
return res;
}
_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
{
uint64x1_t res;
res.m64_u64[0] = *(ptr);
return res;
}
_NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
_NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
_NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
_NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
_NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
//current IA SIMD doesn't support float16
_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
{
float32x2_t res;
res.m64_f32[0] = *(ptr);
res.m64_f32[1] = res.m64_f32[0];
return res; // use last 64bits only
}
_NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1_dup_p8 vld1_dup_u8
_NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1_dup_p16 vld1_dup_u16
//*************************************************************************************
//********************************* Store **********************************************
//*************************************************************************************
// If ptr is 16bit aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
//here we assume the case of NOT 16bit aligned ptr possible. If it is aligned we could to use _mm_store_si128 like shown in the following macro
#define STORE_SI128(ptr, val) \
(((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
_NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
#define vst1q_u8 STORE_SI128
_NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
#define vst1q_u16 STORE_SI128
_NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
#define vst1q_u32 STORE_SI128
_NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
#define vst1q_u64 STORE_SI128
_NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
#define vst1q_s8 STORE_SI128
_NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
#define vst1q_s16 STORE_SI128
_NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
#define vst1q_s32 STORE_SI128
_NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
#define vst1q_s64 STORE_SI128
_NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently
_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
{
if( ((uintptr_t)(ptr) & 15) == 0 ) //16 bits aligned
_mm_store_ps (ptr, val);
else
_mm_storeu_ps (ptr, val);
}
_NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
#define vst1q_p8 vst1q_u8
_NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
#define vst1q_p16 vst1q_u16
_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
{
int i;
for (i = 0; i<8; i++) {
*(ptr + i) = ((uint8_t*)&val)[i];
}
//_mm_storel_epi64((__m128i*)ptr, val);
return;
}
_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
{
int i;
for (i = 0; i<4; i++) {
*(ptr + i) = ((uint16_t*)&val)[i];
}
//_mm_storel_epi64((__m128i*)ptr, val);
return;
}
_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
{
int i;
for (i = 0; i<2; i++) {
*(ptr + i) = ((uint32_t*)&val)[i];
}
//_mm_storel_epi64((__m128i*)ptr, val);
return;
}
_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
{
*(ptr) = *((uint64_t*)&val);
//_mm_storel_epi64((__m128i*)ptr, val);
return;
}
_NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
_NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
_NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
_NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
_NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
//current IA SIMD doesn't support float16
_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
{
*(ptr) = val.m64_f32[0];
*(ptr + 1) = val.m64_f32[1];
return;
}
_NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
#define vst1_p8 vst1_u8
_NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
#define vst1_p16 vst1_u16
//***********Store a lane of a vector into memory (extract given lane) *********************
//******************************************************************************************
_NEON2SSESTORAGE void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane)
_NEON2SSESTORAGE void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane)
_NEON2SSESTORAGE void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
#define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane)
_NEON2SSESTORAGE void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
#define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane)
_NEON2SSESTORAGE void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane)
_NEON2SSESTORAGE void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane)
_NEON2SSESTORAGE void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
_NEON2SSESTORAGE void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
_NEON2SSESTORAGE void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16
_NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
{
int32_t ilane;
ilane = _MM_EXTRACT_PS(val,lane);
*(ptr) = *((float*)&ilane);
}
_NEON2SSESTORAGE void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1q_lane_p8 vst1q_lane_u8
_NEON2SSESTORAGE void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1q_lane_p16 vst1q_lane_s16
_NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
{
*(ptr) = val.m64_u8[lane];
}
_NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
{
*(ptr) = val.m64_u16[lane];
}
_NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
{
*(ptr) = val.m64_u32[lane];
}
_NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
{
*(ptr) = val.m64_u64[0];
}
_NEON2SSESTORAGE void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
_NEON2SSESTORAGE void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
_NEON2SSESTORAGE void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane)
_NEON2SSESTORAGE void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
_NEON2SSESTORAGE void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16
_NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
{
*(ptr) = val.m64_f32[lane];
}
_NEON2SSESTORAGE void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1_lane_p8 vst1_lane_u8
_NEON2SSESTORAGE void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1_lane_p16 vst1_lane_s16
//***********************************************************************************************
//**************** Loads and stores of an N-element structure **********************************
//***********************************************************************************************
//These intrinsics load or store an n-element structure. The array structures are defined in the beginning
//We assume ptr is NOT aligned in general case, for more details see "Loads and stores of a single vector functions"
//****************** 2 elements load *********************************************
_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
{
uint8x16x2_t v;
v.val[0] = vld1q_u8(ptr);
v.val[1] = vld1q_u8((ptr + 16));
v = vuzpq_s8(v.val[0], v.val[1]);
return v;
}
_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
{
uint16x8x2_t v;
v.val[0] = vld1q_u16( ptr);
v.val[1] = vld1q_u16( (ptr + 8));
v = vuzpq_s16(v.val[0], v.val[1]);
return v;
}
_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
{
uint32x4x2_t v;
v.val[0] = vld1q_u32 ( ptr);
v.val[1] = vld1q_u32 ( (ptr + 4));
v = vuzpq_s32(v.val[0], v.val[1]);
return v;
}
_NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
_NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
_NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
_NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
{
float32x4x2_t v;
v.val[0] = vld1q_f32 (ptr);
v.val[1] = vld1q_f32 ((ptr + 4));
v = vuzpq_f32(v.val[0], v.val[1]);
return v;
}
_NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
#define vld2q_p8 vld2q_u8
_NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
#define vld2q_p16 vld2q_u16
_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
{
uint8x8x2_t v;
__m128i ld128;
ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_16_even_odd);
vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
return v;
}
_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
{
_NEON2SSE_ALIGN_16 uint16x4x2_t v;
__m128i ld128;
ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit
ld128 = _mm_shuffle_epi8(ld128, *(__m128i*) mask8_32_even_odd);
vst1q_u16((v.val), ld128);
return v;
}
_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
{
_NEON2SSE_ALIGN_16 uint32x2x2_t v;
__m128i ld128;
ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit
ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
vst1q_u32((v.val), ld128);
return v;
}
_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
{
uint64x1x2_t v;
v.val[0].m64_u64[0] = *(ptr);
v.val[1].m64_u64[0] = *(ptr + 1);
return v;
}
_NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
_NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
_NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
_NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
_NEON2SSESTORAGE float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
{
float32x2x2_t v;
v.val[0].m64_f32[0] = *(ptr);
v.val[0].m64_f32[1] = *(ptr + 2);
v.val[1].m64_f32[0] = *(ptr + 1);
v.val[1].m64_f32[1] = *(ptr + 3);
return v;
}
_NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
#define vld2_p8 vld2_u8
_NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
#define vld2_p16 vld2_u16
//******************** Triplets ***************************************
//*********************************************************************
_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
{
//a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
//a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13
//a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14,
//a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15
uint8x16x3_t v;
__m128i tmp0, tmp1,tmp2, tmp3;
_NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
_NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
_NEON2SSE_ALIGN_16 static const int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11
tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15
tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
return v;
}
_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
{
//a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
uint16x8x3_t v;
__m128i tmp0, tmp1,tmp2, tmp3;
_NEON2SSE_ALIGN_16 static const int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
_NEON2SSE_ALIGN_16 static const int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
_NEON2SSE_ALIGN_16 static const int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
return v;
}
_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
{
//a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
uint32x4x3_t v;
__m128i tmp0, tmp1,tmp2, tmp3;
v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3,
v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
return v;
}
_NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
_NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
_NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
_NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
{
//a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
float32x4x3_t v;
__m128 tmp0, tmp1,tmp2, tmp3;
v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3,
v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
return v;
}
poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
#define vld3q_p8 vld3q_u8
_NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
#define vld3q_p16 vld3q_u16
_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
{
//a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
uint8x8x3_t v;
__m128i val0, val1, val2, tmp0, tmp1;
_NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
_NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
val0 = _mm_slli_si128(tmp0,10);
val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
_M64(v.val[0], val0);
val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
_M64(v.val[1], val1);
tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
_M64(v.val[2], val2);
return v;
}
_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
{
//a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
uint16x4x3_t v;
__m128i val0, val1, val2, tmp0, tmp1;
_NEON2SSE_ALIGN_16 static const int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3
val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3,
val0 = _mm_slli_si128(tmp0,10);
val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
_M64(v.val[0], val0);
val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
_M64(v.val[1], val1);
tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
tmp1 = _mm_srli_si128(tmp1,4);
tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
_M64(v.val[2], val2);
return v;
}
_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
{
//a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
uint32x2x3_t v;
__m128i val0, val1, val2;
val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
_M64(v.val[0], val0);
val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1,
val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
_M64(v.val[1], val1);
val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x,
_M64(v.val[2], val2);
return v;
}
_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
{
uint64x1x3_t v;
v.val[0].m64_u64[0] = *(ptr);
v.val[1].m64_u64[0] = *(ptr + 1);
v.val[2].m64_u64[0] = *(ptr + 2);
return v;
}
_NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
_NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
_NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
_NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
{
//a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
float32x2x3_t v;
v.val[0].m64_f32[0] = *(ptr);
v.val[0].m64_f32[1] = *(ptr + 3);
v.val[1].m64_f32[0] = *(ptr + 1);
v.val[1].m64_f32[1] = *(ptr + 4);
v.val[2].m64_f32[0] = *(ptr + 2);
v.val[2].m64_f32[1] = *(ptr + 5);
return v;
}
_NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
#define vld3_p8 vld3_u8
_NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
#define vld3_p16 vld3_u16
//*************** Quadruples load ********************************
//*****************************************************************
_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
{
uint8x16x4_t v;
__m128i tmp3, tmp2, tmp1, tmp0;
v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
return v;
}
_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
{
uint16x8x4_t v;
__m128i tmp3, tmp2, tmp1, tmp0;
tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7
tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
return v;
}
_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
{
uint32x4x4_t v;
__m128i tmp3, tmp2, tmp1, tmp0;
v.val[0] = vld1q_u32 (ptr);
v.val[1] = vld1q_u32 ((ptr + 4));
v.val[2] = vld1q_u32 ((ptr + 8));
v.val[3] = vld1q_u32 ((ptr + 12));
tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
return v;
}
_NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
_NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
_NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
_NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
{
float32x4x4_t v;
__m128 tmp3, tmp2, tmp1, tmp0;
v.val[0] = vld1q_f32 ((float*) ptr);
v.val[1] = vld1q_f32 ((float*) (ptr + 4));
v.val[2] = vld1q_f32 ((float*) (ptr + 8));
v.val[3] = vld1q_f32 ((float*) (ptr + 12));
tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
v.val[0] = _mm_movelh_ps(tmp0, tmp2);
v.val[1] = _mm_movehl_ps(tmp2, tmp0);
v.val[2] = _mm_movelh_ps(tmp1, tmp3);
v.val[3] = _mm_movehl_ps(tmp3, tmp1);
return v;
}
_NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
#define vld4q_p8 vld4q_u8
_NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
#define vld4q_p16 vld4q_s16
_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
{
uint8x8x4_t v;
__m128i sh0, sh1;
__m128i val0, val2;
_NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1]
val2 = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3]
sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
vst1q_u8(&v.val[0], val0 );
val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
vst1q_u8(&v.val[2], val2 );
return v;
}
_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
{
uint16x4x4_t v;
__m128i sh0, sh1;
__m128i val0, val2;
_NEON2SSE_ALIGN_16 static const int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1]
val2 = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3]
sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
vst1q_u16(&v.val[0], val0 );
val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
vst1q_u16(&v.val[2], val2 );
return v;
}
_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
{
//a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
uint32x2x4_t v;
__m128i val0, val01, val2;
val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
vst1q_u32(&v.val[0], val01);
vst1q_u32(&v.val[2], val2 );
return v;
}
_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
{
uint64x1x4_t v;
v.val[0].m64_u64[0] = *(ptr); //load first 64-bits in val[0] and val[1]
v.val[1].m64_u64[0] = *(ptr + 1); //load first 64-bits in val[0] and val[1]
v.val[2].m64_u64[0] = *(ptr + 2); //load third and forth 64-bits in val[2], val[3]
v.val[3].m64_u64[0] = *(ptr + 3); //load third and forth 64-bits in val[2], val[3]
return v;
}
_NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
_NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
_NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
_NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
{
//a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
float32x2x4_t res;
res.val[0].m64_f32[0] = *(ptr);
res.val[0].m64_f32[1] = *(ptr + 4);
res.val[1].m64_f32[0] = *(ptr + 1);
res.val[1].m64_f32[1] = *(ptr + 5);
res.val[2].m64_f32[0] = *(ptr + 2);
res.val[2].m64_f32[1] = *(ptr + 6);
res.val[3].m64_f32[0] = *(ptr + 3);
res.val[3].m64_f32[1] = *(ptr + 7);
return res;
}
_NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
#define vld4_p8 vld4_u8
_NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
#define vld4_p16 vld4_u16
//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
//*******************************************************************************************************************
_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
{
uint8x8x2_t v;
__m128i val0, val1;
val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
vst1q_u8(v.val, val0);
return v;
}
_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
{
uint16x4x2_t v;
__m128i val0, val1;
val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
_M64(v.val[0], val0);
val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
_M64(v.val[1], val1);
return v;
}
_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
{
uint32x2x2_t v;
__m128i val0;
val0 = LOAD_SI128(ptr); //0,1,x,x
val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
vst1q_u32(v.val, val0);
return v;
}
_NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld2_dup_u64 vld2_u64
_NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
_NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
_NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
_NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
_NEON2SSESTORAGE float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
{
float32x2x2_t v;
v.val[0].m64_f32[0] = *(ptr); //0,0
v.val[0].m64_f32[1] = *(ptr); //0,0
v.val[1].m64_f32[0] = *(ptr + 1); //1,1
v.val[1].m64_f32[1] = *(ptr + 1); //1,1
return v;
}
_NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
#define vld2_dup_p8 vld2_dup_u8
_NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
#define vld2_dup_p16 vld2_dup_s16
//************* Duplicate (or propagate)triplets: *******************
//********************************************************************
//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
{
uint8x8x3_t v;
__m128i val0, val1, val2;
val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
vst1q_u8(v.val, val0);
_M64(v.val[2], val2);
return v;
}
_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
{
uint16x4x3_t v;
__m128i val0, val1, val2;
val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
_M64(v.val[0], val0);
_M64(v.val[1], val1);
_M64(v.val[2], val2);
return v;
}
_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
{
uint32x2x3_t v;
__m128i val0, val1, val2;
val2 = LOAD_SI128(ptr); //0,1,2,x
val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
_M64(v.val[0], val0);
_M64(v.val[1], val1);
_M64(v.val[2], val2);
return v;
}
_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
{
uint64x1x3_t v;
v.val[0].m64_u64[0] = *(ptr);
v.val[1].m64_u64[0] = *(ptr + 1);
v.val[2].m64_u64[0] = *(ptr + 2);
return v;
}
_NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
_NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
_NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
_NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
{
float32x2x3_t v;
int i;
for (i = 0; i<3; i++) {
v.val[i].m64_f32[0] = *(ptr + i);
v.val[i].m64_f32[1] = *(ptr + i);
}
return v;
}
_NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_p8 vld3_dup_u8
_NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_p16 vld3_dup_s16
//************* Duplicate (or propagate) quadruples: *******************
//***********************************************************************
//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes
_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
{
uint8x8x4_t v;
__m128i val0, val1, val2;
val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
vst1q_u8(&v.val[0], val0);
vst1q_u8(&v.val[2], val2);
return v;
}
_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
{
uint16x4x4_t v;
__m128i val0, val1, val2, val3;
val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
_M64(v.val[0], val0);
_M64(v.val[1], val1);
_M64(v.val[2], val2);
_M64(v.val[3], val3);
return v;
}
_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
{
uint32x2x4_t v;
__m128i val0, val1, val2, val3;
val3 = LOAD_SI128(ptr); //0,1,2,3
val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2
_M64(v.val[0], val0);
_M64(v.val[1], val1);
_M64(v.val[2], val2);
_M64(v.val[3], val3);
return v;
}
_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
{
uint64x1x4_t v;
v.val[0].m64_u64[0] = *(ptr);
v.val[1].m64_u64[0] = *(ptr + 1);
v.val[2].m64_u64[0] = *(ptr + 2);
v.val[3].m64_u64[0] = *(ptr + 3);
return v;
}
_NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
_NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
_NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
_NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
{
float32x2x4_t v;
int i;
for (i = 0; i<4; i++) {
v.val[i].m64_f32[0] = *(ptr + i);
v.val[i].m64_f32[1] = *(ptr + i);
}
return v;
}
_NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_p8 vld4_dup_u8
_NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_p16 vld4_dup_u16
//**********************************************************************************
//*******************Lane loads for an N-element structures ***********************
//**********************************************************************************
//********************** Lane pairs ************************************************
//does vld1_lane_xx ptr[0] to src->val[0] at lane positon and ptr[1] to src->val[1] at lane positon
//we assume src is 16 bit aligned
//!!!!!! Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error
//to fix it the all functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined
//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
{
uint16x8x2_t v;
v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
return v;
}
#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
{
uint32x4x2_t v;
v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
return v;
}
#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
{
int16x8x2_t v;
v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
return v;
}
#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
{
int32x4x2_t v;
v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
return v;
}
#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
//current IA SIMD doesn't support float16
//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
{
float32x4x2_t v;
v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
return v;
}
#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
#define vld2q_lane_p16 vld2q_lane_u16
_NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
{
uint8x8x2_t v;
v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
return v;
}
_NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane)
{
uint16x4x2_t v;
v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
return v;
}
_NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane)
{
uint32x2x2_t v;
v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
return v;
}
_NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane)
_NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
_NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
//current IA SIMD doesn't support float16
_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane)
{
float32x2x2_t v;
v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
return v;
}
//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
#define vld2_lane_p8 vld2_lane_u8
//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
#define vld2_lane_p16 vld2_lane_u16
//*********** Lane triplets **********************
//*************************************************
//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at lane positon
//we assume src is 16 bit aligned
//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
{
uint16x8x3_t v;
v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
return v;
}
#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
{
uint32x4x3_t v;
v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
return v;
}
#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
{
int16x8x3_t v;
v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
return v;
}
#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
{
int32x4x3_t v;
v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
return v;
}
#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
_NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
//current IA SIMD doesn't support float16
#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
{
float32x4x3_t v;
v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
return v;
}
#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
_NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
#define vld3q_lane_p16 vld3q_lane_u16
_NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
{
uint8x8x3_t v;
v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
return v;
}
_NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
{
uint16x4x3_t v;
v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
return v;
}
_NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
{
//need to merge into 128 bit anyway
uint32x2x3_t v;
v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);;
v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);;
v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);;
return v;
}
_NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8(( uint8_t*) ptr, src, lane)
_NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16(( uint16_t*) ptr, src, lane)
_NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32(( uint32_t*) ptr, src, lane)
_NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
//current IA SIMD doesn't support float16
_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
{
float32x2x3_t v;
v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
return v;
}
_NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_p8 vld3_lane_u8
_NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_p16 vld3_lane_u16
//******************* Lane Quadruples load ***************************
//*********************************************************************
//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at lane positon
//we assume src is 16 bit aligned
//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
{
uint16x8x4_t v;
v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
return v;
}
#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
{
uint32x4x4_t v;
v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane);
return v;
}
#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane)
//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
//current IA SIMD doesn't support float16
//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
{
float32x4x4_t v;
v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
return v;
}
#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
#define vld4q_lane_p16 vld4q_lane_u16
_NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)
{
uint8x8x4_t v;
v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
v.val[3] = vld1_lane_u8((ptr + 3), src.val[3], lane);
return v;
}
_NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane)
{
uint16x4x4_t v;
v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
v.val[3] = vld1_lane_u16((ptr + 3), src.val[3], lane);
return v;
}
_NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane)
{
uint32x2x4_t v;
v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
v.val[3] = vld1_lane_u32((ptr + 3), src.val[3], lane);
return v;
}
_NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
#define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
_NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
_NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
//current IA SIMD doesn't support float16
_NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)
{
//serial solution may be faster
float32x2x4_t v;
v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
v.val[3] = vld1_lane_f32((ptr + 3), src.val[3], lane);
return v;
}
_NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
#define vld4_lane_p8 vld4_lane_u8
_NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
#define vld4_lane_p16 vld4_lane_u16
//******************* Store duplets *********************************************
//********************************************************************************
//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
{
uint8x16x2_t v;
v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
vst1q_u8 (ptr, v.val[0]);
vst1q_u8 ((ptr + 16), v.val[1]);
}
#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
{
uint16x8x2_t v;
v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
vst1q_u16 (ptr, v.val[0]);
vst1q_u16 ((ptr + 8), v.val[1]);
}
#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
{
uint32x4x2_t v;
v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
vst1q_u32 (ptr, v.val[0]);
vst1q_u32 ((ptr + 4), v.val[1]);
}
#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
#define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val)
//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
// IA32 SIMD doesn't work with 16bit floats currently
//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
{
float32x4x2_t v;
v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
vst1q_f32 (ptr, v.val[0]);
vst1q_f32 ((ptr + 4), v.val[1]);
}
#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
#define vst2q_p8 vst2q_u8
//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
#define vst2q_p16 vst2q_u16
_NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val)
{
__m128i v0;
v0 = _mm_unpacklo_epi8(_pM128i(val.val[0]), _pM128i(val.val[1]));
vst1q_u8 (ptr, v0);
}
_NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val)
{
__m128i v0;
v0 = _mm_unpacklo_epi16(_pM128i(val.val[0]), _pM128i(val.val[1]));
vst1q_u16 (ptr, v0);
}
_NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val)
{
__m128i v0;
v0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1]));
vst1q_u32 (ptr, v0);
}
_NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val)
{
*(ptr) = val.val[0].m64_u64[0];
*(ptr + 1) = val.val[1].m64_u64[0];
}
_NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
_NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
_NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
_NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
//current IA SIMD doesn't support float16
_NEON2SSESTORAGE void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_f32(__transfersize(4) float32_t* ptr, float32x2x2_t val)
{
*(ptr) = val.val[0].m64_f32[0];
*(ptr + 1) = val.val[1].m64_f32[0];
*(ptr + 2) = val.val[0].m64_f32[1];
*(ptr + 3) = val.val[1].m64_f32[1];
}
_NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
#define vst2_p8 vst2_u8
_NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
#define vst2_p16 vst2_u16
//******************** Triplets store *****************************************
//******************************************************************************
//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
{
uint8x16x3_t v;
__m128i v0,v1,v2, cff, bldmask;
_NEON2SSE_ALIGN_16 static const uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
_NEON2SSE_ALIGN_16 static const uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
_NEON2SSE_ALIGN_16 static const uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
_NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
_NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
_NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34
v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
cff = _mm_cmpeq_epi8(v0, v0); //all ff
bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
vst1q_u8(ptr, v.val[0]);
v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
vst1q_u8((ptr + 16), v.val[1]);
v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
vst1q_u8((ptr + 32), v.val[2]);
}
#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
{
uint16x8x3_t v;
__m128i v0,v1,v2, cff, bldmask;
_NEON2SSE_ALIGN_16 static const uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
_NEON2SSE_ALIGN_16 static const uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
_NEON2SSE_ALIGN_16 static const uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
_NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
_NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
_NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
cff = _mm_cmpeq_epi16(v0, v0); //all ff
bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
vst1q_u16(ptr, v.val[0]);
v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
vst1q_u16((ptr + 8), v.val[1]);
v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
vst1q_u16((ptr + 16), v.val[2]);
}
#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
{
//a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
uint32x4x3_t v;
__m128i tmp0, tmp1,tmp2;
tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
vst1q_u32(ptr, v.val[0]);
vst1q_u32((ptr + 4), v.val[1]);
vst1q_u32((ptr + 8), v.val[2]);
}
#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
_NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
_NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
_NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
#define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val)
//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
// IA32 SIMD doesn't work with 16bit floats currently
//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr