blob: 8dd7c5e802427e3b15d1fc065674b857a136e6ee [file] [log] [blame]
#include <stdio.h>
#include <assert.h>
#include <malloc.h> // memalign
#include <string.h> // memset
#include <math.h> // isnormal
typedef unsigned char UChar;
typedef unsigned short int UShort;
typedef unsigned int UInt;
typedef signed int Int;
typedef unsigned char UChar;
typedef unsigned long long int ULong;
typedef unsigned char Bool;
#define False ((Bool)0)
#define True ((Bool)1)
#define ITERS 1
typedef
enum { TySF=1234, TyDF, TyB, TyH, TyS, TyD, TyNONE }
LaneTy;
union _V128 {
UChar u8[16];
UShort u16[8];
UInt u32[4];
ULong u64[2];
float f32[4];
double f64[2];
};
typedef union _V128 V128;
static inline UChar randUChar ( void )
{
static UInt seed = 80021;
seed = 1103515245 * seed + 12345;
return (seed >> 17) & 0xFF;
}
static ULong randULong ( LaneTy ty )
{
Int i;
ULong r = 0;
for (i = 0; i < 8; i++) {
r = (r << 8) | (ULong)(0xFF & randUChar());
}
return r;
}
/* Generates a random V128. Ensures that that it contains normalised
FP numbers when viewed as either F32x4 or F64x2, so that it is
reasonable to use in FP test cases. */
static void randV128 ( /*OUT*/V128* v, LaneTy ty )
{
static UInt nCalls = 0, nIters = 0;
Int i;
nCalls++;
while (1) {
nIters++;
for (i = 0; i < 16; i++) {
v->u8[i] = randUChar();
}
if (isnormal(v->f32[0]) && isnormal(v->f32[1]) && isnormal(v->f32[2])
&& isnormal(v->f32[3]) && isnormal(v->f64[0]) && isnormal(v->f64[1]))
break;
}
if (0 == (nCalls & 0xFF))
printf("randV128: %u calls, %u iters\n", nCalls, nIters);
}
static void showV128 ( V128* v )
{
Int i;
for (i = 15; i >= 0; i--)
printf("%02x", (Int)v->u8[i]);
}
__attribute__((unused))
static void* memalign16(size_t szB)
{
void* x;
x = memalign(16, szB);
assert(x);
assert(0 == ((16-1) & (unsigned long)x));
return x;
}
/* ---------------------------------------------------------------- */
/* -- Test functions -- */
/* ---------------------------------------------------------------- */
/* Note this also sets the destination register to a known value (0x55..55)
since it can sometimes be an input to the instruction too. */
#define GEN_UNARY_TEST(INSN,SUFFIXD,SUFFIXN) \
__attribute__((noinline)) \
static void test_##INSN##_##SUFFIXD##_##SUFFIXN ( LaneTy ty ) { \
Int i; \
for (i = 0; i < ITERS; i++) { \
V128 block[2]; \
memset(block, 0x55, sizeof(block)); \
randV128(&block[0], ty); \
randV128(&block[1], ty); \
__asm__ __volatile__( \
"ldr q7, [%0, #0] ; " \
"ldr q8, [%0, #16] ; " \
#INSN " v8." #SUFFIXD ", v7." #SUFFIXN " ; " \
"str q8, [%0, #16] " \
: : "r"(&block[0]) : "memory", "v7", "v8" \
); \
printf(#INSN " v8." #SUFFIXD ", v7." #SUFFIXN); \
showV128(&block[0]); printf(" "); \
showV128(&block[1]); printf("\n"); \
} \
}
/* Note this also sets the destination register to a known value (0x55..55)
since it can sometimes be an input to the instruction too. */
#define GEN_BINARY_TEST(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \
__attribute__((noinline)) \
static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##SUFFIXM ( LaneTy ty ) { \
Int i; \
for (i = 0; i < ITERS; i++) { \
V128 block[3]; \
memset(block, 0x55, sizeof(block)); \
randV128(&block[0], ty); \
randV128(&block[1], ty); \
randV128(&block[2], ty); \
__asm__ __volatile__( \
"ldr q7, [%0, #0] ; " \
"ldr q8, [%0, #16] ; " \
"ldr q9, [%0, #32] ; " \
#INSN " v9." #SUFFIXD ", v7." #SUFFIXN ", v8." #SUFFIXM " ; " \
"str q9, [%0, #32] " \
: : "r"(&block[0]) : "memory", "v7", "v8", "v9" \
); \
printf(#INSN " v9." #SUFFIXD \
", v7." #SUFFIXN ", v8." #SUFFIXM " "); \
showV128(&block[0]); printf(" "); \
showV128(&block[1]); printf(" "); \
showV128(&block[2]); printf("\n"); \
} \
}
/* Note this also sets the destination register to a known value (0x55..55)
since it can sometimes be an input to the instruction too. */
#define GEN_SHIFT_TEST(INSN,SUFFIXD,SUFFIXN,AMOUNT) \
__attribute__((noinline)) \
static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##AMOUNT ( LaneTy ty ) { \
Int i; \
for (i = 0; i < ITERS; i++) { \
V128 block[2]; \
memset(block, 0x55, sizeof(block)); \
randV128(&block[0], ty); \
randV128(&block[1], ty); \
__asm__ __volatile__( \
"ldr q7, [%0, #0] ; " \
"ldr q8, [%0, #16] ; " \
#INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " ; " \
"str q8, [%0, #16] " \
: : "r"(&block[0]) : "memory", "v7", "v8" \
); \
printf(#INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " "); \
showV128(&block[0]); printf(" "); \
showV128(&block[1]); printf("\n"); \
} \
}
/* Generate a test that involves one integer reg and one vector reg,
with no bias as towards which is input or output. */
#define GEN_ONEINT_ONEVEC_TEST(TESTNAME,INSN,INTREGNO,VECREGNO) \
__attribute__((noinline)) \
static void test_##TESTNAME ( LaneTy ty ) { \
Int i; \
for (i = 0; i < ITERS; i++) { \
V128 block[4]; \
memset(block, 0x55, sizeof(block)); \
randV128(&block[0], ty); \
randV128(&block[1], ty); \
randV128(&block[2], ty); \
randV128(&block[3], ty); \
__asm__ __volatile__( \
"ldr q"#VECREGNO", [%0, #0] ; " \
"ldr x"#INTREGNO", [%0, #16] ; " \
INSN " ; " \
"str q"#VECREGNO", [%0, #32] ; " \
"str x"#INTREGNO", [%0, #48] ; " \
: : "r"(&block[0]) : "memory", "v"#VECREGNO, "x"#INTREGNO \
); \
printf(INSN " "); \
showV128(&block[0]); printf(" "); \
showV128(&block[1]); printf(" "); \
showV128(&block[2]); printf(" "); \
showV128(&block[3]); printf("\n"); \
} \
}
/* Generate a test that involves two vector regs,
with no bias as towards which is input or output. */
#define GEN_TWOVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO) \
__attribute__((noinline)) \
static void test_##TESTNAME ( LaneTy ty ) { \
Int i; \
for (i = 0; i < ITERS; i++) { \
V128 block[4]; \
memset(block, 0x55, sizeof(block)); \
randV128(&block[0], ty); \
randV128(&block[1], ty); \
randV128(&block[2], ty); \
randV128(&block[3], ty); \
__asm__ __volatile__( \
"ldr q"#VECREG1NO", [%0, #0] ; " \
"ldr q"#VECREG2NO", [%0, #16] ; " \
INSN " ; " \
"str q"#VECREG1NO", [%0, #32] ; " \
"str q"#VECREG2NO", [%0, #48] ; " \
: : "r"(&block[0]) : "memory", "v"#VECREG1NO, "v"#VECREG2NO \
); \
printf(INSN " "); \
showV128(&block[0]); printf(" "); \
showV128(&block[1]); printf(" "); \
showV128(&block[2]); printf(" "); \
showV128(&block[3]); printf("\n"); \
} \
}
/* Generate a test that involves three vector regs,
with no bias as towards which is input or output. It's also OK
to use v16, v17, v18 as scratch. */
#define GEN_THREEVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO,VECREG3NO) \
__attribute__((noinline)) \
static void test_##TESTNAME ( LaneTy ty ) { \
Int i; \
for (i = 0; i < ITERS; i++) { \
V128 block[6]; \
memset(block, 0x55, sizeof(block)); \
randV128(&block[0], ty); \
randV128(&block[1], ty); \
randV128(&block[2], ty); \
randV128(&block[3], ty); \
randV128(&block[4], ty); \
randV128(&block[5], ty); \
__asm__ __volatile__( \
"ldr q"#VECREG1NO", [%0, #0] ; " \
"ldr q"#VECREG2NO", [%0, #16] ; " \
"ldr q"#VECREG3NO", [%0, #32] ; " \
INSN " ; " \
"str q"#VECREG1NO", [%0, #48] ; " \
"str q"#VECREG2NO", [%0, #64] ; " \
"str q"#VECREG3NO", [%0, #80] ; " \
: : "r"(&block[0]) \
: "memory", "v"#VECREG1NO, "v"#VECREG2NO, "v"#VECREG3NO, \
"v16", "v17", "v18" \
); \
printf(INSN " "); \
showV128(&block[0]); printf(" "); \
showV128(&block[1]); printf(" "); \
showV128(&block[2]); printf(" "); \
showV128(&block[3]); printf(" "); \
showV128(&block[4]); printf(" "); \
showV128(&block[5]); printf("\n"); \
} \
}
void test_UMINV ( void )
{
int i;
V128 block[2];
/* -- 4s -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyS);
randV128(&block[1], TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"uminv s8, v7.4s ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("UMINV v8, v7.4s ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 8h -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyH);
randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"uminv h8, v7.8h ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("UMINV h8, v7.8h ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 4h -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyH);
randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"uminv h8, v7.4h ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("UMINV h8, v7.4h ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 16b -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyB);
randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"uminv b8, v7.16b ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("UMINV b8, v7.16b ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 8b -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyB);
randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"uminv b8, v7.8b ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("UMINV b8, v7.8b ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
}
void test_UMAXV ( void )
{
int i;
V128 block[2];
/* -- 4s -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyS);
randV128(&block[1], TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"umaxv s8, v7.4s ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("UMAXV v8, v7.4s ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 8h -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyH);
randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"umaxv h8, v7.8h ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("UMAXV h8, v7.8h ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 4h -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyH);
randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"umaxv h8, v7.4h ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("UMAXV h8, v7.4h ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 16b -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyB);
randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"umaxv b8, v7.16b ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("UMAXV b8, v7.16b ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 8b -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyB);
randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"umaxv b8, v7.8b ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("UMAXV b8, v7.8b ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
}
void test_INS_general ( void )
{
V128 block[3];
/* -- D[0..1] -- */
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyD);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.d[0], x19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.u64[0],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyD);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.d[1], x19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.d[1],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
/* -- S[0..3] -- */
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.s[0], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.s[0],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.s[1], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.s[1],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.s[2], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.s[2],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.s[3], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.s[3],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
/* -- H[0..7] -- */
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.h[0], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.h[0],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.h[1], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.h[1],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.h[2], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.h[2],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.h[3], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.h[3],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.h[4], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.h[4],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.h[5], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.h[5],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.h[6], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.h[6],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.h[7], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.h[7],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
/* -- B[0,15] -- */
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.b[0], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.b[0],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
block[1].u64[0] = randULong(TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
"ins v7.b[15], w19 ; "
"str q7, [%0, #32] "
: : "r"(&block[0]) : "memory", "x19", "v7"
);
printf("INS v7.b[15],x19 ");
showV128(&block[0]); printf(" %016llx ", block[1].u64[0]);
showV128(&block[2]); printf("\n");
}
void test_SMINV ( void )
{
int i;
V128 block[2];
/* -- 4s -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyS);
randV128(&block[1], TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"sminv s8, v7.4s ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("SMINV v8, v7.4s ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 8h -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyH);
randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"sminv h8, v7.8h ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("SMINV h8, v7.8h ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 4h -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyH);
randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"sminv h8, v7.4h ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("SMINV h8, v7.4h ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 16b -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyB);
randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"sminv b8, v7.16b ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("SMINV b8, v7.16b ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 8b -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyB);
randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"sminv b8, v7.8b ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("SMINV b8, v7.8b ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
}
void test_SMAXV ( void )
{
int i;
V128 block[2];
/* -- 4s -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyS);
randV128(&block[1], TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"smaxv s8, v7.4s ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("SMAXV v8, v7.4s ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 8h -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyH);
randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"smaxv h8, v7.8h ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("SMAXV h8, v7.8h ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 4h -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyH);
randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"smaxv h8, v7.4h ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("SMAXV h8, v7.4h ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 16b -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyB);
randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"smaxv b8, v7.16b ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("SMAXV b8, v7.16b ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
/* -- 8b -- */
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
randV128(&block[0], TyB);
randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"smaxv b8, v7.8b ; "
"str q8, [%0, #16] "
: : "r"(&block[0]) : "memory", "v7", "v8"
);
printf("SMAXV b8, v7.8b ");
showV128(&block[0]); printf(" ");
showV128(&block[1]); printf("\n");
}
}
GEN_BINARY_TEST(umax, 4s, 4s, 4s)
GEN_BINARY_TEST(umax, 2s, 2s, 2s)
GEN_BINARY_TEST(umax, 8h, 8h, 8h)
GEN_BINARY_TEST(umax, 4h, 4h, 4h)
GEN_BINARY_TEST(umax, 16b, 16b, 16b)
GEN_BINARY_TEST(umax, 8b, 8b, 8b)
GEN_BINARY_TEST(umin, 4s, 4s, 4s)
GEN_BINARY_TEST(umin, 2s, 2s, 2s)
GEN_BINARY_TEST(umin, 8h, 8h, 8h)
GEN_BINARY_TEST(umin, 4h, 4h, 4h)
GEN_BINARY_TEST(umin, 16b, 16b, 16b)
GEN_BINARY_TEST(umin, 8b, 8b, 8b)
GEN_BINARY_TEST(smax, 4s, 4s, 4s)
GEN_BINARY_TEST(smax, 2s, 2s, 2s)
GEN_BINARY_TEST(smax, 8h, 8h, 8h)
GEN_BINARY_TEST(smax, 4h, 4h, 4h)
GEN_BINARY_TEST(smax, 16b, 16b, 16b)
GEN_BINARY_TEST(smax, 8b, 8b, 8b)
GEN_BINARY_TEST(smin, 4s, 4s, 4s)
GEN_BINARY_TEST(smin, 2s, 2s, 2s)
GEN_BINARY_TEST(smin, 8h, 8h, 8h)
GEN_BINARY_TEST(smin, 4h, 4h, 4h)
GEN_BINARY_TEST(smin, 16b, 16b, 16b)
GEN_BINARY_TEST(smin, 8b, 8b, 8b)
GEN_BINARY_TEST(add, 2d, 2d, 2d)
GEN_BINARY_TEST(add, 4s, 4s, 4s)
GEN_BINARY_TEST(add, 2s, 2s, 2s)
GEN_BINARY_TEST(add, 8h, 8h, 8h)
GEN_BINARY_TEST(add, 4h, 4h, 4h)
GEN_BINARY_TEST(add, 16b, 16b, 16b)
GEN_BINARY_TEST(add, 8b, 8b, 8b)
GEN_BINARY_TEST(sub, 2d, 2d, 2d)
GEN_BINARY_TEST(sub, 4s, 4s, 4s)
GEN_BINARY_TEST(sub, 2s, 2s, 2s)
GEN_BINARY_TEST(sub, 8h, 8h, 8h)
GEN_BINARY_TEST(sub, 4h, 4h, 4h)
GEN_BINARY_TEST(sub, 16b, 16b, 16b)
GEN_BINARY_TEST(sub, 8b, 8b, 8b)
GEN_BINARY_TEST(mul, 4s, 4s, 4s)
GEN_BINARY_TEST(mul, 2s, 2s, 2s)
GEN_BINARY_TEST(mul, 8h, 8h, 8h)
GEN_BINARY_TEST(mul, 4h, 4h, 4h)
GEN_BINARY_TEST(mul, 16b, 16b, 16b)
GEN_BINARY_TEST(mul, 8b, 8b, 8b)
GEN_BINARY_TEST(mla, 4s, 4s, 4s)
GEN_BINARY_TEST(mla, 2s, 2s, 2s)
GEN_BINARY_TEST(mla, 8h, 8h, 8h)
GEN_BINARY_TEST(mla, 4h, 4h, 4h)
GEN_BINARY_TEST(mla, 16b, 16b, 16b)
GEN_BINARY_TEST(mla, 8b, 8b, 8b)
GEN_BINARY_TEST(mls, 4s, 4s, 4s)
GEN_BINARY_TEST(mls, 2s, 2s, 2s)
GEN_BINARY_TEST(mls, 8h, 8h, 8h)
GEN_BINARY_TEST(mls, 4h, 4h, 4h)
GEN_BINARY_TEST(mls, 16b, 16b, 16b)
GEN_BINARY_TEST(mls, 8b, 8b, 8b)
GEN_BINARY_TEST(and, 16b, 16b, 16b)
GEN_BINARY_TEST(and, 8b, 8b, 8b)
GEN_BINARY_TEST(bic, 16b, 16b, 16b)
GEN_BINARY_TEST(bic, 8b, 8b, 8b)
GEN_BINARY_TEST(orr, 16b, 16b, 16b)
GEN_BINARY_TEST(orr, 8b, 8b, 8b)
GEN_BINARY_TEST(orn, 16b, 16b, 16b)
GEN_BINARY_TEST(orn, 8b, 8b, 8b)
GEN_BINARY_TEST(eor, 16b, 16b, 16b)
GEN_BINARY_TEST(eor, 8b, 8b, 8b)
GEN_BINARY_TEST(bsl, 16b, 16b, 16b)
GEN_BINARY_TEST(bsl, 8b, 8b, 8b)
GEN_BINARY_TEST(bit, 16b, 16b, 16b)
GEN_BINARY_TEST(bit, 8b, 8b, 8b)
GEN_BINARY_TEST(bif, 16b, 16b, 16b)
GEN_BINARY_TEST(bif, 8b, 8b, 8b)
GEN_BINARY_TEST(cmeq, 2d, 2d, 2d)
GEN_BINARY_TEST(cmeq, 4s, 4s, 4s)
GEN_BINARY_TEST(cmeq, 2s, 2s, 2s)
GEN_BINARY_TEST(cmeq, 8h, 8h, 8h)
GEN_BINARY_TEST(cmeq, 4h, 4h, 4h)
GEN_BINARY_TEST(cmeq, 16b, 16b, 16b)
GEN_BINARY_TEST(cmeq, 8b, 8b, 8b)
GEN_BINARY_TEST(cmtst, 2d, 2d, 2d)
GEN_BINARY_TEST(cmtst, 4s, 4s, 4s)
GEN_BINARY_TEST(cmtst, 2s, 2s, 2s)
GEN_BINARY_TEST(cmtst, 8h, 8h, 8h)
GEN_BINARY_TEST(cmtst, 4h, 4h, 4h)
GEN_BINARY_TEST(cmtst, 16b, 16b, 16b)
GEN_BINARY_TEST(cmtst, 8b, 8b, 8b)
GEN_BINARY_TEST(cmhi, 2d, 2d, 2d)
GEN_BINARY_TEST(cmhi, 4s, 4s, 4s)
GEN_BINARY_TEST(cmhi, 2s, 2s, 2s)
GEN_BINARY_TEST(cmhi, 8h, 8h, 8h)
GEN_BINARY_TEST(cmhi, 4h, 4h, 4h)
GEN_BINARY_TEST(cmhi, 16b, 16b, 16b)
GEN_BINARY_TEST(cmhi, 8b, 8b, 8b)
GEN_BINARY_TEST(cmgt, 2d, 2d, 2d)
GEN_BINARY_TEST(cmgt, 4s, 4s, 4s)
GEN_BINARY_TEST(cmgt, 2s, 2s, 2s)
GEN_BINARY_TEST(cmgt, 8h, 8h, 8h)
GEN_BINARY_TEST(cmgt, 4h, 4h, 4h)
GEN_BINARY_TEST(cmgt, 16b, 16b, 16b)
GEN_BINARY_TEST(cmgt, 8b, 8b, 8b)
GEN_BINARY_TEST(cmhs, 2d, 2d, 2d)
GEN_BINARY_TEST(cmhs, 4s, 4s, 4s)
GEN_BINARY_TEST(cmhs, 2s, 2s, 2s)
GEN_BINARY_TEST(cmhs, 8h, 8h, 8h)
GEN_BINARY_TEST(cmhs, 4h, 4h, 4h)
GEN_BINARY_TEST(cmhs, 16b, 16b, 16b)
GEN_BINARY_TEST(cmhs, 8b, 8b, 8b)
GEN_BINARY_TEST(cmge, 2d, 2d, 2d)
GEN_BINARY_TEST(cmge, 4s, 4s, 4s)
GEN_BINARY_TEST(cmge, 2s, 2s, 2s)
GEN_BINARY_TEST(cmge, 8h, 8h, 8h)
GEN_BINARY_TEST(cmge, 4h, 4h, 4h)
GEN_BINARY_TEST(cmge, 16b, 16b, 16b)
GEN_BINARY_TEST(cmge, 8b, 8b, 8b)
GEN_SHIFT_TEST(ushr, 2d, 2d, 1)
GEN_SHIFT_TEST(ushr, 2d, 2d, 13)
GEN_SHIFT_TEST(ushr, 2d, 2d, 63)
GEN_SHIFT_TEST(sshr, 2d, 2d, 1)
GEN_SHIFT_TEST(sshr, 2d, 2d, 13)
GEN_SHIFT_TEST(sshr, 2d, 2d, 63)
GEN_SHIFT_TEST(shl, 2d, 2d, 1)
GEN_SHIFT_TEST(shl, 2d, 2d, 13)
GEN_SHIFT_TEST(shl, 2d, 2d, 63)
GEN_SHIFT_TEST(ushr, 4s, 4s, 1)
GEN_SHIFT_TEST(ushr, 4s, 4s, 13)
GEN_SHIFT_TEST(ushr, 4s, 4s, 31)
GEN_SHIFT_TEST(sshr, 4s, 4s, 1)
GEN_SHIFT_TEST(sshr, 4s, 4s, 13)
GEN_SHIFT_TEST(sshr, 4s, 4s, 31)
GEN_SHIFT_TEST(shl, 4s, 4s, 1)
GEN_SHIFT_TEST(shl, 4s, 4s, 13)
GEN_SHIFT_TEST(shl, 4s, 4s, 31)
GEN_SHIFT_TEST(ushr, 2s, 2s, 1)
GEN_SHIFT_TEST(ushr, 2s, 2s, 13)
GEN_SHIFT_TEST(ushr, 2s, 2s, 31)
GEN_SHIFT_TEST(sshr, 2s, 2s, 1)
GEN_SHIFT_TEST(sshr, 2s, 2s, 13)
GEN_SHIFT_TEST(sshr, 2s, 2s, 31)
GEN_SHIFT_TEST(shl, 2s, 2s, 1)
GEN_SHIFT_TEST(shl, 2s, 2s, 13)
GEN_SHIFT_TEST(shl, 2s, 2s, 31)
GEN_SHIFT_TEST(ushr, 8h, 8h, 1)
GEN_SHIFT_TEST(ushr, 8h, 8h, 13)
GEN_SHIFT_TEST(ushr, 8h, 8h, 15)
GEN_SHIFT_TEST(sshr, 8h, 8h, 1)
GEN_SHIFT_TEST(sshr, 8h, 8h, 13)
GEN_SHIFT_TEST(sshr, 8h, 8h, 15)
GEN_SHIFT_TEST(shl, 8h, 8h, 1)
GEN_SHIFT_TEST(shl, 8h, 8h, 13)
GEN_SHIFT_TEST(shl, 8h, 8h, 15)
GEN_SHIFT_TEST(ushr, 4h, 4h, 1)
GEN_SHIFT_TEST(ushr, 4h, 4h, 13)
GEN_SHIFT_TEST(ushr, 4h, 4h, 15)
GEN_SHIFT_TEST(sshr, 4h, 4h, 1)
GEN_SHIFT_TEST(sshr, 4h, 4h, 13)
GEN_SHIFT_TEST(sshr, 4h, 4h, 15)
GEN_SHIFT_TEST(shl, 4h, 4h, 1)
GEN_SHIFT_TEST(shl, 4h, 4h, 13)
GEN_SHIFT_TEST(shl, 4h, 4h, 15)
GEN_SHIFT_TEST(ushr, 16b, 16b, 1)
GEN_SHIFT_TEST(ushr, 16b, 16b, 7)
GEN_SHIFT_TEST(sshr, 16b, 16b, 1)
GEN_SHIFT_TEST(sshr, 16b, 16b, 7)
GEN_SHIFT_TEST(shl, 16b, 16b, 1)
GEN_SHIFT_TEST(shl, 16b, 16b, 7)
GEN_SHIFT_TEST(ushr, 8b, 8b, 1)
GEN_SHIFT_TEST(ushr, 8b, 8b, 7)
GEN_SHIFT_TEST(sshr, 8b, 8b, 1)
GEN_SHIFT_TEST(sshr, 8b, 8b, 7)
GEN_SHIFT_TEST(shl, 8b, 8b, 1)
GEN_SHIFT_TEST(shl, 8b, 8b, 7)
GEN_SHIFT_TEST(ushll, 2d, 2s, 0)
GEN_SHIFT_TEST(ushll, 2d, 2s, 15)
GEN_SHIFT_TEST(ushll, 2d, 2s, 31)
GEN_SHIFT_TEST(ushll2, 2d, 4s, 0)
GEN_SHIFT_TEST(ushll2, 2d, 4s, 15)
GEN_SHIFT_TEST(ushll2, 2d, 4s, 31)
GEN_SHIFT_TEST(sshll, 2d, 2s, 0)
GEN_SHIFT_TEST(sshll, 2d, 2s, 15)
GEN_SHIFT_TEST(sshll, 2d, 2s, 31)
GEN_SHIFT_TEST(sshll2, 2d, 4s, 0)
GEN_SHIFT_TEST(sshll2, 2d, 4s, 15)
GEN_SHIFT_TEST(sshll2, 2d, 4s, 31)
GEN_UNARY_TEST(xtn, 2s, 2d)
GEN_UNARY_TEST(xtn2, 4s, 2d)
GEN_UNARY_TEST(xtn, 4h, 4s)
GEN_UNARY_TEST(xtn2, 8h, 4s)
GEN_UNARY_TEST(xtn, 8b, 8h)
GEN_UNARY_TEST(xtn2, 16b, 8h)
GEN_ONEINT_ONEVEC_TEST(umov_x_d0, "umov x9, v10.d[0]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(umov_x_d1, "umov x9, v10.d[1]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(umov_w_s0, "umov w9, v10.s[0]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(umov_w_s3, "umov w9, v10.s[3]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(umov_w_h0, "umov w9, v10.h[0]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(umov_w_h7, "umov w9, v10.h[7]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(umov_w_b0, "umov w9, v10.b[0]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(umov_w_b15, "umov w9, v10.b[15]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(smov_x_s0, "smov x9, v10.s[0]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(smov_x_s3, "smov x9, v10.s[3]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(smov_x_h0, "smov x9, v10.h[0]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(smov_x_h7, "smov x9, v10.h[7]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(smov_w_h0, "smov w9, v10.h[0]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(smov_w_h7, "smov w9, v10.h[7]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(smov_x_b0, "smov x9, v10.b[0]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(smov_x_b15, "smov x9, v10.b[15]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(smov_w_b0, "smov w9, v10.b[0]", 9, 10)
GEN_ONEINT_ONEVEC_TEST(smov_w_b15, "smov w9, v10.b[15]", 9, 10)
GEN_TWOVEC_TEST(fcvtn_2s_2d, "fcvtn v22.2s, v23.2d", 22, 23)
GEN_TWOVEC_TEST(fcvtn_4s_2d, "fcvtn2 v22.4s, v23.2d", 22, 23)
GEN_UNARY_TEST(neg, 2d, 2d)
GEN_UNARY_TEST(neg, 4s, 4s)
GEN_UNARY_TEST(neg, 2s, 2s)
GEN_UNARY_TEST(neg, 8h, 8h)
GEN_UNARY_TEST(neg, 4h, 4h)
GEN_UNARY_TEST(neg, 16b, 16b)
GEN_UNARY_TEST(neg, 8b, 8b)
GEN_BINARY_TEST(fadd, 2d, 2d, 2d)
GEN_BINARY_TEST(fadd, 4s, 4s, 4s)
GEN_BINARY_TEST(fadd, 2s, 2s, 2s)
GEN_BINARY_TEST(fsub, 2d, 2d, 2d)
GEN_BINARY_TEST(fsub, 4s, 4s, 4s)
GEN_BINARY_TEST(fsub, 2s, 2s, 2s)
GEN_BINARY_TEST(fmul, 2d, 2d, 2d)
GEN_BINARY_TEST(fmul, 4s, 4s, 4s)
GEN_BINARY_TEST(fmul, 2s, 2s, 2s)
GEN_BINARY_TEST(fdiv, 2d, 2d, 2d)
GEN_BINARY_TEST(fdiv, 4s, 4s, 4s)
GEN_BINARY_TEST(fdiv, 2s, 2s, 2s)
GEN_BINARY_TEST(fmla, 2d, 2d, 2d)
GEN_BINARY_TEST(fmla, 4s, 4s, 4s)
GEN_BINARY_TEST(fmla, 2s, 2s, 2s)
GEN_BINARY_TEST(fmls, 2d, 2d, 2d)
GEN_BINARY_TEST(fmls, 4s, 4s, 4s)
GEN_BINARY_TEST(fmls, 2s, 2s, 2s)
GEN_BINARY_TEST(fabd, 2d, 2d, 2d)
GEN_BINARY_TEST(fabd, 4s, 4s, 4s)
GEN_BINARY_TEST(fabd, 2s, 2s, 2s)
GEN_THREEVEC_TEST(add_d_d_d, "add d21, d22, d23", 21, 22, 23)
GEN_THREEVEC_TEST(sub_d_d_d, "sub d21, d22, d23", 21, 22, 23)
/* overkill -- don't need two vecs, only one */
GEN_TWOVEC_TEST(fmov_d_imm_01, "fmov d22, #0.125", 22, 23)
GEN_TWOVEC_TEST(fmov_d_imm_02, "fmov d22, #-4.0", 22, 23)
GEN_TWOVEC_TEST(fmov_d_imm_03, "fmov d22, #1.0", 22, 23)
GEN_TWOVEC_TEST(fmov_s_imm_01, "fmov s22, #0.125", 22, 23)
GEN_TWOVEC_TEST(fmov_s_imm_02, "fmov s22, #-4.0", 22, 23)
GEN_TWOVEC_TEST(fmov_s_imm_03, "fmov s22, #-1.0", 22, 23)
GEN_ONEINT_ONEVEC_TEST(fmov_s_w, "fmov s7, w15", 15, 7)
GEN_ONEINT_ONEVEC_TEST(fmov_d_x, "fmov d7, x15", 15, 7)
GEN_ONEINT_ONEVEC_TEST(fmov_d1_x, "fmov v7.d[1], x15", 15, 7)
GEN_ONEINT_ONEVEC_TEST(fmov_w_s, "fmov w15, s7", 15, 7)
GEN_ONEINT_ONEVEC_TEST(fmov_x_d, "fmov x15, d7", 15, 7)
GEN_ONEINT_ONEVEC_TEST(fmov_x_d1, "fmov x15, v7.d[1]", 15, 7)
GEN_TWOVEC_TEST(fmov_2d_imm_01, "fmov v22.2d, #0.125", 22, 23)
GEN_TWOVEC_TEST(fmov_2d_imm_02, "fmov v22.2d, #-4.0", 22, 23)
GEN_TWOVEC_TEST(fmov_2d_imm_03, "fmov v22.2d, #1.0", 22, 23)
GEN_TWOVEC_TEST(fmov_4s_imm_01, "fmov v22.4s, #0.125", 22, 23)
GEN_TWOVEC_TEST(fmov_4s_imm_02, "fmov v22.4s, #-4.0", 22, 23)
GEN_TWOVEC_TEST(fmov_4s_imm_03, "fmov v22.4s, #1.0", 22, 23)
GEN_TWOVEC_TEST(fmov_2s_imm_01, "fmov v22.2s, #0.125", 22, 23)
GEN_TWOVEC_TEST(fmov_2s_imm_02, "fmov v22.2s, #-4.0", 22, 23)
GEN_TWOVEC_TEST(fmov_2s_imm_03, "fmov v22.2s, #1.0", 22, 23)
GEN_ONEINT_ONEVEC_TEST(scvtf_s_w, "scvtf s7, w15", 15, 7)
GEN_ONEINT_ONEVEC_TEST(scvtf_d_w, "scvtf d7, w15", 15, 7)
GEN_ONEINT_ONEVEC_TEST(scvtf_s_x, "scvtf s7, x15", 15, 7)
GEN_ONEINT_ONEVEC_TEST(scvtf_d_x, "scvtf d7, x15", 15, 7)
GEN_ONEINT_ONEVEC_TEST(ucvtf_s_w, "ucvtf s7, w15", 15, 7)
GEN_ONEINT_ONEVEC_TEST(ucvtf_d_w, "ucvtf d7, w15", 15, 7)
GEN_ONEINT_ONEVEC_TEST(ucvtf_s_x, "ucvtf s7, x15", 15, 7)
GEN_ONEINT_ONEVEC_TEST(ucvtf_d_x, "ucvtf d7, x15", 15, 7)
GEN_THREEVEC_TEST(fadd_d_d_d, "fadd d2, d11, d29", 2, 11, 29)
GEN_THREEVEC_TEST(fadd_s_s_s, "fadd s2, s11, s29", 2, 11, 29)
GEN_THREEVEC_TEST(fsub_d_d_d, "fsub d2, d11, d29", 2, 11, 29)
GEN_THREEVEC_TEST(fsub_s_s_s, "fsub s2, s11, s29", 2, 11, 29)
GEN_THREEVEC_TEST(fmul_d_d_d, "fmul d2, d11, d29", 2, 11, 29)
GEN_THREEVEC_TEST(fmul_s_s_s, "fmul s2, s11, s29", 2, 11, 29)
GEN_THREEVEC_TEST(fdiv_d_d_d, "fdiv d2, d11, d29", 2, 11, 29)
GEN_THREEVEC_TEST(fdiv_s_s_s, "fdiv s2, s11, s29", 2, 11, 29)
GEN_THREEVEC_TEST(fnmul_d_d_d, "fnmul d2, d11, d29", 2, 11, 29)
GEN_THREEVEC_TEST(fnmul_s_s_s, "fnmul s2, s11, s29", 2, 11, 29)
GEN_THREEVEC_TEST(fabd_d_d_d, "fabd d2, d11, d29", 2, 11, 29)
GEN_THREEVEC_TEST(fabd_s_s_s, "fabd s2, s11, s29", 2, 11, 29)
GEN_TWOVEC_TEST(fmov_d_d, "fmov d22, d23", 22, 23)
GEN_TWOVEC_TEST(fmov_s_s, "fmov s22, s23", 22, 23)
GEN_TWOVEC_TEST(fabs_d_d, "fabs d22, d23", 22, 23)
GEN_TWOVEC_TEST(fabs_s_s, "fabs s22, s23", 22, 23)
GEN_TWOVEC_TEST(fneg_d_d, "fneg d22, d23", 22, 23)
GEN_TWOVEC_TEST(fneg_s_s, "fneg s22, s23", 22, 23)
GEN_TWOVEC_TEST(fsqrt_d_d, "fsqrt d22, d23", 22, 23)
GEN_TWOVEC_TEST(fsqrt_s_s, "fsqrt s22, s23", 22, 23)
GEN_UNARY_TEST(fneg, 2d, 2d)
GEN_UNARY_TEST(fneg, 4s, 4s)
GEN_UNARY_TEST(fneg, 2s, 2s)
GEN_UNARY_TEST(fabs, 2d, 2d)
GEN_UNARY_TEST(fabs, 4s, 4s)
GEN_UNARY_TEST(fabs, 2s, 2s)
GEN_BINARY_TEST(fcmeq, 2d, 2d, 2d)
GEN_BINARY_TEST(fcmeq, 4s, 4s, 4s)
GEN_BINARY_TEST(fcmeq, 2s, 2s, 2s)
GEN_BINARY_TEST(fcmge, 2d, 2d, 2d)
GEN_BINARY_TEST(fcmge, 4s, 4s, 4s)
GEN_BINARY_TEST(fcmge, 2s, 2s, 2s)
GEN_BINARY_TEST(fcmgt, 2d, 2d, 2d)
GEN_BINARY_TEST(fcmgt, 4s, 4s, 4s)
GEN_BINARY_TEST(fcmgt, 2s, 2s, 2s)
GEN_BINARY_TEST(facge, 2d, 2d, 2d)
GEN_BINARY_TEST(facge, 4s, 4s, 4s)
GEN_BINARY_TEST(facge, 2s, 2s, 2s)
GEN_BINARY_TEST(facgt, 2d, 2d, 2d)
GEN_BINARY_TEST(facgt, 4s, 4s, 4s)
GEN_BINARY_TEST(facgt, 2s, 2s, 2s)
// Uses v15 as the first table entry
GEN_THREEVEC_TEST(
tbl_16b_1reg, "tbl v21.16b, {v15.16b}, v23.16b", 21, 15, 23)
// and v15 ^ v21 as the second table entry
GEN_THREEVEC_TEST(
tbl_16b_2reg, "eor v16.16b, v15.16b, v21.16b ; "
"tbl v21.16b, {v15.16b, v16.16b}, v23.16b", 21, 15, 23)
// and v15 ^ v23 as the third table entry
GEN_THREEVEC_TEST(
tbl_16b_3reg, "eor v16.16b, v15.16b, v21.16b ; "
"eor v17.16b, v15.16b, v23.16b ; "
"tbl v21.16b, {v15.16b, v16.16b, v17.16b}, v23.16b",
21, 15, 23)
// and v21 ^ v23 as the fourth table entry
GEN_THREEVEC_TEST(
tbl_16b_4reg, "eor v16.16b, v15.16b, v21.16b ; "
"eor v17.16b, v15.16b, v23.16b ; "
"eor v18.16b, v21.16b, v23.16b ; "
"tbl v21.16b, {v15.16b, v16.16b, v17.16b, v18.16b}, v23.16b",
21, 15, 23)
// Same register scheme for tbl .8b, tbx .16b, tbx.8b
GEN_THREEVEC_TEST(
tbl_8b_1reg, "tbl v21.8b, {v15.16b}, v23.8b", 21, 15, 23)
GEN_THREEVEC_TEST(
tbl_8b_2reg, "eor v16.16b, v15.16b, v21.16b ; "
"tbl v21.8b, {v15.16b, v16.16b}, v23.8b", 21, 15, 23)
GEN_THREEVEC_TEST(
tbl_8b_3reg, "eor v16.16b, v15.16b, v21.16b ; "
"eor v17.16b, v15.16b, v23.16b ; "
"tbl v21.8b, {v15.16b, v16.16b, v17.16b}, v23.8b",
21, 15, 23)
GEN_THREEVEC_TEST(
tbl_8b_4reg, "eor v16.16b, v15.16b, v21.16b ; "
"eor v17.16b, v15.16b, v23.16b ; "
"eor v18.16b, v21.16b, v23.16b ; "
"tbl v21.8b, {v15.16b, v16.16b, v17.16b, v18.16b}, v23.8b",
21, 15, 23)
GEN_THREEVEC_TEST(
tbx_16b_1reg, "tbx v21.16b, {v15.16b}, v23.16b", 21, 15, 23)
GEN_THREEVEC_TEST(
tbx_16b_2reg, "eor v16.16b, v15.16b, v21.16b ; "
"tbx v21.16b, {v15.16b, v16.16b}, v23.16b", 21, 15, 23)
GEN_THREEVEC_TEST(
tbx_16b_3reg, "eor v16.16b, v15.16b, v21.16b ; "
"eor v17.16b, v15.16b, v23.16b ; "
"tbx v21.16b, {v15.16b, v16.16b, v17.16b}, v23.16b",
21, 15, 23)
GEN_THREEVEC_TEST(
tbx_16b_4reg, "eor v16.16b, v15.16b, v21.16b ; "
"eor v17.16b, v15.16b, v23.16b ; "
"eor v18.16b, v21.16b, v23.16b ; "
"tbx v21.16b, {v15.16b, v16.16b, v17.16b, v18.16b}, v23.16b",
21, 15, 23)
// Same register scheme for tbx .8b, tbx .16b, tbx.8b
GEN_THREEVEC_TEST(
tbx_8b_1reg, "tbx v21.8b, {v15.16b}, v23.8b", 21, 15, 23)
GEN_THREEVEC_TEST(
tbx_8b_2reg, "eor v16.16b, v15.16b, v21.16b ; "
"tbx v21.8b, {v15.16b, v16.16b}, v23.8b", 21, 15, 23)
GEN_THREEVEC_TEST(
tbx_8b_3reg, "eor v16.16b, v15.16b, v21.16b ; "
"eor v17.16b, v15.16b, v23.16b ; "
"tbx v21.8b, {v15.16b, v16.16b, v17.16b}, v23.8b",
21, 15, 23)
GEN_THREEVEC_TEST(
tbx_8b_4reg, "eor v16.16b, v15.16b, v21.16b ; "
"eor v17.16b, v15.16b, v23.16b ; "
"eor v18.16b, v21.16b, v23.16b ; "
"tbx v21.8b, {v15.16b, v16.16b, v17.16b, v18.16b}, v23.8b",
21, 15, 23)
GEN_TWOVEC_TEST(cmge_zero_2d_2d, "cmge v5.2d, v22.2d, #0", 5, 22)
GEN_TWOVEC_TEST(cmge_zero_4s_4s, "cmge v5.4s, v22.4s, #0", 5, 22)
GEN_TWOVEC_TEST(cmge_zero_2s_2s, "cmge v5.2s, v22.2s, #0", 5, 22)
GEN_TWOVEC_TEST(cmge_zero_8h_8h, "cmge v5.8h, v22.8h, #0", 5, 22)
GEN_TWOVEC_TEST(cmge_zero_4h_4h, "cmge v5.4h, v22.4h, #0", 5, 22)
GEN_TWOVEC_TEST(cmge_zero_16b_16b, "cmge v5.16b, v22.16b, #0", 5, 22)
GEN_TWOVEC_TEST(cmge_zero_8b_8b, "cmge v5.8b, v22.8b, #0", 5, 22)
GEN_TWOVEC_TEST(cmgt_zero_2d_2d, "cmgt v5.2d, v22.2d, #0", 5, 22)
GEN_TWOVEC_TEST(cmgt_zero_4s_4s, "cmgt v5.4s, v22.4s, #0", 5, 22)
GEN_TWOVEC_TEST(cmgt_zero_2s_2s, "cmgt v5.2s, v22.2s, #0", 5, 22)
GEN_TWOVEC_TEST(cmgt_zero_8h_8h, "cmgt v5.8h, v22.8h, #0", 5, 22)
GEN_TWOVEC_TEST(cmgt_zero_4h_4h, "cmgt v5.4h, v22.4h, #0", 5, 22)
GEN_TWOVEC_TEST(cmgt_zero_16b_16b, "cmgt v5.16b, v22.16b, #0", 5, 22)
GEN_TWOVEC_TEST(cmgt_zero_8b_8b, "cmgt v5.8b, v22.8b, #0", 5, 22)
GEN_TWOVEC_TEST(cmle_zero_2d_2d, "cmle v5.2d, v22.2d, #0", 5, 22)
GEN_TWOVEC_TEST(cmle_zero_4s_4s, "cmle v5.4s, v22.4s, #0", 5, 22)
GEN_TWOVEC_TEST(cmle_zero_2s_2s, "cmle v5.2s, v22.2s, #0", 5, 22)
GEN_TWOVEC_TEST(cmle_zero_8h_8h, "cmle v5.8h, v22.8h, #0", 5, 22)
GEN_TWOVEC_TEST(cmle_zero_4h_4h, "cmle v5.4h, v22.4h, #0", 5, 22)
GEN_TWOVEC_TEST(cmle_zero_16b_16b, "cmle v5.16b, v22.16b, #0", 5, 22)
GEN_TWOVEC_TEST(cmle_zero_8b_8b, "cmle v5.8b, v22.8b, #0", 5, 22)
GEN_TWOVEC_TEST(cmeq_zero_2d_2d, "cmeq v5.2d, v22.2d, #0", 5, 22)
GEN_TWOVEC_TEST(cmeq_zero_4s_4s, "cmeq v5.4s, v22.4s, #0", 5, 22)
GEN_TWOVEC_TEST(cmeq_zero_2s_2s, "cmeq v5.2s, v22.2s, #0", 5, 22)
GEN_TWOVEC_TEST(cmeq_zero_8h_8h, "cmeq v5.8h, v22.8h, #0", 5, 22)
GEN_TWOVEC_TEST(cmeq_zero_4h_4h, "cmeq v5.4h, v22.4h, #0", 5, 22)
GEN_TWOVEC_TEST(cmeq_zero_16b_16b, "cmeq v5.16b, v22.16b, #0", 5, 22)
GEN_TWOVEC_TEST(cmeq_zero_8b_8b, "cmeq v5.8b, v22.8b, #0", 5, 22)
GEN_TWOVEC_TEST(cmlt_zero_2d_2d, "cmlt v5.2d, v22.2d, #0", 5, 22)
GEN_TWOVEC_TEST(cmlt_zero_4s_4s, "cmlt v5.4s, v22.4s, #0", 5, 22)
GEN_TWOVEC_TEST(cmlt_zero_2s_2s, "cmlt v5.2s, v22.2s, #0", 5, 22)
GEN_TWOVEC_TEST(cmlt_zero_8h_8h, "cmlt v5.8h, v22.8h, #0", 5, 22)
GEN_TWOVEC_TEST(cmlt_zero_4h_4h, "cmlt v5.4h, v22.4h, #0", 5, 22)
GEN_TWOVEC_TEST(cmlt_zero_16b_16b, "cmlt v5.16b, v22.16b, #0", 5, 22)
GEN_TWOVEC_TEST(cmlt_zero_8b_8b, "cmlt v5.8b, v22.8b, #0", 5, 22)
/* ---------------------------------------------------------------- */
/* -- main() -- */
/* ---------------------------------------------------------------- */
int main ( void )
{
assert(sizeof(V128) == 16);
// ======================== FP ========================
// fabs d,s
// fabs 2d,4s,2s
test_fabs_d_d(TyDF);
test_fabs_s_s(TySF);
test_fabs_2d_2d(TyDF);
test_fabs_4s_4s(TySF);
test_fabs_2s_2s(TyDF);
test_fneg_2d_2d(TySF);
test_fneg_4s_4s(TyDF);
test_fneg_2s_2s(TySF);
// fneg d,s
// fneg 2d,4s,2s
test_fneg_d_d(TyDF);
test_fneg_s_s(TySF);
// fsqrt d,s
// fsqrt 2d,4s,2s
test_fsqrt_d_d(TyDF);
test_fsqrt_s_s(TySF);
// fadd d,s
// fsub d,s
test_fadd_d_d_d(TyDF);
test_fadd_s_s_s(TySF);
test_fsub_d_d_d(TyDF);
test_fsub_s_s_s(TySF);
// fadd 2d,4s,2s
// fsub 2d,4s,2s
test_fadd_2d_2d_2d(TyDF);
test_fadd_4s_4s_4s(TySF);
test_fadd_2s_2s_2s(TySF);
test_fsub_2d_2d_2d(TyDF);
test_fsub_4s_4s_4s(TySF);
test_fsub_2s_2s_2s(TySF);
// fabd d,s
// fabd 2d,4s,2s
test_fabd_d_d_d(TyDF);
test_fabd_s_s_s(TySF);
test_fabd_2d_2d_2d(TyDF);
test_fabd_4s_4s_4s(TySF);
test_fabd_2s_2s_2s(TySF);
// faddp d,s (floating add pair)
// faddp 2d,4s,2s
// fccmp d,s (floating point conditional quiet compare)
// fccmpe d,s (floating point conditional signaling compare)
// fcmeq d,s
// fcmge d,s
// fcmgt d,s
// facgt d,s (floating abs compare GE)
// facge d,s (floating abs compare GE)
// fcmeq 2d,4s,2s
// fcmge 2d,4s,2s
// fcmgt 2d,4s,2s
// facge 2d,4s,2s
// facgt 2d,4s,2s
test_fcmeq_2d_2d_2d(TyDF);
test_fcmeq_4s_4s_4s(TySF);
test_fcmeq_2s_2s_2s(TySF);
test_fcmge_2d_2d_2d(TyDF);
test_fcmge_4s_4s_4s(TySF);
test_fcmge_2s_2s_2s(TySF);
test_fcmgt_2d_2d_2d(TyDF);
test_fcmgt_4s_4s_4s(TySF);
test_fcmgt_2s_2s_2s(TySF);
test_facge_2d_2d_2d(TyDF);
test_facge_4s_4s_4s(TySF);
test_facge_2s_2s_2s(TySF);
test_facgt_2d_2d_2d(TyDF);
test_facgt_4s_4s_4s(TySF);
test_facgt_2s_2s_2s(TySF);
// fcmeq_z d,s
// fcmge_z d,s
// fcmgt_z d,s
// fcmle_z d,s
// fcmlt_z d,s
// fcmeq_z 2d,4s,2s
// fcmge_z 2d,4s,2s
// fcmgt_z 2d,4s,2s
// fcmle_z 2d,4s,2s
// fcmlt_z 2d,4s,2s
// fcmp_z d,s
// fcmpe_z d,s
// fcmp d,s (floating point quiet, set flags)
// fcmpe d,s (floating point signaling, set flags)
// fcsel d,s (fp cond select)
// fdiv d,s
// fdiv 2d,4s,2s
test_fdiv_d_d_d(TyDF);
test_fdiv_s_s_s(TySF);
test_fdiv_2d_2d_2d(TyDF);
test_fdiv_4s_4s_4s(TySF);
test_fdiv_2s_2s_2s(TySF);
// fmadd d,s
// fnmadd d,s
// fmsub d,s
// fnmsub d,s
// fnmul d,s
test_fnmul_d_d_d(TyDF);
test_fnmul_s_s_s(TySF);
// fmax d,s
// fmin d,s
// fmaxnm d,s ("max number")
// fminnm d,s
// fmax 2d,4s,2s
// fmin 2d,4s,2s
// fmaxnm 2d,4s,2s
// fminnm 2d,4s,2s
// fmaxnmp d_2d,s_2s ("max number pairwise")
// fminnmp d_2d,s_2s
// fmaxnmp 2d,4s,2s
// fminnmp 2d,4s,2s
// fmaxnmv s_4s (maxnum across vector)
// fminnmv s_4s
// fmaxp d_2d,s_2s (max of a pair)
// fminp d_2d,s_2s (max of a pair)
// fmaxp 2d,4s,2s (max pairwise)
// fminp 2d,4s,2s
// fmaxv s_4s (max across vector)
// fminv s_4s
// fmla 2d,4s,2s
// fmls 2d,4s,2s
test_fmla_2d_2d_2d(TyDF);
test_fmla_4s_4s_4s(TySF);
test_fmla_2s_2s_2s(TySF);
test_fmls_2d_2d_2d(TyDF);
test_fmls_4s_4s_4s(TySF);
test_fmls_2s_2s_2s(TySF);
// fmla d_d_d[],s_s_s[] (by element)
// fmls d_d_d[],s_s_s[] (by element)
// fmla 2d_2d_d[],4s_4s_s[],2s_2s_s[]
// fmls 2d_2d_d[],4s_4s_s[],2s_2s_s[]
// fmov 2d,4s,2s #imm (part of the MOVI/MVNI/ORR/BIC imm group)
// INCOMPLETE
test_fmov_2d_imm_01(TyD);
test_fmov_2d_imm_02(TyD);
test_fmov_2d_imm_03(TyD);
if (0) test_fmov_4s_imm_01(TyS);
if (0) test_fmov_4s_imm_02(TyS);
if (0) test_fmov_4s_imm_03(TyS);
if (0) test_fmov_2s_imm_01(TyS);
if (0) test_fmov_2s_imm_02(TyS);
if (0) test_fmov_2s_imm_03(TyS);
// fmov d_d,s_s
test_fmov_d_d(TyDF);
test_fmov_s_s(TySF);
// fmov s_w,w_s,d_x,d[1]_x,x_d,x_d[1]
test_fmov_s_w(TyS);
test_fmov_d_x(TyD);
test_fmov_d1_x(TyD);
test_fmov_w_s(TyS);
test_fmov_x_d(TyD);
test_fmov_x_d1(TyD);
// fmov d,s #imm
test_fmov_d_imm_01(TyNONE);
test_fmov_d_imm_02(TyNONE);
test_fmov_d_imm_03(TyNONE);
test_fmov_s_imm_01(TyNONE);
test_fmov_s_imm_02(TyNONE);
test_fmov_s_imm_03(TyNONE);
// fmul d_d_d[],s_s_s[]
// fmul 2d_2d_d[],4s_4s_s[],2s_2s_s[]
// fmul 2d,4s,2s
// fmul d,s
test_fmul_d_d_d(TyDF);
test_fmul_s_s_s(TySF);
test_fmul_2d_2d_2d(TyDF);
test_fmul_4s_4s_4s(TySF);
test_fmul_2s_2s_2s(TySF);
// fmulx d_d_d[],s_s_s[]
// fmulx 2d_2d_d[],4s_4s_s[],2s_2s_s[]
// fmulx d,s
// fmulx 2d,4s,2s
// frecpe d,s (recip estimate)
// frecpe 2d,4s,2s
// frecps d,s (recip step)
// frecps 2d,4s,2s
// frecpx d,s (recip exponent)
// frinta d,s
// frinti d,s
// frintm d,s
// frintn d,s
// frintp d,s
// frintx d,s
// frintz d,s
// frinta 2d,4s,2s (round to integral, nearest away)
// frinti 2d,4s,2s (round to integral, per FPCR)
// frintm 2d,4s,2s (round to integral, minus inf)
// frintn 2d,4s,2s (round to integral, nearest, to even)
// frintp 2d,4s,2s (round to integral, plus inf)
// frintx 2d,4s,2s (round to integral exact, per FPCR)
// frintz 2d,4s,2s (round to integral, zero)
// frsqrte d,s (est)
// frsqrte 2d,4s,2s
// frsqrts d,s (step)
// frsqrts 2d,4s,2s
// ======================== CONV ========================
// fcvt s_h,d_h,h_s,d_s,h_d,s_d (fp convert, scalar)
// fcvtl{2} 4s/4h, 4s/8h, 2d/2s, 2d/4s (float convert to longer form)
// fcvtn{2} 4h/4s, 8h/4s, 2s/2d, 4s/2d (float convert to narrower form)
// INCOMPLETE
test_fcvtn_2s_2d(TyDF);
test_fcvtn_4s_2d(TyDF);
// fcvtas d,s (fcvt to signed int, nearest, ties away)
// fcvtau d,s (fcvt to unsigned int, nearest, ties away)
// fcvtas 2d,4s,2s
// fcvtau 2d,4s,2s
// fcvtas w_s,x_s,w_d,x_d
// fcvtau w_s,x_s,w_d,x_d
// fcvtms d,s (fcvt to signed int, minus inf)
// fcvtmu d,s (fcvt to unsigned int, minus inf)
// fcvtms 2d,4s,2s
// fcvtmu 2d,4s,2s
// fcvtms w_s,x_s,w_d,x_d
// fcvtmu w_s,x_s,w_d,x_d
// fcvtns d,s (fcvt to signed int, nearest)
// fcvtnu d,s (fcvt to unsigned int, nearest)
// fcvtns 2d,4s,2s
// fcvtnu 2d,4s,2s
// fcvtns w_s,x_s,w_d,x_d
// fcvtnu w_s,x_s,w_d,x_d
// fcvtps d,s (fcvt to signed int, plus inf)
// fcvtpu d,s (fcvt to unsigned int, plus inf)
// fcvtps 2d,4s,2s
// fcvtpu 2d,4s,2s
// fcvtps w_s,x_s,w_d,x_d
// fcvtpu w_s,x_s,w_d,x_d
// fcvtzs d,s (fcvt to signed integer, to zero)
// fcvtzu d,s (fcvt to unsigned integer, to zero)
// fcvtzs 2d,4s,2s
// fcvtzu 2d,4s,2s
// fcvtzs w_s,x_s,w_d,x_d
// fcvtzu w_s,x_s,w_d,x_d
// fcvtzs d,s (fcvt to signed fixedpt, to zero) (w/ #fbits)
// fcvtzu d,s (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
// fcvtzs 2d,4s,2s
// fcvtzu 2d,4s,2s
// fcvtzs w_s,x_s,w_d,x_d (fcvt to signed fixedpt, to zero) (w/ #fbits)
// fcvtzu w_s,x_s,w_d,x_d (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
// fcvtxn s_d (fcvt to lower prec narrow, rounding to odd)
// fcvtxn 2s_2d,4s_2d
// scvtf d,s _#fbits
// ucvtf d,s _#fbits
// scvtf 2d,4s,2s _#fbits
// ucvtf 2d,4s,2s _#fbits
// scvtf d,s
// ucvtf d,s
// scvtf 2d,4s,2s
// ucvtf 2d,4s,2s
// scvtf s_w, d_w, s_x, d_x, _#fbits
// ucvtf s_w, d_w, s_x, d_x, _#fbits
// scvtf s_w, d_w, s_x, d_x
// ucvtf s_w, d_w, s_x, d_x
test_scvtf_s_w(TyS);
test_scvtf_d_w(TyS);
test_scvtf_s_x(TyD);
test_scvtf_d_x(TyD);
test_ucvtf_s_w(TyS);
test_ucvtf_d_w(TyS);
test_ucvtf_s_x(TyD);
test_ucvtf_d_x(TyD);
// ======================== INT ========================
// abs d
// neg d
// abs 2d,4s,2s,8h,4h,16b,8b
// neg 2d,4s,2s,8h,4h,16b,8b
test_neg_2d_2d(TyD);
test_neg_4s_4s(TyS);
test_neg_2s_2s(TyS);
test_neg_8h_8h(TyH);
test_neg_4h_4h(TyH);
test_neg_16b_16b(TyB);
test_neg_8b_8b(TyB);
// add d
// sub d
test_add_d_d_d(TyD);
test_sub_d_d_d(TyD);
// add 2d,4s,2s,8h,4h,16b,8b
// sub 2d,4s,2s,8h,4h,16b,8b
test_add_2d_2d_2d(TyD);
test_add_4s_4s_4s(TyS);
test_add_2s_2s_2s(TyS);
test_add_8h_8h_8h(TyH);
test_add_4h_4h_4h(TyH);
test_add_16b_16b_16b(TyB);
test_add_8b_8b_8b(TyB);
test_sub_2d_2d_2d(TyD);
test_sub_4s_4s_4s(TyS);
test_sub_2s_2s_2s(TyS);
test_sub_8h_8h_8h(TyH);
test_sub_4h_4h_4h(TyH);
test_sub_16b_16b_16b(TyB);
test_sub_8b_8b_8b(TyB);
// addhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
// subhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
// raddhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
// rsubhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
// addp d (add pairs, across)
// addp 2d,4s,2s,8h,4h,16b,8b
// addv 4s,8h,4h,16b,18b (reduce across vector)
// and 16b,8b
// bic 16b,8b (vector,reg) (bit clear)
// orn 16b,8b
// orr 16b,8b
test_and_16b_16b_16b(TyB);
test_and_8b_8b_8b(TyB);
test_bic_16b_16b_16b(TyB);
test_bic_8b_8b_8b(TyB);
test_orr_16b_16b_16b(TyB);
test_orr_8b_8b_8b(TyB);
test_orn_16b_16b_16b(TyB);
test_orn_8b_8b_8b(TyB);
// orr 8h,4h #imm8, LSL #0 or 8
// orr 4s,2s #imm8, LSL #0, 8, 16 or 24
// bic 8h,4h #imm8, LSL #0 or 8
// bic 4s,2s #imm8, LSL #0, 8, 16 or 24
// also movi, mvni
// bif 16b,8b (vector) (bit insert if false)
// bit 16b,8b (vector) (bit insert if true)
// bsl 16b,8b (vector) (bit select)
// eor 16b,8b (vector)
test_bif_16b_16b_16b(TyB);
test_bif_8b_8b_8b(TyB);
test_bit_16b_16b_16b(TyB);
test_bit_8b_8b_8b(TyB);
test_bsl_16b_16b_16b(TyB);
test_bsl_8b_8b_8b(TyB);
test_eor_16b_16b_16b(TyB);
test_eor_8b_8b_8b(TyB);
// cls 4s,2s,8h,4h,16b,8b (count leading sign bits)
// clz 4s,2s,8h,4h,16b,8b (count leading zero bits)
// cmeq d
// cmge d
// cmgt d
// cmhi d
// cmhs d
// cmtst d
// cmeq 2d,4s,2s,8h,4h,16b,8b
// cmge 2d,4s,2s,8h,4h,16b,8b
// cmgt 2d,4s,2s,8h,4h,16b,8b
// cmhi 2d,4s,2s,8h,4h,16b,8b
// cmhs 2d,4s,2s,8h,4h,16b,8b
// cmtst 2d,4s,2s,8h,4h,16b,8b
test_cmeq_2d_2d_2d(TyD);
test_cmeq_4s_4s_4s(TyS);
test_cmeq_2s_2s_2s(TyS);
test_cmeq_8h_8h_8h(TyH);
test_cmeq_4h_4h_4h(TyH);
test_cmeq_16b_16b_16b(TyB);
test_cmeq_8b_8b_8b(TyB);
test_cmge_2d_2d_2d(TyD);
test_cmge_4s_4s_4s(TyS);
test_cmge_2s_2s_2s(TyS);
test_cmge_8h_8h_8h(TyH);
test_cmge_4h_4h_4h(TyH);
test_cmge_16b_16b_16b(TyB);
test_cmge_8b_8b_8b(TyB);
test_cmgt_2d_2d_2d(TyD);
test_cmgt_4s_4s_4s(TyS);
test_cmgt_2s_2s_2s(TyS);
test_cmgt_8h_8h_8h(TyH);
test_cmgt_4h_4h_4h(TyH);
test_cmgt_16b_16b_16b(TyB);
test_cmgt_8b_8b_8b(TyB);
test_cmhi_2d_2d_2d(TyD);
test_cmhi_4s_4s_4s(TyS);
test_cmhi_2s_2s_2s(TyS);
test_cmhi_8h_8h_8h(TyH);
test_cmhi_4h_4h_4h(TyH);
test_cmhi_16b_16b_16b(TyB);
test_cmhi_8b_8b_8b(TyB);
test_cmhs_2d_2d_2d(TyD);
test_cmhs_4s_4s_4s(TyS);
test_cmhs_2s_2s_2s(TyS);
test_cmhs_8h_8h_8h(TyH);
test_cmhs_4h_4h_4h(TyH);
test_cmhs_16b_16b_16b(TyB);
test_cmhs_8b_8b_8b(TyB);
test_cmtst_2d_2d_2d(TyD);
test_cmtst_4s_4s_4s(TyS);
test_cmtst_2s_2s_2s(TyS);
test_cmtst_8h_8h_8h(TyH);
test_cmtst_4h_4h_4h(TyH);
test_cmtst_16b_16b_16b(TyB);
test_cmtst_8b_8b_8b(TyB);
// cmeq_z d
// cmge_z d
// cmgt_z d
// cmle_z d
// cmlt_z d
// cmeq_z 2d,4s,2s,8h,4h,16b,8b
// cmge_z 2d,4s,2s,8h,4h,16b,8b
// cmgt_z 2d,4s,2s,8h,4h,16b,8b
// cmle_z 2d,4s,2s,8h,4h,16b,8b
// cmlt_z 2d,4s,2s,8h,4h,16b,8b
test_cmeq_zero_2d_2d(TyD);
test_cmeq_zero_4s_4s(TyS);
test_cmeq_zero_2s_2s(TyS);
test_cmeq_zero_8h_8h(TyH);
test_cmeq_zero_4h_4h(TyH);
test_cmeq_zero_16b_16b(TyB);
test_cmeq_zero_8b_8b(TyB);
test_cmge_zero_2d_2d(TyD);
test_cmge_zero_4s_4s(TyS);
test_cmge_zero_2s_2s(TyS);
test_cmge_zero_8h_8h(TyH);
test_cmge_zero_4h_4h(TyH);
test_cmge_zero_16b_16b(TyB);
test_cmge_zero_8b_8b(TyB);
test_cmgt_zero_2d_2d(TyD);
test_cmgt_zero_4s_4s(TyS);
test_cmgt_zero_2s_2s(TyS);
test_cmgt_zero_8h_8h(TyH);
test_cmgt_zero_4h_4h(TyH);
test_cmgt_zero_16b_16b(TyB);
test_cmgt_zero_8b_8b(TyB);
test_cmle_zero_2d_2d(TyD);
test_cmle_zero_4s_4s(TyS);
test_cmle_zero_2s_2s(TyS);
test_cmle_zero_8h_8h(TyH);
test_cmle_zero_4h_4h(TyH);
test_cmle_zero_16b_16b(TyB);
test_cmle_zero_8b_8b(TyB);
test_cmlt_zero_2d_2d(TyD);
test_cmlt_zero_4s_4s(TyS);
test_cmlt_zero_2s_2s(TyS);
test_cmlt_zero_8h_8h(TyH);
test_cmlt_zero_4h_4h(TyH);
test_cmlt_zero_16b_16b(TyB);
test_cmlt_zero_8b_8b(TyB);
// cnt 16b,8b (population count per byte)
// dup d,s,h,b (vec elem to scalar)
// dup 2d,4s,2s,8h,4h,16b,8b (vec elem to vector)
// dup 2d,4s,2s,8h,4h,16b,8b (general reg to vector)
// ext 16b,8b,#imm4 (concat 2 vectors, then slice)
// ins d[]_d[],s[]_s[],h[]_h[],b[]_b[]
// ins d[]_x, s[]_w, h[]_w, b[]_w
test_INS_general();
// mla 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
// mls 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
// mla 4s,2s,8h,4h,16b,8b
// mls 4s,2s,8h,4h,16b,8b
test_mla_4s_4s_4s(TyS);
test_mla_2s_2s_2s(TyS);
test_mla_8h_8h_8h(TyH);
test_mla_4h_4h_4h(TyH);
test_mla_16b_16b_16b(TyB);
test_mla_8b_8b_8b(TyB);
test_mls_4s_4s_4s(TyS);
test_mls_2s_2s_2s(TyS);
test_mls_8h_8h_8h(TyH);
test_mls_4h_4h_4h(TyH);
test_mls_16b_16b_16b(TyB);
test_mls_8b_8b_8b(TyB);
// movi 16b,8b #imm8, LSL #0
// movi 8h,4h #imm8, LSL #0 or 8
// movi 4s,2s #imm8, LSL #0, 8, 16, 24
// movi 4s,2s #imm8, MSL #8 or 16
// movi d, #imm64
// movi 2d, #imm64
// mul 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
// mul 4s,2s,8h,4h,16b,8b
test_mul_4s_4s_4s(TyS);
test_mul_2s_2s_2s(TyS);
test_mul_8h_8h_8h(TyH);
test_mul_4h_4h_4h(TyH);
test_mul_16b_16b_16b(TyB);
test_mul_8b_8b_8b(TyB);
// mvni 8h,4h #imm8, LSL #0 or 8
// mvni 4s,2s #imm8, LSL #0, 8, 16, 24
// mvni 4s,2s #imm8, MSL #8 or 16
// not 16b,8b
// pmul 16b,8b
// pmull{2} 8h_8b_8b,8h_16b_16b,1q_1d_1d,1d_2d_2d
// rbit 16b,8b
// rev16 16b,8b
// rev32 16b,8b,8h,4h
// rev64 16b,8b,8h,4h,4s,2s
// saba 16b,8b,8h,4h,4s,2s
// uaba 16b,8b,8h,4h,4s,2s
// sabal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// uabal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// sabd 16b,8b,8h,4h,4s,2s
// uabd 16b,8b,8h,4h,4s,2s
// sabdl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// uabdl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// sadalp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
// uadalp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
// saddl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// uaddl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// ssubl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// usubl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// saddlp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
// uaddlp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
// saddlv h_16b/8b, s_8h/4h, d_4s
// uaddlv h_16b/8b, s_8h/4h, d_4s
// saddw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
// uaddw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
// ssubw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
// usubw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
// shadd 16b,8b,8h,4h,4s,2s
// uhadd 16b,8b,8h,4h,4s,2s
// shsub 16b,8b,8h,4h,4s,2s
// uhsub 16b,8b,8h,4h,4s,2s
// shll{2} 8h_8b/16b_#8, 4s_4h/8h_#16, 2d_2s/4s_#32
// shrn{2} 2s/4s_2d, 8h/4h_4s, 2s/4s_2d, #imm in 1 .. elem_bits
// rshrn{2} 2s/4s_2d, 8h/4h_4s, 2s/4s_2d, #imm in 1 .. elem_bits
// sli d_#imm
// sri d_#imm
// sli 2d,4s,2s,8h,4h,16b,8b _#imm
// sri 2d,4s,2s,8h,4h,16b,8b _#imm
// smax 4s,2s,8h,4h,16b,8b
// umax 4s,2s,8h,4h,16b,8b
// smin 4s,2s,8h,4h,16b,8b
// umin 4s,2s,8h,4h,16b,8b
test_smax_4s_4s_4s(TyS);
test_smax_2s_2s_2s(TyS);
test_smax_8h_8h_8h(TyH);
test_smax_4h_4h_4h(TyH);
test_smax_16b_16b_16b(TyB);
test_smax_8b_8b_8b(TyB);
test_umax_4s_4s_4s(TyS);
test_umax_2s_2s_2s(TyS);
test_umax_8h_8h_8h(TyH);
test_umax_4h_4h_4h(TyH);
test_umax_16b_16b_16b(TyB);
test_umax_8b_8b_8b(TyB);
test_smin_4s_4s_4s(TyS);
test_smin_2s_2s_2s(TyS);
test_smin_8h_8h_8h(TyH);
test_smin_4h_4h_4h(TyH);
test_smin_16b_16b_16b(TyB);
test_smin_8b_8b_8b(TyB);
test_umin_4s_4s_4s(TyS);
test_umin_2s_2s_2s(TyS);
test_umin_8h_8h_8h(TyH);
test_umin_4h_4h_4h(TyH);
test_umin_16b_16b_16b(TyB);
test_umin_8b_8b_8b(TyB);
// smaxp 4s,2s,8h,4h,16b,8b
// umaxp 4s,2s,8h,4h,16b,8b
// sminp 4s,2s,8h,4h,16b,8b
// uminp 4s,2s,8h,4h,16b,8b
// smaxv s_4s,h_8h,h_4h,b_16b,b_8b
// umaxv s_4s,h_8h,h_4h,b_16b,b_8b
// sminv s_4s,h_8h,h_4h,b_16b,b_8b
// uminv s_4s,h_8h,h_4h,b_16b,b_8b
test_SMAXV();
test_UMAXV();
test_SMINV();
test_UMINV();
// smlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
// umlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
// smlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
// umlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
// smull{2} 2d_2s/4s_s[]. 4s_4h/8h_h[]
// umull{2} 2d_2s/4s_s[]. 4s_4h/8h_h[]
// smlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// umlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// smlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// umlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// smull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// umull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
// smov w_b[], w_h[], x_b[], x_h[], x_s[]
// umov w_b[], w_h[], x_b[], x_h[], x_s[]
// INCOMPLETE
test_umov_x_d0(TyD);
test_umov_x_d1(TyD);
test_umov_w_s0(TyS);
test_umov_w_s3(TyS);
test_umov_w_h0(TyH);
test_umov_w_h7(TyH);
test_umov_w_b0(TyB);
test_umov_w_b15(TyB);
test_smov_x_s0(TyS);
test_smov_x_s3(TyS);
test_smov_x_h0(TyH);
test_smov_x_h7(TyH);
test_smov_w_h0(TyH);
test_smov_w_h7(TyH);
test_smov_x_b0(TyB);
test_smov_x_b15(TyB);
test_smov_w_b0(TyB);
test_smov_w_b15(TyB);
// sqabs d,s,h,b
// sqneg d,s,h,b
// sqabs 2d,4s,2s,8h,4h,16b,8b
// sqneg 2d,4s,2s,8h,4h,16b,8b
// sqadd d,s,h,b
// uqadd d,s,h,b
// sqsub d,s,h,b
// uqsub d,s,h,b
// sqadd 2d,4s,2s,8h,4h,16b,8b
// uqadd 2d,4s,2s,8h,4h,16b,8b
// sqsub 2d,4s,2s,8h,4h,16b,8b
// uqsub 2d,4s,2s,8h,4h,16b,8b
// sqdmlal d_s_s[], s_h_h[]
// sqdmlsl d_s_s[], s_h_h[]
// sqdmull d_s_s[], s_h_h[]
// sqdmlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
// sqdmlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
// sqdmull{2} 2d_2s/4s_s[], 4s_4h/2h_h[]
// sqdmlal d_s_s, s_h_h
// sqdmlsl d_s_s, s_h_h
// sqdmull d_s_s, s_h_h
// sqdmlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
// sqdmlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
// sqdmull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
// sqdmulh s_s_s[], h_h_h[]
// sqrdmulh s_s_s[], h_h_h[]
// sqdmulh 4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
// sqrdmulh 4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
// sqdmulh h,s
// sqrdmulh h,s
// sqdmulh 4s,2s,8h,4h
// sqrdmulh 4s,2s,8h,4h
// sqshl d,s,h,b
// uqshl d,s,h,b
// sqrshl d,s,h,b
// uqrshl d,s,h,b
// sqshl 2d,4s,2s,8h,4h,16b,8b
// uqshl 2d,4s,2s,8h,4h,16b,8b
// sqrshl 2d,4s,2s,8h,4h,16b,8b
// uqrshl 2d,4s,2s,8h,4h,16b,8b
// sqrshrn s_d, h_s, b_h #imm
// uqrshrn s_d, h_s, b_h #imm
// sqshrn s_d, h_s, b_h #imm
// uqshrn s_d, h_s, b_h #imm
// sqrshrun s_d, h_s, b_h #imm
// sqshrun s_d, h_s, b_h #imm
// sqrshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
// uqrshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
// sqshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
// uqshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
// sqrshrun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
// sqshrun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
// sqshl d,s,h,b _#imm
// uqshl d,s,h,b _#imm
// sqshlu d,s,h,b _#imm
// sqshl 2d,4s,2s,8h,4h,16b,8b _#imm
// uqshl 2d,4s,2s,8h,4h,16b,8b _#imm
// sqshlu 2d,4s,2s,8h,4h,16b,8b _#imm
// sqxtn s_d,h_s,b_h
// uqxtn s_d,h_s,b_h
// sqxtun s_d,h_s,b_h
// sqxtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
// uqxtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
// sqxtun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
// srhadd 4s,2s,8h,4h,16b,8b
// urhadd 4s,2s,8h,4h,16b,8b
// sshl (reg) d
// ushl (reg) d
// sshl (reg) 2d,4s,2s,8h,4h,16b,8b
// ushl (reg) 2d,4s,2s,8h,4h,16b,8b
// shl (imm) d
// sshr (imm) d
// ushr (imm) d
// shl (imm) 16b,8b,8h,4h,4s,2s,2d
// sshr (imm) 2d,4s,2s,8h,4h,16b,8b
// ushr (imm) 2d,4s,2s,8h,4h,16b,8b
test_shl_2d_2d_1(TyD);
test_shl_2d_2d_13(TyD);
test_shl_2d_2d_63(TyD);
test_shl_4s_4s_1(TyS);
test_shl_4s_4s_13(TyS);
test_shl_4s_4s_31(TyS);
test_shl_2s_2s_1(TyS);
test_shl_2s_2s_13(TyS);
test_shl_2s_2s_31(TyS);
test_shl_8h_8h_1(TyH);
test_shl_8h_8h_13(TyH);
test_shl_8h_8h_15(TyH);
test_shl_4h_4h_1(TyH);
test_shl_4h_4h_13(TyH);
test_shl_4h_4h_15(TyH);
test_shl_16b_16b_1(TyB);
test_shl_16b_16b_7(TyB);
test_shl_8b_8b_1(TyB);
test_shl_8b_8b_7(TyB);
test_sshr_2d_2d_1(TyD);
test_sshr_2d_2d_13(TyD);
test_sshr_2d_2d_63(TyD);
test_sshr_4s_4s_1(TyS);
test_sshr_4s_4s_13(TyS);
test_sshr_4s_4s_31(TyS);
test_sshr_2s_2s_1(TyS);
test_sshr_2s_2s_13(TyS);
test_sshr_2s_2s_31(TyS);
test_sshr_8h_8h_1(TyH);
test_sshr_8h_8h_13(TyH);
test_sshr_8h_8h_15(TyH);
test_sshr_4h_4h_1(TyH);
test_sshr_4h_4h_13(TyH);
test_sshr_4h_4h_15(TyH);
test_sshr_16b_16b_1(TyB);
test_sshr_16b_16b_7(TyB);
test_sshr_8b_8b_1(TyB);
test_sshr_8b_8b_7(TyB);
test_ushr_2d_2d_1(TyD);
test_ushr_2d_2d_13(TyD);
test_ushr_2d_2d_63(TyD);
test_ushr_4s_4s_1(TyS);
test_ushr_4s_4s_13(TyS);
test_ushr_4s_4s_31(TyS);
test_ushr_2s_2s_1(TyS);
test_ushr_2s_2s_13(TyS);
test_ushr_2s_2s_31(TyS);
test_ushr_8h_8h_1(TyH);
test_ushr_8h_8h_13(TyH);
test_ushr_8h_8h_15(TyH);
test_ushr_4h_4h_1(TyH);
test_ushr_4h_4h_13(TyH);
test_ushr_4h_4h_15(TyH);
test_ushr_16b_16b_1(TyB);
test_ushr_16b_16b_7(TyB);
test_ushr_8b_8b_1(TyB);
test_ushr_8b_8b_7(TyB);
// ssra (imm) d
// usra (imm) d
// ssra (imm) 2d,4s,2s,8h,4h,16b,8b
// usra (imm) 2d,4s,2s,8h,4h,16b,8b
// srshl (reg) d
// urshl (reg) d
// srshl (reg) 2d,4s,2s,8h,4h,16b,8b
// urshl (reg) 2d,4s,2s,8h,4h,16b,8b
// srshr (imm) d
// urshr (imm) d
// srshr (imm) 2d,4s,2s,8h,4h,16b,8b
// urshr (imm) 2d,4s,2s,8h,4h,16b,8b
// srsra (imm) d
// ursra (imm) d
// srsra (imm) 2d,4s,2s,8h,4h,16b,8b
// ursra (imm) 2d,4s,2s,8h,4h,16b,8b
// sshll{2} (imm) 2d_2s/4s, 4s_4h/8h, 8h_8b/16b
// ushll{2} (imm) 2d_2s/4s, 4s_4h/8h, 8h_8b/16b
// INCOMPLETE
test_sshll_2d_2s_0(TyS);
test_sshll_2d_2s_15(TyS);
test_sshll_2d_2s_31(TyS);
test_sshll2_2d_4s_0(TyS);
test_sshll2_2d_4s_15(TyS);
test_sshll2_2d_4s_31(TyS);
test_ushll_2d_2s_0(TyS);
test_ushll_2d_2s_15(TyS);
test_ushll_2d_2s_31(TyS);
test_ushll2_2d_4s_0(TyS);
test_ushll2_2d_4s_15(TyS);
test_ushll2_2d_4s_31(TyS);
// suqadd d,s,h,b
// suqadd 2d,4s,2s,8h,4h,16b,8b
// tbl 8b_{16b}_8b, 16b_{16b}_16b
// tbl 8b_{16b,16b}_8b, 16b_{16b,16b}_16b
// tbl 8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
// tbl 8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
test_tbl_16b_1reg(TyB);
test_tbl_16b_2reg(TyB);
test_tbl_16b_3reg(TyB);
test_tbl_16b_4reg(TyB);
test_tbl_8b_1reg(TyB);
test_tbl_8b_2reg(TyB);
test_tbl_8b_3reg(TyB);
test_tbl_8b_4reg(TyB);
// tbx 8b_{16b}_8b, 16b_{16b}_16b
// tbx 8b_{16b,16b}_8b, 16b_{16b,16b}_16b
// tbx 8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
// tbx 8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
test_tbx_16b_1reg(TyB);
test_tbx_16b_2reg(TyB);
test_tbx_16b_3reg(TyB);
test_tbx_16b_4reg(TyB);
test_tbx_8b_1reg(TyB);
test_tbx_8b_2reg(TyB);
test_tbx_8b_3reg(TyB);
test_tbx_8b_4reg(TyB);
// trn1 2d,4s,2s,8h,4h,16b,8b
// trn2 2d,4s,2s,8h,4h,16b,8b
// urecpe 4s,2s
// ursqrte 4s,2s
// usqadd d,s,h,b
// usqadd 2d,4s,2s,8h,4h,16b,8b
// uzp1 2d,4s,2s,8h,4h,16b,8b
// uzp2 2d,4s,2s,8h,4h,16b,8b
// xtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
test_xtn_2s_2d(TyD);
test_xtn2_4s_2d(TyD);
test_xtn_4h_4s(TyS);
test_xtn2_8h_4s(TyS);
test_xtn_8b_8h(TyH);
test_xtn2_16b_8h(TyH);
// zip1 2d,4s,2s,8h,4h,16b,8b
// zip2 2d,4s,2s,8h,4h,16b,8b
// ======================== MEM ========================
// ld1 (multiple 1-element structures to 1/2/3/4 regs)
// ld1 (single 1-element structure to one lane of 1 reg)
// ld1r (single 1-element structure and rep to all lanes of 1 reg)
// ld2 (multiple 2-element structures to 2 regs)
// ld2 (single 2-element structure to one lane of 2 regs)
// ld2r (single 2-element structure and rep to all lanes of 2 regs)
// ld3 (multiple 3-element structures to 3 regs)
// ld3 (single 3-element structure to one lane of 3 regs)
// ld3r (single 3-element structure and rep to all lanes of 3 regs)
// ld4 (multiple 4-element structures to 4 regs)
// ld4 (single 4-element structure to one lane of 4 regs)
// ld4r (single 4-element structure and rep to all lanes of 4 regs)
// ldnp q_q_addr,d_d_addr,s_s_addr (load pair w/ non-temporal hint)
// addr = reg + uimm7 * reg_size
// ldp q_q_addr,d_d_addr,s_s_addr (load pair)
// addr = [Xn|SP],#imm or [Xn|SP,#imm]! or [Xn|SP,#imm]
// ldr q,d,s,h,b from addr
// addr = [Xn|SP],#imm or [Xn|SP,#imm]! or [Xn|SP,#imm]
// ldr q,d,s from pc+#imm19
// ldr q,d,s,h,b from addr
// addr = [Xn|SP, R <extend> <shift]
// ldur q,d,s,h,b from addr
// addr = [Xn|SP,#imm] (unscaled offset)
// st1 (multiple 1-element structures from 1/2/3/4 regs)
// st1 (single 1-element structure for 1 lane of 1 reg)
// st2 (multiple 2-element structures from 2 regs)
// st2 (single 2-element structure from 1 lane of 2 regs)
// st3 (multiple 3-element structures from 3 regs)
// st3 (single 3-element structure from 1 lane of 3 regs)
// st4 (multiple 4-element structures from 4 regs)
// st4 (single 4-element structure from one lane of 4 regs)
// stnp q_q_addr, d_d_addr, s_s_addr
// addr = [Xn|SP, #imm]
// stp q_q_addr, d_d_addr, s_s_addr
// addr = [Xn|SP], #imm or [Xn|SP, #imm]! or [Xn|SP, #imm]
// str q,d,s,h,b_addr
// addr = [Xn|SP], #simm or [Xn|SP, #simm]! or [Xn|SP, #pimm]
// str q,d,s,h,b_addr
// addr = [Xn|SP, R <extend> <shift]
// stur q,d,s,h,b_addr
// addr = [Xn|SP,#imm] (unscaled offset)
// ======================== CRYPTO ========================
// aesd 16b (aes single round decryption)
// aese 16b (aes single round encryption)
// aesimc 16b (aes inverse mix columns)
// aesmc 16b (aes mix columns)
// sha1c q_s_4s
// sha1h s_s
// sha1m q_s_4s
// sha1p q_s_4s
// sha1su0 4s_4s_4s
// sha1su1 4s_4s
// sha256h2 q_q_4s
// sha256h q_q_4s
// sha256su0 4s_4s
// sha256su1 4s_4s_4s
return 0;
}
/* ---------------------------------------------------------------- */
/* -- Alphabetical list of insns -- */
/* ---------------------------------------------------------------- */
/*
abs d
abs 2d,4s,2s,8h,4h,16b,8b
add d
add 2d,4s,2s,8h,4h,16b,8b
addhn 2s.2d.2d, 4s.2d.2d, h_from_s and b_from_h (add and get high half)
addp d (add pairs, across)
addp 2d,4s,2s,8h,4h,16b,8b
addv 4s,8h,4h,16b,18b (reduce across vector)
aesd 16b (aes single round decryption)
aese 16b (aes single round encryption)
aesimc 16b (aes inverse mix columns)
aesmc 16b (aes mix columns)
and 16b,8b
bic 4s,2s,8h,4h (vector, imm)
also movi, mvni, orr
bic 16b,8b (vector,reg) (bit clear)
bif 16b,8b (vector) (bit insert if false)
bit 16b,8b (vector) (bit insert if true)
bsl 16b,8b (vector) (bit select)
cls 4s,2s,8h,4h,16b,8b (count leading sign bits)
clz 4s,2s,8h,4h,16b,8b (count leading zero bits)
cmeq d
cmeq 2d,4s,2s,8h,4h,16b,8b
cmeq_z d
cmeq_z 2d,4s,2s,8h,4h,16b,8b
cmge d
cmge 2d,4s,2s,8h,4h,16b,8b
cmge_z d
cmge_z 2d,4s,2s,8h,4h,16b,8b
cmgt d
cmgt 2d,4s,2s,8h,4h,16b,8b
cmgt_z d
cmgt_z 2d,4s,2s,8h,4h,16b,8b
cmhi d
cmhi 2d,4s,2s,8h,4h,16b,8b
cmhs d
cmhs 2d,4s,2s,8h,4h,16b,8b
cmle_z d
cmle_z 2d,4s,2s,8h,4h,16b,8b
cmlt_z d
cmlt_z 2d,4s,2s,8h,4h,16b,8b
cmtst d
cmtst 2d,4s,2s,8h,4h,16b,8b
cnt 16b,8b (population count per byte)
dup d,s,h,b (vec elem to scalar)
dup 2d,4s,2s,8h,4h,16b,8b (vec elem to vector)
dup 2d,4s,2s,8h,4h,16b,8b (general reg to vector)
eor 16b,8b (vector)
ext 16b,8b,#imm4 (concat 2 vectors, then slice)
fabd d,s
fabd 2d,4s,2s
fabs d,s
fabs 2d,4s,2s
facge s,d (floating abs compare GE)
facge 2d,4s,2s
facgt s,d (floating abs compare GE)
facgt 2d,4s,2s
fadd d,s
fadd 2d,4s,2s
faddp d,s (floating add pair)
faddp 2d,4s,2s
fccmp d,s (floating point conditional quiet compare)
fccmpe d,s (floating point conditional signaling compare)
fcmeq d,s
fcmeq 2d,4s,2s
fcmeq_z d,s
fcmeq_z 2d,4s,2s
fcmge d,s
fcmge 2d,4s,2s
fcmge_z d,s
fcmge_z 2d,4s,2s
fcmgt d,s
fcmgt 2d,4s,2s
fcmgt_z d,s
fcmgt_z 2d,4s,2s
fcmle_z d,s
fcmle_z 2d,4s,2s
fcmlt_z d,s
fcmlt_z 2d,4s,2s
fcmp d,s (floating point quiet, set flags)
fcmp_z d,s
fcmpe d,s (floating point signaling, set flags)
fcmpe_z d,s
fcsel d,s (fp cond select)
fcvt s_h,d_h,h_s,d_s,h_d,s_d (fp convert, scalar)
fcvtas d,s (fcvt to signed int, nearest, ties away)
fcvtas 2d,4s,2s
fcvtas w_s,x_s,w_d,x_d
fcvtau d,s (fcvt to unsigned int, nearest, ties away)
fcvtau 2d,4s,2s
fcvtau w_s,x_s,w_d,x_d
fcvtl{2} 4s/4h, 4s/8h, 2d/2s, 2d/4s (float convert to longer form)
fcvtms d,s (fcvt to signed int, minus inf)
fcvtms 2d,4s,2s
fcvtms w_s,x_s,w_d,x_d
fcvtmu d,s (fcvt to unsigned int, minus inf)
fcvtmu 2d,4s,2s
fcvtmu w_s,x_s,w_d,x_d
fcvtn{2} 4h/4s, 8h/4s, 2s/2d, 4s/2d (float convert to narrower form)
fcvtns d,s (fcvt to signed int, nearest)
fcvtns 2d,4s,2s
fcvtns w_s,x_s,w_d,x_d
fcvtnu d,s (fcvt to unsigned int, nearest)
fcvtnu 2d,4s,2s
fcvtnu w_s,x_s,w_d,x_d
fcvtps d,s (fcvt to signed int, plus inf)
fcvtps 2d,4s,2s
fcvtps w_s,x_s,w_d,x_d
fcvtpu d,s (fcvt to unsigned int, plus inf)
fcvtpu 2d,4s,2s
fcvtpu w_s,x_s,w_d,x_d
fcvtxn s_d (fcvt to lower prec narrow, rounding to odd)
fcvtxn 2s_2d,4s_2d
fcvtzs s,d (fcvt to signed fixedpt, to zero) (w/ #fbits)
fcvtzs 2d,4s,2s
fcvtzs s,d (fcvt to signed integer, to zero)
fcvtzs 2d,4s,2s
fcvtzs w_s,x_s,w_d,x_d (fcvt to signed fixedpt, to zero) (w/ #fbits)
fcvtzs w_s,x_s,w_d,x_d (fcvt to signed integer, to zero)
fcvtzu s,d (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
fcvtzu 2d,4s,2s
fcvtzu s,d (fcvt to unsigned integer, to zero)
fcvtzu 2d,4s,2s
fcvtzu w_s,x_s,w_d,x_d (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
fcvtzu w_s,x_s,w_d,x_d (fcvt to unsigned integer, to zero)
fdiv d,s
fdiv 2d,4s,2s
fmadd d,s
fnmadd d,s
fnmsub d,s
fnmul d,s
fmax d,s
fmin d,s
fmax 2d,4s,2s
fmin 2d,4s,2s
fmaxnm d,s ("max number")
fminnm d,s
fmaxnm 2d,4s,2s
fminnm 2d,4s,2s
fmaxnmp d_2d,s_2s ("max number pairwise")
fminnmp d_2d,s_2s
fmaxnmp 2d,4s,2s
fminnmp 2d,4s,2s
fmaxnmv s_4s (maxnum across vector)
fminnmv s_4s
fmaxp d_2d,s_2s (max of a pair)
fminp d_2d,s_2s (max of a pair)
fmaxp 2d,4s,2s (max pairwise)
fminp 2d,4s,2s
fmaxv s_4s (max across vector)
fminv s_4s
fmla d_d_d[],s_s_s[] (by element)
fmla 2d_2d_d[],4s_4s_s[],2s_2s_s[]
fmla 2d,4s,2s
fmls d_d_d[],s_s_s[] (by element)
fmls 2d_2d_d[],4s_4s_s[],2s_2s_s[]
fmls 2d,4s,2s
fmov 2d,4s,2s #imm (part of the MOVI/MVNI/ORR/BIC imm group)
fmov d_d,s_s
fmov s_w,w_s,d_x,d[1]_x,x_d,x_d[1]
fmov d,s #imm
fmsub d,s
fmul d_d_d[],s_s_s[]
fmul 2d_2d_d[],4s_4s_s[],2s_2s_s[]
fmul 2d,4s,2s
fmul d,s
fmulx d_d_d[],s_s_s[]
fmulx 2d_2d_d[],4s_4s_s[],2s_2s_s[]
fmulx d,s
fmulx 2d,4s,2s
fneg d,s
fneg 2d,4s,2s
frecpe d,s (recip estimate)
frecpe 2d,4s,2s
frecps d,s (recip step)
frecps 2d,4s,2s
frecpx d,s (recip exponent)
frinta 2d,4s,2s (round to integral, nearest away)
frinta d,s
frinti 2d,4s,2s (round to integral, per FPCR)
frinti d,s
frintm 2d,4s,2s (round to integral, minus inf)
frintm d,s
frintn 2d,4s,2s (round to integral, nearest, to even)
frintn d,s
frintp 2d,4s,2s (round to integral, plus inf)
frintp d,s
frintx 2d,4s,2s (round to integral exact, per FPCR)
frintx d,s
frintz 2d,4s,2s (round to integral, zero)
frintz d,s
frsqrte d,s (est)
frsqrte 2d,4s,2s
frsqrts d,s (step)
frsqrts 2d,4s,2s
fsqrt d,s
fsqrt 2d,4s,2s
fsub d,s
fsub 2d,4s,2s
ins d[]_d[],s[]_s[],h[]_h[],b[]_b[]
ins d[]_x, s[]_w, h[]_w, b[]_w
ld1 (multiple 1-element structures to 1/2/3/4 regs)
ld1 (single 1-element structure to one lane of 1 reg)
ld1r (single 1-element structure and rep to all lanes of 1 reg)
ld2 (multiple 2-element structures to 2 regs)
ld2 (single 2-element structure to one lane of 2 regs)
ld2r (single 2-element structure and rep to all lanes of 2 regs)
ld3 (multiple 3-element structures to 3 regs)
ld3 (single 3-element structure to one lane of 3 regs)
ld3r (single 3-element structure and rep to all lanes of 3 regs)
ld4 (multiple 4-element structures to 4 regs)
ld4 (single 4-element structure to one lane of 4 regs)
ld4r (single 4-element structure and rep to all lanes of 4 regs)
ldnp q_q_addr,d_d_addr,s_s_addr (load pair w/ non-temporal hint)
addr = reg + uimm7 * reg_size
ldp q_q_addr,d_d_addr,s_s_addr (load pair)
addr = [Xn|SP],#imm or [Xn|SP,#imm]! or [Xn|SP,#imm]
ldr q,d,s,h,b from addr
addr = [Xn|SP],#imm or [Xn|SP,#imm]! or [Xn|SP,#imm]
ldr q,d,s from pc+#imm19
ldr q,d,s,h,b from addr
addr = [Xn|SP, R <extend> <shift]
ldur q,d,s,h,b from addr
addr = [Xn|SP,#imm] (unscaled offset)
mla 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
mla 4s,2s,8h,4h,16b,8b
mls 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
mls 4s,2s,8h,4h,16b,8b
movi 16b,8b #imm8, LSL #0
movi 8h,4h #imm8, LSL #0 or 8
movi 4s,2s #imm8, LSL #0, 8, 16, 24
movi 4s,2s #imm8, MSL #8 or 16
movi d, #imm64
movi 2d, #imm64
mul 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
mul 4s,2s,8h,4h,16b,8b
mvni 8h,4h #imm8, LSL #0 or 8
mvni 4s,2s #imm8, LSL #0, 8, 16, 24
mvni 4s,2s #imm8, MSL #8 or 16
neg d
neg 2d,4s,2s,8h,4h,16b,8b
not 16b,8b
orn 16b,8b
orr 8h,4h #imm8, LSL #0 or 8
orr 4s,2s #imm8, LSL #0, 8, 16 or 24
orr 16b,8b
pmul 16b,8b
pmull{2} 8h_8b_8b,8h_16b_16b,1q_1d_1d,1d_2d_2d
raddhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
rbit 16b,8b
rev16 16b,8b
rev32 16b,8b,8h,4h
rev64 16b,8b,8h,4h,4s,2s
rshrn{2} 2s/4s_2d, 8h/4h_4s, 2s/4s_2d, #imm in 1 .. elem_bits
rsubhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
saba 16b,8b,8h,4h,4s,2s
sabal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
sabd 16b,8b,8h,4h,4s,2s
sabdl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
sadalp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
saddl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
saddlp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
saddlv h_16b/8b, s_8h/4h, d_4s
saddw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
scvtf d,s _#fbits
scvtf 2d,4s,2s _#fbits
scvtf d,s
scvtf 2d,4s,2s
scvtf s_w, d_w, s_x, d_x, _#fbits
scvtf s_w, d_w, s_x, d_x
sha1c q_s_4s
sha1h s_s
sha1m q_s_4s
sha1p q_s_4s
sha1su0 4s_4s_4s
sha1su1 4s_4s
sha256h2 q_q_4s
sha256h q_q_4s
sha256su0 4s_4s
sha256su1 4s_4s_4s
shadd 16b,8b,8h,4h,4s,2s
shl d_#imm
shl 16b,8b,8h,4h,4s,2s,2d _#imm
shll{2} 8h_8b/16b_#8, 4s_4h/8h_#16, 2d_2s/4s_#32
shrn{2} 2s/4s_2d, 8h/4h_4s, 2s/4s_2d, #imm in 1 .. elem_bits
shsub 16b,8b,8h,4h,4s,2s
sli d_#imm
sli 2d,4s,2s,8h,4h,16b,8b _#imm
smax 4s,2s,8h,4h,16b,8b
smaxp 4s,2s,8h,4h,16b,8b
smaxv s_4s,h_8h,h_4h,b_16b,b_8b
smin 4s,2s,8h,4h,16b,8b
sminp 4s,2s,8h,4h,16b,8b
sminv s_4s,h_8h,h_4h,b_16b,b_8b
smlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
smlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
smlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
smlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
smov w_b[], w_h[], x_b[], x_h[], x_s[]
smull{2} 2d_2s/4s_s[]. 4s_4h/8h_h[]
smull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
sqabs d,s,h,b
sqabs 2d,4s,2s,8h,4h,16b,8b
sqadd d,s,h,b
sqadd 2d,4s,2s,8h,4h,16b,8b
sqdmlal d_s_s[], s_h_h[]
sqdmlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
sqdmlal d_s_s, s_h_h
sqdmlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
sqdmlsl d_s_s[], s_h_h[]
sqdmlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
sqdmlsl d_s_s, s_h_h
sqdmlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
sqdmulh s_s_s[], h_h_h[]
sqdmulh 4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
sqdmulh h,s
sqdmulh 4s,2s,8h,4h
sqdmull d_s_s[], s_h_h[]
sqdmull{2} 2d_2s/4s_s[], 4s_4h/2h_h[]
sqdmull d_s_s,s_h_h
sqdmull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
sqneg d,s,h,b
sqneg 2d,4s,2s,8h,4h,16b,8b
sqrdmulh s_s_s[], h_h_h[]
sqrdmulh 4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
sqrdmulh h,s
sqrdmulh 4s,2s,8h,4h
sqrshl d,s,h,b
sqrshl 2d,4s,2s,8h,4h,16b,8b
sqrshrn s_d, h_s, b_h #imm
sqrshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
sqrshrun s_d, h_s, b_h #imm
sqrshrun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
sqshl d,s,h,b _#imm
sqshl 2d,4s,2s,8h,4h,16b,8b _#imm
sqshl d,s,h,b
sqshl 2d,4s,2s,8h,4h,16b,8b
sqshlu d,s,h,b _#imm
sqshlu 2d,4s,2s,8h,4h,16b,8b _#imm
sqshrn s_d, h_s, b_h #imm
sqshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
sqshrun s_d, h_s, b_h #imm
sqshrun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
sqsub d,s,h,b
sqsub 2d,4s,2s,8h,4h,16b,8b
sqxtn s_d,h_s,b_h
sqxtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
sqxtun s_d,h_s,b_h
sqxtun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
srhadd 4s,2s,8h,4h,16b,8b
sri d_#imm
sri 2d,4s,2s,8h,4h,16b,8b _#imm
srshl (reg) d
srshl 2d,4s,2s,8h,4h,16b,8b
srshr (imm) d
srshr 2d,4s,2s,8h,4h,16b,8b
srsra (imm) d
srsra 2d,4s,2s,8h,4h,16b,8b
sshl (reg) d
sshl 2d,4s,2s,8h,4h,16b,8b
sshll{2} (imm) 2d_2s/4s 4s_4h/8h, 8h_8b/16b
sshr (imm) d
sshr 2d,4s,2s,8h,4h,16b,8b
ssra (imm) d
ssra 2d,4s,2s,8h,4h,16b,8b
ssubl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
ssubw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
st1 (multiple 1-element structures from 1/2/3/4 regs)
st1 (single 1-element structure for 1 lane of 1 reg)
st2 (multiple 2-element structures from 2 regs)
st2 (single 2-element structure from 1 lane of 2 regs)
st3 (multiple 3-element structures from 3 regs)
st3 (single 3-element structure from 1 lane of 3 regs)
st4 (multiple 4-element structures from 4 regs)
st4 (single 4-element structure from one lane of 4 regs)
stnp q_q_addr, d_d_addr, s_s_addr
addr = [Xn|SP, #imm]
stp q_q_addr, d_d_addr, s_s_addr
addr = [Xn|SP], #imm or [Xn|SP, #imm]! or [Xn|SP, #imm]
str q,d,s,h,b_addr
addr = [Xn|SP], #simm or [Xn|SP, #simm]! or [Xn|SP, #pimm]
str q,d,s,h,b_addr
addr = [Xn|SP, R <extend> <shift]
stur q,d,s,h,b_addr
addr = [Xn|SP,#imm] (unscaled offset)
sub d
sub 2d,4s,2s,8h,4h,16b,8b
subhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
suqadd d,s,h,b
suqadd 2d,4s,2s,8h,4h,16b,8b
tbl 8b_{16b}_8b, 16b_{16b}_16b
tbl 8b_{16b,16b}_8b, 16b_{16b,16b}_16b
tbl 8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
tbl 8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
tbx 8b_{16b}_8b, 16b_{16b}_16b
tbx 8b_{16b,16b}_8b, 16b_{16b,16b}_16b
tbx 8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
tbx 8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
trn1 2d,4s,2s,8h,4h,16b,8b
trn2 2d,4s,2s,8h,4h,16b,8b
uaba 16b,8b,8h,4h,4s,2s
uabal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
uabd 16b,8b,8h,4h,4s,2s
uabdl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
uadalp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
uaddl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
uaddlp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
uaddlv h_16b/8b, s_8h/4h, d_4s
uaddw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
ucvtf d,s _#fbits
ucvtf 2d,4s,2s _#fbits
ucvtf d,s
ucvtf 2d,4s,2s
ucvtf s_w, d_w, s_x, d_x, _#fbits
ucvtf s_w, d_w, s_x, d_x
uhadd 16b,8b,8h,4h,4s,2s
uhsub 16b,8b,8h,4h,4s,2s
umax 4s,2s,8h,4h,16b,8b
umaxp 4s,2s,8h,4h,16b,8b
umaxv s_4s,h_8h,h_4h,b_16b,b_8b
umin 4s,2s,8h,4h,16b,8b
uminp 4s,2s,8h,4h,16b,8b
uminv s_4s,h_8h,h_4h,b_16b,b_8b
umlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
umlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
umlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
umlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
umov w_b[], w_h[], x_b[], x_h[], x_s[]
umull{2} 2d_2s/4s_s[]. 4s_4h/8h_h[]
umull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
uqadd d,s,h,b
uqadd 2d,4s,2s,8h,4h,16b,8b
uqrshl d,s,h,b
uqrshl 2d,4s,2s,8h,4h,16b,8b
uqrshrn s_d, h_s, b_h #imm
uqrshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
uqshl d,s,h,b _#imm
uqshl 2d,4s,2s,8h,4h,16b,8b _#imm
uqshl d,s,h,b
uqshl 2d,4s,2s,8h,4h,16b,8b
uqshrn s_d, h_s, b_h #imm
uqshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
uqsub d,s,h,b
uqsub 2d,4s,2s,8h,4h,16b,8b
uqxtn s_d,h_s,b_h
uqxtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
urecpe 4s,2s
urhadd 4s,2s,8h,4h,16b,8b
urshl (reg) d
urshl 2d,4s,2s,8h,4h,16b,8b
urshr (imm) d
urshr 2d,4s,2s,8h,4h,16b,8b
ursqrte 4s,2s
ursra (imm) d
ursra 2d,4s,2s,8h,4h,16b,8b
ushl (reg) d
ushl 2d,4s,2s,8h,4h,16b,8b
ushll{2} (imm) 2d_2s/4s 4s_4h/8h, 8h_8b/16b
ushr (imm) d
ushr 2d,4s,2s,8h,4h,16b,8b
usqadd d,s,h,b
usqadd 2d,4s,2s,8h,4h,16b,8b
usra (imm) d
usra 2d,4s,2s,8h,4h,16b,8b
usubl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
usubw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
uzp1 2d,4s,2s,8h,4h,16b,8b
uzp2 2d,4s,2s,8h,4h,16b,8b
xtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
zip1 2d,4s,2s,8h,4h,16b,8b
zip2 2d,4s,2s,8h,4h,16b,8b
*/
/* ---------------------------------------------------------------- */
/* -- List of insns, grouped somewhat by laneage configuration -- */
/* ---------------------------------------------------------------- */
/*
======================== FP ========================
fabs d,s
fabs 2d,4s,2s
fneg d,s
fneg 2d,4s,2s
fsqrt d,s
fsqrt 2d,4s,2s
fadd d,s
fsub d,s
fadd 2d,4s,2s
fsub 2d,4s,2s
fabd d,s
fabd 2d,4s,2s
faddp d,s (floating add pair)
faddp 2d,4s,2s
fccmp d,s (floating point conditional quiet compare)
fccmpe d,s (floating point conditional signaling compare)
fcmeq d,s
fcmge d,s
fcmgt d,s
facgt d,s (floating abs compare GE)
facge d,s (floating abs compare GE)
fcmeq 2d,4s,2s
fcmge 2d,4s,2s
fcmgt 2d,4s,2s
facge 2d,4s,2s
facgt 2d,4s,2s
fcmeq_z d,s
fcmge_z d,s
fcmgt_z d,s
fcmle_z d,s
fcmlt_z d,s
fcmeq_z 2d,4s,2s
fcmge_z 2d,4s,2s
fcmgt_z 2d,4s,2s
fcmle_z 2d,4s,2s
fcmlt_z 2d,4s,2s
fcmp_z d,s
fcmpe_z d,s
fcmp d,s (floating point quiet, set flags)
fcmpe d,s (floating point signaling, set flags)
fcsel d,s (fp cond select)
fdiv d,s
fdiv 2d,4s,2s
fmadd d,s
fnmadd d,s
fmsub d,s
fnmsub d,s
fnmul d,s
fmax d,s
fmin d,s
fmaxnm d,s ("max number")
fminnm d,s
fmax 2d,4s,2s
fmin 2d,4s,2s
fmaxnm 2d,4s,2s
fminnm 2d,4s,2s
fmaxnmp d_2d,s_2s ("max number pairwise")
fminnmp d_2d,s_2s
fmaxnmp 2d,4s,2s
fminnmp 2d,4s,2s
fmaxnmv s_4s (maxnum across vector)
fminnmv s_4s
fmaxp d_2d,s_2s (max of a pair)
fminp d_2d,s_2s (max of a pair)
fmaxp 2d,4s,2s (max pairwise)
fminp 2d,4s,2s
fmaxv s_4s (max across vector)
fminv s_4s
fmla 2d,4s,2s
fmls 2d,4s,2s
fmla d_d_d[],s_s_s[] (by element)
fmls d_d_d[],s_s_s[] (by element)
fmla 2d_2d_d[],4s_4s_s[],2s_2s_s[]
fmls 2d_2d_d[],4s_4s_s[],2s_2s_s[]
fmov 2d,4s,2s #imm (part of the MOVI/MVNI/ORR/BIC imm group)
fmov d_d,s_s
fmov s_w,w_s,d_x,d[1]_x,x_d,x_d[1]
fmov d,s #imm
fmul d_d_d[],s_s_s[]
fmul 2d_2d_d[],4s_4s_s[],2s_2s_s[]
fmul 2d,4s,2s
fmul d,s
fmulx d_d_d[],s_s_s[]
fmulx 2d_2d_d[],4s_4s_s[],2s_2s_s[]
fmulx d,s
fmulx 2d,4s,2s
frecpe d,s (recip estimate)
frecpe 2d,4s,2s
frecps d,s (recip step)
frecps 2d,4s,2s
frecpx d,s (recip exponent)
frinta d,s
frinti d,s
frintm d,s
frintn d,s
frintp d,s
frintx d,s
frintz d,s
frinta 2d,4s,2s (round to integral, nearest away)
frinti 2d,4s,2s (round to integral, per FPCR)
frintm 2d,4s,2s (round to integral, minus inf)
frintn 2d,4s,2s (round to integral, nearest, to even)
frintp 2d,4s,2s (round to integral, plus inf)
frintx 2d,4s,2s (round to integral exact, per FPCR)
frintz 2d,4s,2s (round to integral, zero)
frsqrte d,s (est)
frsqrte 2d,4s,2s
frsqrts d,s (step)
frsqrts 2d,4s,2s
======================== CONV ========================
fcvt s_h,d_h,h_s,d_s,h_d,s_d (fp convert, scalar)
fcvtl{2} 4s/4h, 4s/8h, 2d/2s, 2d/4s (float convert to longer form)
fcvtn{2} 4h/4s, 8h/4s, 2s/2d, 4s/2d (float convert to narrower form)
fcvtas d,s (fcvt to signed int, nearest, ties away)
fcvtau d,s (fcvt to unsigned int, nearest, ties away)
fcvtas 2d,4s,2s
fcvtau 2d,4s,2s
fcvtas w_s,x_s,w_d,x_d
fcvtau w_s,x_s,w_d,x_d
fcvtms d,s (fcvt to signed int, minus inf)
fcvtmu d,s (fcvt to unsigned int, minus inf)
fcvtms 2d,4s,2s
fcvtmu 2d,4s,2s
fcvtms w_s,x_s,w_d,x_d
fcvtmu w_s,x_s,w_d,x_d
fcvtns d,s (fcvt to signed int, nearest)
fcvtnu d,s (fcvt to unsigned int, nearest)
fcvtns 2d,4s,2s
fcvtnu 2d,4s,2s
fcvtns w_s,x_s,w_d,x_d
fcvtnu w_s,x_s,w_d,x_d
fcvtps d,s (fcvt to signed int, plus inf)
fcvtpu d,s (fcvt to unsigned int, plus inf)
fcvtps 2d,4s,2s
fcvtpu 2d,4s,2s
fcvtps w_s,x_s,w_d,x_d
fcvtpu w_s,x_s,w_d,x_d
fcvtzs d,s (fcvt to signed integer, to zero)
fcvtzu d,s (fcvt to unsigned integer, to zero)
fcvtzs 2d,4s,2s
fcvtzu 2d,4s,2s
fcvtzs w_s,x_s,w_d,x_d
fcvtzu w_s,x_s,w_d,x_d
fcvtzs d,s (fcvt to signed fixedpt, to zero) (w/ #fbits)
fcvtzu d,s (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
fcvtzs 2d,4s,2s
fcvtzu 2d,4s,2s
fcvtzs w_s,x_s,w_d,x_d (fcvt to signed fixedpt, to zero) (w/ #fbits)
fcvtzu w_s,x_s,w_d,x_d (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
fcvtxn s_d (fcvt to lower prec narrow, rounding to odd)
fcvtxn 2s_2d,4s_2d
scvtf d,s _#fbits
ucvtf d,s _#fbits
scvtf 2d,4s,2s _#fbits
ucvtf 2d,4s,2s _#fbits
scvtf d,s
ucvtf d,s
scvtf 2d,4s,2s
ucvtf 2d,4s,2s
scvtf s_w, d_w, s_x, d_x, _#fbits
ucvtf s_w, d_w, s_x, d_x, _#fbits
scvtf s_w, d_w, s_x, d_x
ucvtf s_w, d_w, s_x, d_x
======================== INT ========================
abs d
neg d
abs 2d,4s,2s,8h,4h,16b,8b
neg 2d,4s,2s,8h,4h,16b,8b
add d
sub d
add 2d,4s,2s,8h,4h,16b,8b
sub 2d,4s,2s,8h,4h,16b,8b
addhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
subhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
raddhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
rsubhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
addp d (add pairs, across)
addp 2d,4s,2s,8h,4h,16b,8b
addv 4s,8h,4h,16b,18b (reduce across vector)
and 16b,8b
orr 8h,4h #imm8, LSL #0 or 8
orr 4s,2s #imm8, LSL #0, 8, 16 or 24
bic 8h,4h #imm8, LSL #0 or 8
bic 4s,2s #imm8, LSL #0, 8, 16 or 24
also movi, mvni
bic 16b,8b (vector,reg) (bit clear)
bif 16b,8b (vector) (bit insert if false)
bit 16b,8b (vector) (bit insert if true)
bsl 16b,8b (vector) (bit select)
cls 4s,2s,8h,4h,16b,8b (count leading sign bits)
clz 4s,2s,8h,4h,16b,8b (count leading zero bits)
cmeq d
cmge d
cmgt d
cmhi d
cmhs d
cmtst d
cmeq 2d,4s,2s,8h,4h,16b,8b
cmge 2d,4s,2s,8h,4h,16b,8b
cmgt 2d,4s,2s,8h,4h,16b,8b
cmhi 2d,4s,2s,8h,4h,16b,8b
cmhs 2d,4s,2s,8h,4h,16b,8b
cmtst 2d,4s,2s,8h,4h,16b,8b
cmeq_z d
cmge_z d
cmgt_z d
cmle_z d
cmlt_z d
cmeq_z 2d,4s,2s,8h,4h,16b,8b
cmge_z 2d,4s,2s,8h,4h,16b,8b
cmgt_z 2d,4s,2s,8h,4h,16b,8b
cmle_z 2d,4s,2s,8h,4h,16b,8b
cmlt_z 2d,4s,2s,8h,4h,16b,8b
cnt 16b,8b (population count per byte)
dup d,s,h,b (vec elem to scalar)
dup 2d,4s,2s,8h,4h,16b,8b (vec elem to vector)
dup 2d,4s,2s,8h,4h,16b,8b (general reg to vector)
eor 16b,8b (vector)
ext 16b,8b,#imm4 (concat 2 vectors, then slice)
ins d[]_d[],s[]_s[],h[]_h[],b[]_b[]
ins d[]_x, s[]_w, h[]_w, b[]_w
mla 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
mla 4s,2s,8h,4h,16b,8b
mls 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
mls 4s,2s,8h,4h,16b,8b
movi 16b,8b #imm8, LSL #0
movi 8h,4h #imm8, LSL #0 or 8
movi 4s,2s #imm8, LSL #0, 8, 16, 24
movi 4s,2s #imm8, MSL #8 or 16
movi d, #imm64
movi 2d, #imm64
mul 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
mul 4s,2s,8h,4h,16b,8b
mvni 8h,4h #imm8, LSL #0 or 8
mvni 4s,2s #imm8, LSL #0, 8, 16, 24
mvni 4s,2s #imm8, MSL #8 or 16
not 16b,8b
orn 16b,8b
orr 16b,8b
pmul 16b,8b
pmull{2} 8h_8b_8b,8h_16b_16b,1q_1d_1d,1d_2d_2d
rbit 16b,8b
rev16 16b,8b
rev32 16b,8b,8h,4h
rev64 16b,8b,8h,4h,4s,2s
saba 16b,8b,8h,4h,4s,2s
uaba 16b,8b,8h,4h,4s,2s
sabal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
uabal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
sabd 16b,8b,8h,4h,4s,2s
uabd 16b,8b,8h,4h,4s,2s
sabdl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
uabdl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
sadalp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
uadalp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
saddl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
uaddl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
ssubl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
usubl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
saddlp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
uaddlp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
saddlv h_16b/8b, s_8h/4h, d_4s
uaddlv h_16b/8b, s_8h/4h, d_4s
saddw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
uaddw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
ssubw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
usubw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
shadd 16b,8b,8h,4h,4s,2s
uhadd 16b,8b,8h,4h,4s,2s
shsub 16b,8b,8h,4h,4s,2s
uhsub 16b,8b,8h,4h,4s,2s
shl d_#imm
shl 16b,8b,8h,4h,4s,2s,2d _#imm
shll{2} 8h_8b/16b_#8, 4s_4h/8h_#16, 2d_2s/4s_#32
shrn{2} 2s/4s_2d, 8h/4h_4s, 2s/4s_2d, #imm in 1 .. elem_bits
rshrn{2} 2s/4s_2d, 8h/4h_4s, 2s/4s_2d, #imm in 1 .. elem_bits
sli d_#imm
sri d_#imm
sli 2d,4s,2s,8h,4h,16b,8b _#imm
sri 2d,4s,2s,8h,4h,16b,8b _#imm
smax 4s,2s,8h,4h,16b,8b
umax 4s,2s,8h,4h,16b,8b
smin 4s,2s,8h,4h,16b,8b
umin 4s,2s,8h,4h,16b,8b
smaxp 4s,2s,8h,4h,16b,8b
umaxp 4s,2s,8h,4h,16b,8b
sminp 4s,2s,8h,4h,16b,8b
uminp 4s,2s,8h,4h,16b,8b
smaxv s_4s,h_8h,h_4h,b_16b,b_8b
umaxv s_4s,h_8h,h_4h,b_16b,b_8b
sminv s_4s,h_8h,h_4h,b_16b,b_8b
uminv s_4s,h_8h,h_4h,b_16b,b_8b
smlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
umlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
smlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
umlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
smull{2} 2d_2s/4s_s[]. 4s_4h/8h_h[]
umull{2} 2d_2s/4s_s[]. 4s_4h/8h_h[]
smlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
umlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
smlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
umlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
smull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
umull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
smov w_b[], w_h[], x_b[], x_h[], x_s[]
umov w_b[], w_h[], x_b[], x_h[], x_s[]
sqabs d,s,h,b
sqneg d,s,h,b
sqabs 2d,4s,2s,8h,4h,16b,8b
sqneg 2d,4s,2s,8h,4h,16b,8b
sqadd d,s,h,b
uqadd d,s,h,b
sqsub d,s,h,b
uqsub d,s,h,b
sqadd 2d,4s,2s,8h,4h,16b,8b
uqadd 2d,4s,2s,8h,4h,16b,8b
sqsub 2d,4s,2s,8h,4h,16b,8b
uqsub 2d,4s,2s,8h,4h,16b,8b
sqdmlal d_s_s[], s_h_h[]
sqdmlsl d_s_s[], s_h_h[]
sqdmull d_s_s[], s_h_h[]
sqdmlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
sqdmlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
sqdmull{2} 2d_2s/4s_s[], 4s_4h/2h_h[]
sqdmlal d_s_s, s_h_h
sqdmlsl d_s_s, s_h_h
sqdmull d_s_s, s_h_h
sqdmlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
sqdmlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
sqdmull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
sqdmulh s_s_s[], h_h_h[]
sqrdmulh s_s_s[], h_h_h[]
sqdmulh 4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
sqrdmulh 4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
sqdmulh h,s
sqrdmulh h,s
sqdmulh 4s,2s,8h,4h
sqrdmulh 4s,2s,8h,4h
sqshl d,s,h,b
uqshl d,s,h,b
sqrshl d,s,h,b
uqrshl d,s,h,b
sqshl 2d,4s,2s,8h,4h,16b,8b
uqshl 2d,4s,2s,8h,4h,16b,8b
sqrshl 2d,4s,2s,8h,4h,16b,8b
uqrshl 2d,4s,2s,8h,4h,16b,8b
sqrshrn s_d, h_s, b_h #imm
uqrshrn s_d, h_s, b_h #imm
sqshrn s_d, h_s, b_h #imm
uqshrn s_d, h_s, b_h #imm
sqrshrun s_d, h_s, b_h #imm
sqshrun s_d, h_s, b_h #imm
sqrshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
uqrshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
sqshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
uqshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
sqrshrun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
sqshrun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
sqshl d,s,h,b _#imm
uqshl d,s,h,b _#imm
sqshlu d,s,h,b _#imm
sqshl 2d,4s,2s,8h,4h,16b,8b _#imm
uqshl 2d,4s,2s,8h,4h,16b,8b _#imm
sqshlu 2d,4s,2s,8h,4h,16b,8b _#imm
sqxtn s_d,h_s,b_h
uqxtn s_d,h_s,b_h
sqxtun s_d,h_s,b_h
sqxtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
uqxtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
sqxtun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
srhadd 4s,2s,8h,4h,16b,8b
urhadd 4s,2s,8h,4h,16b,8b
sshl (reg) d
ushl (reg) d
sshr (imm) d
ushr (imm) d
ssra (imm) d
usra (imm) d
srshl (reg) d
urshl (reg) d
srshr (imm) d
urshr (imm) d
srsra (imm) d
ursra (imm) d
sshl 2d,4s,2s,8h,4h,16b,8b
ushl 2d,4s,2s,8h,4h,16b,8b
sshr 2d,4s,2s,8h,4h,16b,8b
ushr 2d,4s,2s,8h,4h,16b,8b
ssra 2d,4s,2s,8h,4h,16b,8b
usra 2d,4s,2s,8h,4h,16b,8b
srshl 2d,4s,2s,8h,4h,16b,8b
urshl 2d,4s,2s,8h,4h,16b,8b
srshr 2d,4s,2s,8h,4h,16b,8b
urshr 2d,4s,2s,8h,4h,16b,8b
srsra 2d,4s,2s,8h,4h,16b,8b
ursra 2d,4s,2s,8h,4h,16b,8b
sshll{2} (imm) 2d_2s/4s 4s_4h/8h, 8h_8b/16b
ushll{2} (imm) 2d_2s/4s 4s_4h/8h, 8h_8b/16b
suqadd d,s,h,b
suqadd 2d,4s,2s,8h,4h,16b,8b
tbl 8b_{16b}_8b, 16b_{16b}_16b
tbl 8b_{16b,16b}_8b, 16b_{16b,16b}_16b
tbl 8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
tbl 8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
tbx 8b_{16b}_8b, 16b_{16b}_16b
tbx 8b_{16b,16b}_8b, 16b_{16b,16b}_16b
tbx 8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
tbx 8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
trn1 2d,4s,2s,8h,4h,16b,8b
trn2 2d,4s,2s,8h,4h,16b,8b
urecpe 4s,2s
ursqrte 4s,2s
usqadd d,s,h,b
usqadd 2d,4s,2s,8h,4h,16b,8b
uzp1 2d,4s,2s,8h,4h,16b,8b
uzp2 2d,4s,2s,8h,4h,16b,8b
xtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
zip1 2d,4s,2s,8h,4h,16b,8b
zip2 2d,4s,2s,8h,4h,16b,8b
======================== MEM ========================
ld1 (multiple 1-element structures to 1/2/3/4 regs)
ld1 (single 1-element structure to one lane of 1 reg)
ld1r (single 1-element structure and rep to all lanes of 1 reg)
ld2 (multiple 2-element structures to 2 regs)
ld2 (single 2-element structure to one lane of 2 regs)
ld2r (single 2-element structure and rep to all lanes of 2 regs)
ld3 (multiple 3-element structures to 3 regs)
ld3 (single 3-element structure to one lane of 3 regs)
ld3r (single 3-element structure and rep to all lanes of 3 regs)
ld4 (multiple 4-element structures to 4 regs)
ld4 (single 4-element structure to one lane of 4 regs)
ld4r (single 4-element structure and rep to all lanes of 4 regs)
ldnp q_q_addr,d_d_addr,s_s_addr (load pair w/ non-temporal hint)
addr = reg + uimm7 * reg_size
ldp q_q_addr,d_d_addr,s_s_addr (load pair)
addr = [Xn|SP],#imm or [Xn|SP,#imm]! or [Xn|SP,#imm]
ldr q,d,s,h,b from addr
addr = [Xn|SP],#imm or [Xn|SP,#imm]! or [Xn|SP,#imm]
ldr q,d,s from pc+#imm19
ldr q,d,s,h,b from addr
addr = [Xn|SP, R <extend> <shift]
ldur q,d,s,h,b from addr
addr = [Xn|SP,#imm] (unscaled offset)
st1 (multiple 1-element structures from 1/2/3/4 regs)
st1 (single 1-element structure for 1 lane of 1 reg)
st2 (multiple 2-element structures from 2 regs)
st2 (single 2-element structure from 1 lane of 2 regs)
st3 (multiple 3-element structures from 3 regs)
st3 (single 3-element structure from 1 lane of 3 regs)
st4 (multiple 4-element structures from 4 regs)
st4 (single 4-element structure from one lane of 4 regs)
stnp q_q_addr, d_d_addr, s_s_addr
addr = [Xn|SP, #imm]
stp q_q_addr, d_d_addr, s_s_addr
addr = [Xn|SP], #imm or [Xn|SP, #imm]! or [Xn|SP, #imm]
str q,d,s,h,b_addr
addr = [Xn|SP], #simm or [Xn|SP, #simm]! or [Xn|SP, #pimm]
str q,d,s,h,b_addr
addr = [Xn|SP, R <extend> <shift]
stur q,d,s,h,b_addr
addr = [Xn|SP,#imm] (unscaled offset)
======================== CRYPTO ========================
aesd 16b (aes single round decryption)
aese 16b (aes single round encryption)
aesimc 16b (aes inverse mix columns)
aesmc 16b (aes mix columns)
sha1c q_s_4s
sha1h s_s
sha1m q_s_4s
sha1p q_s_4s
sha1su0 4s_4s_4s
sha1su1 4s_4s
sha256h2 q_q_4s
sha256h q_q_4s
sha256su0 4s_4s
sha256su1 4s_4s_4s
*/