Big reorganisation:
* add lane type descriptors, to be used by the random data
generators -- as-yet unused
* move existing tests into new groupings, and rename some of
them to be more consistent with the new notation.
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13936 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/none/tests/arm64/fp_and_simd.c b/none/tests/arm64/fp_and_simd.c
index a725518..8dd7c5e 100644
--- a/none/tests/arm64/fp_and_simd.c
+++ b/none/tests/arm64/fp_and_simd.c
@@ -17,8 +17,11 @@
#define True ((Bool)1)
-#define ITERS 10
+#define ITERS 1
+typedef
+ enum { TySF=1234, TyDF, TyB, TyH, TyS, TyD, TyNONE }
+ LaneTy;
union _V128 {
UChar u8[16];
@@ -37,7 +40,7 @@
return (seed >> 17) & 0xFF;
}
-static ULong randULong ( void )
+static ULong randULong ( LaneTy ty )
{
Int i;
ULong r = 0;
@@ -50,7 +53,7 @@
/* Generates a random V128. Ensures that that it contains normalised
FP numbers when viewed as either F32x4 or F64x2, so that it is
reasonable to use in FP test cases. */
-static void randV128 ( V128* v )
+static void randV128 ( /*OUT*/V128* v, LaneTy ty )
{
static UInt nCalls = 0, nIters = 0;
Int i;
@@ -86,6 +89,188 @@
}
+/* ---------------------------------------------------------------- */
+/* -- Test functions -- */
+/* ---------------------------------------------------------------- */
+
+/* Note this also sets the destination register to a known value (0x55..55)
+ since it can sometimes be an input to the instruction too. */
+#define GEN_UNARY_TEST(INSN,SUFFIXD,SUFFIXN) \
+ __attribute__((noinline)) \
+ static void test_##INSN##_##SUFFIXD##_##SUFFIXN ( LaneTy ty ) { \
+ Int i; \
+ for (i = 0; i < ITERS; i++) { \
+ V128 block[2]; \
+ memset(block, 0x55, sizeof(block)); \
+ randV128(&block[0], ty); \
+ randV128(&block[1], ty); \
+ __asm__ __volatile__( \
+ "ldr q7, [%0, #0] ; " \
+ "ldr q8, [%0, #16] ; " \
+ #INSN " v8." #SUFFIXD ", v7." #SUFFIXN " ; " \
+ "str q8, [%0, #16] " \
+ : : "r"(&block[0]) : "memory", "v7", "v8" \
+ ); \
+ printf(#INSN " v8." #SUFFIXD ", v7." #SUFFIXN); \
+ showV128(&block[0]); printf(" "); \
+ showV128(&block[1]); printf("\n"); \
+ } \
+ }
+
+
+/* Note this also sets the destination register to a known value (0x55..55)
+ since it can sometimes be an input to the instruction too. */
+#define GEN_BINARY_TEST(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \
+ __attribute__((noinline)) \
+ static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##SUFFIXM ( LaneTy ty ) { \
+ Int i; \
+ for (i = 0; i < ITERS; i++) { \
+ V128 block[3]; \
+ memset(block, 0x55, sizeof(block)); \
+ randV128(&block[0], ty); \
+ randV128(&block[1], ty); \
+ randV128(&block[2], ty); \
+ __asm__ __volatile__( \
+ "ldr q7, [%0, #0] ; " \
+ "ldr q8, [%0, #16] ; " \
+ "ldr q9, [%0, #32] ; " \
+ #INSN " v9." #SUFFIXD ", v7." #SUFFIXN ", v8." #SUFFIXM " ; " \
+ "str q9, [%0, #32] " \
+ : : "r"(&block[0]) : "memory", "v7", "v8", "v9" \
+ ); \
+ printf(#INSN " v9." #SUFFIXD \
+ ", v7." #SUFFIXN ", v8." #SUFFIXM " "); \
+ showV128(&block[0]); printf(" "); \
+ showV128(&block[1]); printf(" "); \
+ showV128(&block[2]); printf("\n"); \
+ } \
+ }
+
+
+/* Note this also sets the destination register to a known value (0x55..55)
+ since it can sometimes be an input to the instruction too. */
+#define GEN_SHIFT_TEST(INSN,SUFFIXD,SUFFIXN,AMOUNT) \
+ __attribute__((noinline)) \
+ static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##AMOUNT ( LaneTy ty ) { \
+ Int i; \
+ for (i = 0; i < ITERS; i++) { \
+ V128 block[2]; \
+ memset(block, 0x55, sizeof(block)); \
+ randV128(&block[0], ty); \
+ randV128(&block[1], ty); \
+ __asm__ __volatile__( \
+ "ldr q7, [%0, #0] ; " \
+ "ldr q8, [%0, #16] ; " \
+ #INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " ; " \
+ "str q8, [%0, #16] " \
+ : : "r"(&block[0]) : "memory", "v7", "v8" \
+ ); \
+ printf(#INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " "); \
+ showV128(&block[0]); printf(" "); \
+ showV128(&block[1]); printf("\n"); \
+ } \
+ }
+
+
+/* Generate a test that involves one integer reg and one vector reg,
+ with no bias as towards which is input or output. */
+#define GEN_ONEINT_ONEVEC_TEST(TESTNAME,INSN,INTREGNO,VECREGNO) \
+ __attribute__((noinline)) \
+ static void test_##TESTNAME ( LaneTy ty ) { \
+ Int i; \
+ for (i = 0; i < ITERS; i++) { \
+ V128 block[4]; \
+ memset(block, 0x55, sizeof(block)); \
+ randV128(&block[0], ty); \
+ randV128(&block[1], ty); \
+ randV128(&block[2], ty); \
+ randV128(&block[3], ty); \
+ __asm__ __volatile__( \
+ "ldr q"#VECREGNO", [%0, #0] ; " \
+ "ldr x"#INTREGNO", [%0, #16] ; " \
+ INSN " ; " \
+ "str q"#VECREGNO", [%0, #32] ; " \
+ "str x"#INTREGNO", [%0, #48] ; " \
+ : : "r"(&block[0]) : "memory", "v"#VECREGNO, "x"#INTREGNO \
+ ); \
+ printf(INSN " "); \
+ showV128(&block[0]); printf(" "); \
+ showV128(&block[1]); printf(" "); \
+ showV128(&block[2]); printf(" "); \
+ showV128(&block[3]); printf("\n"); \
+ } \
+ }
+
+
+/* Generate a test that involves two vector regs,
+ with no bias as towards which is input or output. */
+#define GEN_TWOVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO) \
+ __attribute__((noinline)) \
+ static void test_##TESTNAME ( LaneTy ty ) { \
+ Int i; \
+ for (i = 0; i < ITERS; i++) { \
+ V128 block[4]; \
+ memset(block, 0x55, sizeof(block)); \
+ randV128(&block[0], ty); \
+ randV128(&block[1], ty); \
+ randV128(&block[2], ty); \
+ randV128(&block[3], ty); \
+ __asm__ __volatile__( \
+ "ldr q"#VECREG1NO", [%0, #0] ; " \
+ "ldr q"#VECREG2NO", [%0, #16] ; " \
+ INSN " ; " \
+ "str q"#VECREG1NO", [%0, #32] ; " \
+ "str q"#VECREG2NO", [%0, #48] ; " \
+ : : "r"(&block[0]) : "memory", "v"#VECREG1NO, "v"#VECREG2NO \
+ ); \
+ printf(INSN " "); \
+ showV128(&block[0]); printf(" "); \
+ showV128(&block[1]); printf(" "); \
+ showV128(&block[2]); printf(" "); \
+ showV128(&block[3]); printf("\n"); \
+ } \
+ }
+
+
+/* Generate a test that involves three vector regs,
+ with no bias as towards which is input or output. It's also OK
+ to use v16, v17, v18 as scratch. */
+#define GEN_THREEVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO,VECREG3NO) \
+ __attribute__((noinline)) \
+ static void test_##TESTNAME ( LaneTy ty ) { \
+ Int i; \
+ for (i = 0; i < ITERS; i++) { \
+ V128 block[6]; \
+ memset(block, 0x55, sizeof(block)); \
+ randV128(&block[0], ty); \
+ randV128(&block[1], ty); \
+ randV128(&block[2], ty); \
+ randV128(&block[3], ty); \
+ randV128(&block[4], ty); \
+ randV128(&block[5], ty); \
+ __asm__ __volatile__( \
+ "ldr q"#VECREG1NO", [%0, #0] ; " \
+ "ldr q"#VECREG2NO", [%0, #16] ; " \
+ "ldr q"#VECREG3NO", [%0, #32] ; " \
+ INSN " ; " \
+ "str q"#VECREG1NO", [%0, #48] ; " \
+ "str q"#VECREG2NO", [%0, #64] ; " \
+ "str q"#VECREG3NO", [%0, #80] ; " \
+ : : "r"(&block[0]) \
+ : "memory", "v"#VECREG1NO, "v"#VECREG2NO, "v"#VECREG3NO, \
+ "v16", "v17", "v18" \
+ ); \
+ printf(INSN " "); \
+ showV128(&block[0]); printf(" "); \
+ showV128(&block[1]); printf(" "); \
+ showV128(&block[2]); printf(" "); \
+ showV128(&block[3]); printf(" "); \
+ showV128(&block[4]); printf(" "); \
+ showV128(&block[5]); printf("\n"); \
+ } \
+ }
+
+
void test_UMINV ( void )
{
int i;
@@ -95,8 +280,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyS);
+ randV128(&block[1], TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"uminv s8, v7.4s ; "
@@ -112,8 +297,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyH);
+ randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"uminv h8, v7.8h ; "
@@ -129,8 +314,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyH);
+ randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"uminv h8, v7.4h ; "
@@ -146,8 +331,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyB);
+ randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"uminv b8, v7.16b ; "
@@ -163,8 +348,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyB);
+ randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"uminv b8, v7.8b ; "
@@ -188,8 +373,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyS);
+ randV128(&block[1], TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"umaxv s8, v7.4s ; "
@@ -205,8 +390,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyH);
+ randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"umaxv h8, v7.8h ; "
@@ -222,8 +407,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyH);
+ randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"umaxv h8, v7.4h ; "
@@ -239,8 +424,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyB);
+ randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"umaxv b8, v7.16b ; "
@@ -256,8 +441,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyB);
+ randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"umaxv b8, v7.8b ; "
@@ -279,7 +464,7 @@
/* -- D[0..1] -- */
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyD);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -292,7 +477,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyD);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -307,7 +492,7 @@
/* -- S[0..3] -- */
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -320,7 +505,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -333,7 +518,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -346,7 +531,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -361,7 +546,7 @@
/* -- H[0..7] -- */
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -374,7 +559,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -387,7 +572,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -400,7 +585,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -413,7 +598,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -426,7 +611,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -439,7 +624,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -452,7 +637,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -467,7 +652,7 @@
/* -- B[0,15] -- */
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -480,7 +665,7 @@
showV128(&block[2]); printf("\n");
memset(&block, 0x55, sizeof(block));
- block[1].u64[0] = randULong();
+ block[1].u64[0] = randULong(TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"ldr x19, [%0, #16] ; "
@@ -504,8 +689,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyS);
+ randV128(&block[1], TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"sminv s8, v7.4s ; "
@@ -521,8 +706,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyH);
+ randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"sminv h8, v7.8h ; "
@@ -538,8 +723,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyH);
+ randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"sminv h8, v7.4h ; "
@@ -555,8 +740,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyB);
+ randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"sminv b8, v7.16b ; "
@@ -572,8 +757,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyB);
+ randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"sminv b8, v7.8b ; "
@@ -597,8 +782,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyS);
+ randV128(&block[1], TyS);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"smaxv s8, v7.4s ; "
@@ -614,8 +799,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyH);
+ randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"smaxv h8, v7.8h ; "
@@ -631,8 +816,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyH);
+ randV128(&block[1], TyH);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"smaxv h8, v7.4h ; "
@@ -648,8 +833,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyB);
+ randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"smaxv b8, v7.16b ; "
@@ -665,8 +850,8 @@
for (i = 0; i < 10; i++) {
memset(&block, 0x55, sizeof(block));
- randV128(&block[0]);
- randV128(&block[1]);
+ randV128(&block[0], TyB);
+ randV128(&block[1], TyB);
__asm__ __volatile__(
"ldr q7, [%0, #0] ; "
"smaxv b8, v7.8b ; "
@@ -680,215 +865,143 @@
}
-/* Note this also sets the destination register to a known value (0x55..55)
- since it can sometimes be an input to the instruction too. */
-#define GEN_BINARY_TEST(INSN,SUFFIX) \
- __attribute__((noinline)) \
- static void test_##INSN##_##SUFFIX ( void ) { \
- Int i; \
- for (i = 0; i < ITERS; i++) { \
- V128 block[3]; \
- memset(block, 0x55, sizeof(block)); \
- randV128(&block[0]); \
- randV128(&block[1]); \
- randV128(&block[2]); \
- __asm__ __volatile__( \
- "ldr q7, [%0, #0] ; " \
- "ldr q8, [%0, #16] ; " \
- "ldr q9, [%0, #32] ; " \
- #INSN " v9." #SUFFIX ", v7." #SUFFIX ", v8." #SUFFIX " ; " \
- "str q9, [%0, #32] " \
- : : "r"(&block[0]) : "memory", "v7", "v8", "v9" \
- ); \
- printf(#INSN " v9." #SUFFIX ", v7." #SUFFIX ", v8." #SUFFIX " "); \
- showV128(&block[0]); printf(" "); \
- showV128(&block[1]); printf(" "); \
- showV128(&block[2]); printf("\n"); \
- } \
- }
+GEN_BINARY_TEST(umax, 4s, 4s, 4s)
+GEN_BINARY_TEST(umax, 2s, 2s, 2s)
+GEN_BINARY_TEST(umax, 8h, 8h, 8h)
+GEN_BINARY_TEST(umax, 4h, 4h, 4h)
+GEN_BINARY_TEST(umax, 16b, 16b, 16b)
+GEN_BINARY_TEST(umax, 8b, 8b, 8b)
-/* Note this also sets the destination register to a known value (0x55..55)
- since it can sometimes be an input to the instruction too. */
-#define GEN_SHIFT_TEST(INSN,SUFFIXD,SUFFIXN,AMOUNT) \
- __attribute__((noinline)) \
- static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##AMOUNT ( void ) { \
- Int i; \
- for (i = 0; i < ITERS; i++) { \
- V128 block[2]; \
- memset(block, 0x55, sizeof(block)); \
- randV128(&block[0]); \
- randV128(&block[1]); \
- __asm__ __volatile__( \
- "ldr q7, [%0, #0] ; " \
- "ldr q8, [%0, #16] ; " \
- #INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " ; " \
- "str q8, [%0, #16] " \
- : : "r"(&block[0]) : "memory", "v7", "v8" \
- ); \
- printf(#INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " "); \
- showV128(&block[0]); printf(" "); \
- showV128(&block[1]); printf("\n"); \
- } \
- }
+GEN_BINARY_TEST(umin, 4s, 4s, 4s)
+GEN_BINARY_TEST(umin, 2s, 2s, 2s)
+GEN_BINARY_TEST(umin, 8h, 8h, 8h)
+GEN_BINARY_TEST(umin, 4h, 4h, 4h)
+GEN_BINARY_TEST(umin, 16b, 16b, 16b)
+GEN_BINARY_TEST(umin, 8b, 8b, 8b)
-/* Note this also sets the destination register to a known value (0x55..55)
- since it can sometimes be an input to the instruction too. */
-#define GEN_UNARY_TEST(INSN,SUFFIXD,SUFFIXN) \
- __attribute__((noinline)) \
- static void test_##INSN##_##SUFFIXD##_##SUFFIXN ( void ) { \
- Int i; \
- for (i = 0; i < ITERS; i++) { \
- V128 block[2]; \
- memset(block, 0x55, sizeof(block)); \
- randV128(&block[0]); \
- randV128(&block[1]); \
- __asm__ __volatile__( \
- "ldr q7, [%0, #0] ; " \
- "ldr q8, [%0, #16] ; " \
- #INSN " v8." #SUFFIXD ", v7." #SUFFIXN " ; " \
- "str q8, [%0, #16] " \
- : : "r"(&block[0]) : "memory", "v7", "v8" \
- ); \
- printf(#INSN " v8." #SUFFIXD ", v7." #SUFFIXN); \
- showV128(&block[0]); printf(" "); \
- showV128(&block[1]); printf("\n"); \
- } \
- }
+GEN_BINARY_TEST(smax, 4s, 4s, 4s)
+GEN_BINARY_TEST(smax, 2s, 2s, 2s)
+GEN_BINARY_TEST(smax, 8h, 8h, 8h)
+GEN_BINARY_TEST(smax, 4h, 4h, 4h)
+GEN_BINARY_TEST(smax, 16b, 16b, 16b)
+GEN_BINARY_TEST(smax, 8b, 8b, 8b)
+GEN_BINARY_TEST(smin, 4s, 4s, 4s)
+GEN_BINARY_TEST(smin, 2s, 2s, 2s)
+GEN_BINARY_TEST(smin, 8h, 8h, 8h)
+GEN_BINARY_TEST(smin, 4h, 4h, 4h)
+GEN_BINARY_TEST(smin, 16b, 16b, 16b)
+GEN_BINARY_TEST(smin, 8b, 8b, 8b)
-GEN_BINARY_TEST(umax, 4s)
-GEN_BINARY_TEST(umax, 8h)
-GEN_BINARY_TEST(umax, 4h)
-GEN_BINARY_TEST(umax, 16b)
-GEN_BINARY_TEST(umax, 8b)
+GEN_BINARY_TEST(add, 2d, 2d, 2d)
+GEN_BINARY_TEST(add, 4s, 4s, 4s)
+GEN_BINARY_TEST(add, 2s, 2s, 2s)
+GEN_BINARY_TEST(add, 8h, 8h, 8h)
+GEN_BINARY_TEST(add, 4h, 4h, 4h)
+GEN_BINARY_TEST(add, 16b, 16b, 16b)
+GEN_BINARY_TEST(add, 8b, 8b, 8b)
-GEN_BINARY_TEST(umin, 4s)
-GEN_BINARY_TEST(umin, 8h)
-GEN_BINARY_TEST(umin, 4h)
-GEN_BINARY_TEST(umin, 16b)
-GEN_BINARY_TEST(umin, 8b)
+GEN_BINARY_TEST(sub, 2d, 2d, 2d)
+GEN_BINARY_TEST(sub, 4s, 4s, 4s)
+GEN_BINARY_TEST(sub, 2s, 2s, 2s)
+GEN_BINARY_TEST(sub, 8h, 8h, 8h)
+GEN_BINARY_TEST(sub, 4h, 4h, 4h)
+GEN_BINARY_TEST(sub, 16b, 16b, 16b)
+GEN_BINARY_TEST(sub, 8b, 8b, 8b)
-GEN_BINARY_TEST(smax, 4s)
-GEN_BINARY_TEST(smax, 8h)
-GEN_BINARY_TEST(smax, 4h)
-GEN_BINARY_TEST(smax, 16b)
-GEN_BINARY_TEST(smax, 8b)
+GEN_BINARY_TEST(mul, 4s, 4s, 4s)
+GEN_BINARY_TEST(mul, 2s, 2s, 2s)
+GEN_BINARY_TEST(mul, 8h, 8h, 8h)
+GEN_BINARY_TEST(mul, 4h, 4h, 4h)
+GEN_BINARY_TEST(mul, 16b, 16b, 16b)
+GEN_BINARY_TEST(mul, 8b, 8b, 8b)
-GEN_BINARY_TEST(smin, 4s)
-GEN_BINARY_TEST(smin, 8h)
-GEN_BINARY_TEST(smin, 4h)
-GEN_BINARY_TEST(smin, 16b)
-GEN_BINARY_TEST(smin, 8b)
+GEN_BINARY_TEST(mla, 4s, 4s, 4s)
+GEN_BINARY_TEST(mla, 2s, 2s, 2s)
+GEN_BINARY_TEST(mla, 8h, 8h, 8h)
+GEN_BINARY_TEST(mla, 4h, 4h, 4h)
+GEN_BINARY_TEST(mla, 16b, 16b, 16b)
+GEN_BINARY_TEST(mla, 8b, 8b, 8b)
-GEN_BINARY_TEST(add, 2d)
-GEN_BINARY_TEST(add, 4s)
-GEN_BINARY_TEST(add, 2s)
-GEN_BINARY_TEST(add, 8h)
-GEN_BINARY_TEST(add, 4h)
-GEN_BINARY_TEST(add, 16b)
-GEN_BINARY_TEST(add, 8b)
+GEN_BINARY_TEST(mls, 4s, 4s, 4s)
+GEN_BINARY_TEST(mls, 2s, 2s, 2s)
+GEN_BINARY_TEST(mls, 8h, 8h, 8h)
+GEN_BINARY_TEST(mls, 4h, 4h, 4h)
+GEN_BINARY_TEST(mls, 16b, 16b, 16b)
+GEN_BINARY_TEST(mls, 8b, 8b, 8b)
-GEN_BINARY_TEST(sub, 2d)
-GEN_BINARY_TEST(sub, 4s)
-GEN_BINARY_TEST(sub, 2s)
-GEN_BINARY_TEST(sub, 8h)
-GEN_BINARY_TEST(sub, 4h)
-GEN_BINARY_TEST(sub, 16b)
-GEN_BINARY_TEST(sub, 8b)
+GEN_BINARY_TEST(and, 16b, 16b, 16b)
+GEN_BINARY_TEST(and, 8b, 8b, 8b)
-GEN_BINARY_TEST(mul, 4s)
-GEN_BINARY_TEST(mul, 2s)
-GEN_BINARY_TEST(mul, 8h)
-GEN_BINARY_TEST(mul, 4h)
-GEN_BINARY_TEST(mul, 16b)
-GEN_BINARY_TEST(mul, 8b)
+GEN_BINARY_TEST(bic, 16b, 16b, 16b)
+GEN_BINARY_TEST(bic, 8b, 8b, 8b)
-GEN_BINARY_TEST(mla, 4s)
-GEN_BINARY_TEST(mla, 2s)
-GEN_BINARY_TEST(mla, 8h)
-GEN_BINARY_TEST(mla, 4h)
-GEN_BINARY_TEST(mla, 16b)
-GEN_BINARY_TEST(mla, 8b)
+GEN_BINARY_TEST(orr, 16b, 16b, 16b)
+GEN_BINARY_TEST(orr, 8b, 8b, 8b)
-GEN_BINARY_TEST(mls, 4s)
-GEN_BINARY_TEST(mls, 2s)
-GEN_BINARY_TEST(mls, 8h)
-GEN_BINARY_TEST(mls, 4h)
-GEN_BINARY_TEST(mls, 16b)
-GEN_BINARY_TEST(mls, 8b)
+GEN_BINARY_TEST(orn, 16b, 16b, 16b)
+GEN_BINARY_TEST(orn, 8b, 8b, 8b)
-GEN_BINARY_TEST(and, 16b)
-GEN_BINARY_TEST(and, 8b)
+GEN_BINARY_TEST(eor, 16b, 16b, 16b)
+GEN_BINARY_TEST(eor, 8b, 8b, 8b)
-GEN_BINARY_TEST(bic, 16b)
-GEN_BINARY_TEST(bic, 8b)
+GEN_BINARY_TEST(bsl, 16b, 16b, 16b)
+GEN_BINARY_TEST(bsl, 8b, 8b, 8b)
-GEN_BINARY_TEST(orr, 16b)
-GEN_BINARY_TEST(orr, 8b)
+GEN_BINARY_TEST(bit, 16b, 16b, 16b)
+GEN_BINARY_TEST(bit, 8b, 8b, 8b)
-GEN_BINARY_TEST(orn, 16b)
-GEN_BINARY_TEST(orn, 8b)
+GEN_BINARY_TEST(bif, 16b, 16b, 16b)
+GEN_BINARY_TEST(bif, 8b, 8b, 8b)
-GEN_BINARY_TEST(eor, 16b)
-GEN_BINARY_TEST(eor, 8b)
+GEN_BINARY_TEST(cmeq, 2d, 2d, 2d)
+GEN_BINARY_TEST(cmeq, 4s, 4s, 4s)
+GEN_BINARY_TEST(cmeq, 2s, 2s, 2s)
+GEN_BINARY_TEST(cmeq, 8h, 8h, 8h)
+GEN_BINARY_TEST(cmeq, 4h, 4h, 4h)
+GEN_BINARY_TEST(cmeq, 16b, 16b, 16b)
+GEN_BINARY_TEST(cmeq, 8b, 8b, 8b)
-GEN_BINARY_TEST(bsl, 16b)
-GEN_BINARY_TEST(bsl, 8b)
+GEN_BINARY_TEST(cmtst, 2d, 2d, 2d)
+GEN_BINARY_TEST(cmtst, 4s, 4s, 4s)
+GEN_BINARY_TEST(cmtst, 2s, 2s, 2s)
+GEN_BINARY_TEST(cmtst, 8h, 8h, 8h)
+GEN_BINARY_TEST(cmtst, 4h, 4h, 4h)
+GEN_BINARY_TEST(cmtst, 16b, 16b, 16b)
+GEN_BINARY_TEST(cmtst, 8b, 8b, 8b)
-GEN_BINARY_TEST(bit, 16b)
-GEN_BINARY_TEST(bit, 8b)
+GEN_BINARY_TEST(cmhi, 2d, 2d, 2d)
+GEN_BINARY_TEST(cmhi, 4s, 4s, 4s)
+GEN_BINARY_TEST(cmhi, 2s, 2s, 2s)
+GEN_BINARY_TEST(cmhi, 8h, 8h, 8h)
+GEN_BINARY_TEST(cmhi, 4h, 4h, 4h)
+GEN_BINARY_TEST(cmhi, 16b, 16b, 16b)
+GEN_BINARY_TEST(cmhi, 8b, 8b, 8b)
-GEN_BINARY_TEST(bif, 16b)
-GEN_BINARY_TEST(bif, 8b)
+GEN_BINARY_TEST(cmgt, 2d, 2d, 2d)
+GEN_BINARY_TEST(cmgt, 4s, 4s, 4s)
+GEN_BINARY_TEST(cmgt, 2s, 2s, 2s)
+GEN_BINARY_TEST(cmgt, 8h, 8h, 8h)
+GEN_BINARY_TEST(cmgt, 4h, 4h, 4h)
+GEN_BINARY_TEST(cmgt, 16b, 16b, 16b)
+GEN_BINARY_TEST(cmgt, 8b, 8b, 8b)
-GEN_BINARY_TEST(cmeq, 2d)
-GEN_BINARY_TEST(cmeq, 4s)
-GEN_BINARY_TEST(cmeq, 2s)
-GEN_BINARY_TEST(cmeq, 8h)
-GEN_BINARY_TEST(cmeq, 4h)
-GEN_BINARY_TEST(cmeq, 16b)
-GEN_BINARY_TEST(cmeq, 8b)
+GEN_BINARY_TEST(cmhs, 2d, 2d, 2d)
+GEN_BINARY_TEST(cmhs, 4s, 4s, 4s)
+GEN_BINARY_TEST(cmhs, 2s, 2s, 2s)
+GEN_BINARY_TEST(cmhs, 8h, 8h, 8h)
+GEN_BINARY_TEST(cmhs, 4h, 4h, 4h)
+GEN_BINARY_TEST(cmhs, 16b, 16b, 16b)
+GEN_BINARY_TEST(cmhs, 8b, 8b, 8b)
-GEN_BINARY_TEST(cmtst, 2d)
-GEN_BINARY_TEST(cmtst, 4s)
-GEN_BINARY_TEST(cmtst, 2s)
-GEN_BINARY_TEST(cmtst, 8h)
-GEN_BINARY_TEST(cmtst, 4h)
-GEN_BINARY_TEST(cmtst, 16b)
-GEN_BINARY_TEST(cmtst, 8b)
-
-GEN_BINARY_TEST(cmhi, 2d)
-GEN_BINARY_TEST(cmhi, 4s)
-GEN_BINARY_TEST(cmhi, 2s)
-GEN_BINARY_TEST(cmhi, 8h)
-GEN_BINARY_TEST(cmhi, 4h)
-GEN_BINARY_TEST(cmhi, 16b)
-GEN_BINARY_TEST(cmhi, 8b)
-
-GEN_BINARY_TEST(cmgt, 2d)
-GEN_BINARY_TEST(cmgt, 4s)
-GEN_BINARY_TEST(cmgt, 2s)
-GEN_BINARY_TEST(cmgt, 8h)
-GEN_BINARY_TEST(cmgt, 4h)
-GEN_BINARY_TEST(cmgt, 16b)
-GEN_BINARY_TEST(cmgt, 8b)
-
-GEN_BINARY_TEST(cmhs, 2d)
-GEN_BINARY_TEST(cmhs, 4s)
-GEN_BINARY_TEST(cmhs, 2s)
-GEN_BINARY_TEST(cmhs, 8h)
-GEN_BINARY_TEST(cmhs, 4h)
-GEN_BINARY_TEST(cmhs, 16b)
-GEN_BINARY_TEST(cmhs, 8b)
-
-GEN_BINARY_TEST(cmge, 2d)
-GEN_BINARY_TEST(cmge, 4s)
-GEN_BINARY_TEST(cmge, 2s)
-GEN_BINARY_TEST(cmge, 8h)
-GEN_BINARY_TEST(cmge, 4h)
-GEN_BINARY_TEST(cmge, 16b)
-GEN_BINARY_TEST(cmge, 8b)
+GEN_BINARY_TEST(cmge, 2d, 2d, 2d)
+GEN_BINARY_TEST(cmge, 4s, 4s, 4s)
+GEN_BINARY_TEST(cmge, 2s, 2s, 2s)
+GEN_BINARY_TEST(cmge, 8h, 8h, 8h)
+GEN_BINARY_TEST(cmge, 4h, 4h, 4h)
+GEN_BINARY_TEST(cmge, 16b, 16b, 16b)
+GEN_BINARY_TEST(cmge, 8b, 8b, 8b)
GEN_SHIFT_TEST(ushr, 2d, 2d, 1)
GEN_SHIFT_TEST(ushr, 2d, 2d, 13)
@@ -975,89 +1088,28 @@
GEN_UNARY_TEST(xtn, 8b, 8h)
GEN_UNARY_TEST(xtn2, 16b, 8h)
+GEN_ONEINT_ONEVEC_TEST(umov_x_d0, "umov x9, v10.d[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_x_d1, "umov x9, v10.d[1]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_w_s0, "umov w9, v10.s[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_w_s3, "umov w9, v10.s[3]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_w_h0, "umov w9, v10.h[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_w_h7, "umov w9, v10.h[7]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_w_b0, "umov w9, v10.b[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_w_b15, "umov w9, v10.b[15]", 9, 10)
-/* Generate a test that involves one integer reg and one vector reg,
- with no bias as towards which is input or output. */
-#define GEN_ONEINT_ONEVEC_TEST(TESTNAME,INSN,INTREGNO,VECREGNO) \
- __attribute__((noinline)) \
- static void test_##TESTNAME ( void ) { \
- Int i; \
- for (i = 0; i < ITERS; i++) { \
- V128 block[4]; \
- memset(block, 0x55, sizeof(block)); \
- randV128(&block[0]); \
- randV128(&block[1]); \
- randV128(&block[2]); \
- randV128(&block[3]); \
- __asm__ __volatile__( \
- "ldr q"#VECREGNO", [%0, #0] ; " \
- "ldr x"#INTREGNO", [%0, #16] ; " \
- INSN " ; " \
- "str q"#VECREGNO", [%0, #32] ; " \
- "str x"#INTREGNO", [%0, #48] ; " \
- : : "r"(&block[0]) : "memory", "v"#VECREGNO, "x"#INTREGNO \
- ); \
- printf(INSN " "); \
- showV128(&block[0]); printf(" "); \
- showV128(&block[1]); printf(" "); \
- showV128(&block[2]); printf(" "); \
- showV128(&block[3]); printf("\n"); \
- } \
- }
+GEN_ONEINT_ONEVEC_TEST(smov_x_s0, "smov x9, v10.s[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_x_s3, "smov x9, v10.s[3]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_x_h0, "smov x9, v10.h[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_x_h7, "smov x9, v10.h[7]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_w_h0, "smov w9, v10.h[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_w_h7, "smov w9, v10.h[7]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_x_b0, "smov x9, v10.b[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_x_b15, "smov x9, v10.b[15]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_w_b0, "smov w9, v10.b[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_w_b15, "smov w9, v10.b[15]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(umov_01, "umov x9, v10.d[0]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(umov_02, "umov x9, v10.d[1]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(umov_03, "umov w9, v10.s[0]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(umov_04, "umov w9, v10.s[3]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(umov_05, "umov w9, v10.h[0]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(umov_06, "umov w9, v10.h[7]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(umov_07, "umov w9, v10.b[0]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(umov_08, "umov w9, v10.b[15]", 9, 10)
-
-GEN_ONEINT_ONEVEC_TEST(smov_01, "smov x9, v10.s[0]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(smov_02, "smov x9, v10.s[3]", 9, 10)
-
-GEN_ONEINT_ONEVEC_TEST(smov_03, "smov x9, v10.h[0]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(smov_04, "smov x9, v10.h[7]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(smov_05, "smov w9, v10.h[0]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(smov_06, "smov w9, v10.h[7]", 9, 10)
-
-GEN_ONEINT_ONEVEC_TEST(smov_07, "smov x9, v10.b[0]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(smov_08, "smov x9, v10.b[15]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(smov_09, "smov w9, v10.b[0]", 9, 10)
-GEN_ONEINT_ONEVEC_TEST(smov_10, "smov w9, v10.b[15]", 9, 10)
-
-/* Generate a test that involves two vector regs,
- with no bias as towards which is input or output. */
-#define GEN_TWOVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO) \
- __attribute__((noinline)) \
- static void test_##TESTNAME ( void ) { \
- Int i; \
- for (i = 0; i < ITERS; i++) { \
- V128 block[4]; \
- memset(block, 0x55, sizeof(block)); \
- randV128(&block[0]); \
- randV128(&block[1]); \
- randV128(&block[2]); \
- randV128(&block[3]); \
- __asm__ __volatile__( \
- "ldr q"#VECREG1NO", [%0, #0] ; " \
- "ldr q"#VECREG2NO", [%0, #16] ; " \
- INSN " ; " \
- "str q"#VECREG1NO", [%0, #32] ; " \
- "str q"#VECREG2NO", [%0, #48] ; " \
- : : "r"(&block[0]) : "memory", "v"#VECREG1NO, "v"#VECREG2NO \
- ); \
- printf(INSN " "); \
- showV128(&block[0]); printf(" "); \
- showV128(&block[1]); printf(" "); \
- showV128(&block[2]); printf(" "); \
- showV128(&block[3]); printf("\n"); \
- } \
- }
-
-GEN_TWOVEC_TEST(fcvtn_01, "fcvtn v22.2s, v23.2d", 22, 23)
-GEN_TWOVEC_TEST(fcvtn_02, "fcvtn2 v22.4s, v23.2d", 22, 23)
+GEN_TWOVEC_TEST(fcvtn_2s_2d, "fcvtn v22.2s, v23.2d", 22, 23)
+GEN_TWOVEC_TEST(fcvtn_4s_2d, "fcvtn2 v22.4s, v23.2d", 22, 23)
GEN_UNARY_TEST(neg, 2d, 2d)
GEN_UNARY_TEST(neg, 4s, 4s)
@@ -1066,122 +1118,88 @@
GEN_UNARY_TEST(neg, 4h, 4h)
GEN_UNARY_TEST(neg, 16b, 16b)
GEN_UNARY_TEST(neg, 8b, 8b)
-GEN_BINARY_TEST(fadd, 2d)
-GEN_BINARY_TEST(fadd, 4s)
-GEN_BINARY_TEST(fadd, 2s)
-GEN_BINARY_TEST(fsub, 2d)
-GEN_BINARY_TEST(fsub, 4s)
-GEN_BINARY_TEST(fsub, 2s)
-GEN_BINARY_TEST(fmul, 2d)
-GEN_BINARY_TEST(fmul, 4s)
-GEN_BINARY_TEST(fmul, 2s)
-GEN_BINARY_TEST(fdiv, 2d)
-GEN_BINARY_TEST(fdiv, 4s)
-GEN_BINARY_TEST(fdiv, 2s)
-GEN_BINARY_TEST(fmla, 2d)
-GEN_BINARY_TEST(fmla, 4s)
-GEN_BINARY_TEST(fmla, 2s)
-GEN_BINARY_TEST(fmls, 2d)
-GEN_BINARY_TEST(fmls, 4s)
-GEN_BINARY_TEST(fmls, 2s)
-GEN_BINARY_TEST(fabd, 2d)
-GEN_BINARY_TEST(fabd, 4s)
-GEN_BINARY_TEST(fabd, 2s)
-/* Generate a test that involves three vector regs,
- with no bias as towards which is input or output. It's also OK
- to use v16, v17, v18 as scratch. */
-#define GEN_THREEVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO,VECREG3NO) \
- __attribute__((noinline)) \
- static void test_##TESTNAME ( void ) { \
- Int i; \
- for (i = 0; i < ITERS; i++) { \
- V128 block[6]; \
- memset(block, 0x55, sizeof(block)); \
- randV128(&block[0]); \
- randV128(&block[1]); \
- randV128(&block[2]); \
- randV128(&block[3]); \
- randV128(&block[4]); \
- randV128(&block[5]); \
- __asm__ __volatile__( \
- "ldr q"#VECREG1NO", [%0, #0] ; " \
- "ldr q"#VECREG2NO", [%0, #16] ; " \
- "ldr q"#VECREG3NO", [%0, #32] ; " \
- INSN " ; " \
- "str q"#VECREG1NO", [%0, #48] ; " \
- "str q"#VECREG2NO", [%0, #64] ; " \
- "str q"#VECREG3NO", [%0, #80] ; " \
- : : "r"(&block[0]) \
- : "memory", "v"#VECREG1NO, "v"#VECREG2NO, "v"#VECREG3NO, \
- "v16", "v17", "v18" \
- ); \
- printf(INSN " "); \
- showV128(&block[0]); printf(" "); \
- showV128(&block[1]); printf(" "); \
- showV128(&block[2]); printf(" "); \
- showV128(&block[3]); printf(" "); \
- showV128(&block[4]); printf(" "); \
- showV128(&block[5]); printf("\n"); \
- } \
- }
+GEN_BINARY_TEST(fadd, 2d, 2d, 2d)
+GEN_BINARY_TEST(fadd, 4s, 4s, 4s)
+GEN_BINARY_TEST(fadd, 2s, 2s, 2s)
+GEN_BINARY_TEST(fsub, 2d, 2d, 2d)
+GEN_BINARY_TEST(fsub, 4s, 4s, 4s)
+GEN_BINARY_TEST(fsub, 2s, 2s, 2s)
+GEN_BINARY_TEST(fmul, 2d, 2d, 2d)
+GEN_BINARY_TEST(fmul, 4s, 4s, 4s)
+GEN_BINARY_TEST(fmul, 2s, 2s, 2s)
+GEN_BINARY_TEST(fdiv, 2d, 2d, 2d)
+GEN_BINARY_TEST(fdiv, 4s, 4s, 4s)
+GEN_BINARY_TEST(fdiv, 2s, 2s, 2s)
+GEN_BINARY_TEST(fmla, 2d, 2d, 2d)
+GEN_BINARY_TEST(fmla, 4s, 4s, 4s)
+GEN_BINARY_TEST(fmla, 2s, 2s, 2s)
+GEN_BINARY_TEST(fmls, 2d, 2d, 2d)
+GEN_BINARY_TEST(fmls, 4s, 4s, 4s)
+GEN_BINARY_TEST(fmls, 2s, 2s, 2s)
+GEN_BINARY_TEST(fabd, 2d, 2d, 2d)
+GEN_BINARY_TEST(fabd, 4s, 4s, 4s)
+GEN_BINARY_TEST(fabd, 2s, 2s, 2s)
GEN_THREEVEC_TEST(add_d_d_d, "add d21, d22, d23", 21, 22, 23)
GEN_THREEVEC_TEST(sub_d_d_d, "sub d21, d22, d23", 21, 22, 23)
/* overkill -- don't need two vecs, only one */
-GEN_TWOVEC_TEST(fmov_scalar_imm_01, "fmov d22, #0.125", 22, 23)
-GEN_TWOVEC_TEST(fmov_scalar_imm_02, "fmov d22, #-4.0", 22, 23)
-GEN_TWOVEC_TEST(fmov_scalar_imm_03, "fmov d22, #1.0", 22, 23)
-GEN_TWOVEC_TEST(fmov_scalar_imm_04, "fmov s22, #0.125", 22, 23)
-GEN_TWOVEC_TEST(fmov_scalar_imm_05, "fmov s22, #-4.0", 22, 23)
-GEN_TWOVEC_TEST(fmov_scalar_imm_06, "fmov s22, #-1.0", 22, 23)
+GEN_TWOVEC_TEST(fmov_d_imm_01, "fmov d22, #0.125", 22, 23)
+GEN_TWOVEC_TEST(fmov_d_imm_02, "fmov d22, #-4.0", 22, 23)
+GEN_TWOVEC_TEST(fmov_d_imm_03, "fmov d22, #1.0", 22, 23)
+GEN_TWOVEC_TEST(fmov_s_imm_01, "fmov s22, #0.125", 22, 23)
+GEN_TWOVEC_TEST(fmov_s_imm_02, "fmov s22, #-4.0", 22, 23)
+GEN_TWOVEC_TEST(fmov_s_imm_03, "fmov s22, #-1.0", 22, 23)
-GEN_ONEINT_ONEVEC_TEST(fmov_gen_01, "fmov s7, w15", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(fmov_gen_02, "fmov d7, x15", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(fmov_gen_03, "fmov v7.d[1], x15", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(fmov_gen_04, "fmov w15, s7", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(fmov_gen_05, "fmov x15, d7", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(fmov_gen_06, "fmov x15, v7.d[1]", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_s_w, "fmov s7, w15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_d_x, "fmov d7, x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_d1_x, "fmov v7.d[1], x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_w_s, "fmov w15, s7", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_x_d, "fmov x15, d7", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_x_d1, "fmov x15, v7.d[1]", 15, 7)
-GEN_TWOVEC_TEST(movi_vector_imm_01, "fmov d22, #0.125", 22, 23)
-GEN_TWOVEC_TEST(movi_vector_imm_02, "fmov d22, #-4.0", 22, 23)
-GEN_TWOVEC_TEST(movi_vector_imm_03, "fmov d22, #1.0", 22, 23)
-GEN_TWOVEC_TEST(movi_vector_imm_04, "fmov v22.2d, #0.125", 22, 23)
-GEN_TWOVEC_TEST(movi_vector_imm_05, "fmov v22.2d, #-4.0", 22, 23)
-GEN_TWOVEC_TEST(movi_vector_imm_06, "fmov v22.2d, #1.0", 22, 23)
+GEN_TWOVEC_TEST(fmov_2d_imm_01, "fmov v22.2d, #0.125", 22, 23)
+GEN_TWOVEC_TEST(fmov_2d_imm_02, "fmov v22.2d, #-4.0", 22, 23)
+GEN_TWOVEC_TEST(fmov_2d_imm_03, "fmov v22.2d, #1.0", 22, 23)
+GEN_TWOVEC_TEST(fmov_4s_imm_01, "fmov v22.4s, #0.125", 22, 23)
+GEN_TWOVEC_TEST(fmov_4s_imm_02, "fmov v22.4s, #-4.0", 22, 23)
+GEN_TWOVEC_TEST(fmov_4s_imm_03, "fmov v22.4s, #1.0", 22, 23)
+GEN_TWOVEC_TEST(fmov_2s_imm_01, "fmov v22.2s, #0.125", 22, 23)
+GEN_TWOVEC_TEST(fmov_2s_imm_02, "fmov v22.2s, #-4.0", 22, 23)
+GEN_TWOVEC_TEST(fmov_2s_imm_03, "fmov v22.2s, #1.0", 22, 23)
-GEN_ONEINT_ONEVEC_TEST(sucvtf_01, "scvtf s7, w15", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(sucvtf_02, "scvtf d7, w15", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(sucvtf_03, "scvtf s7, x15", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(sucvtf_04, "scvtf d7, x15", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(sucvtf_05, "ucvtf s7, w15", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(sucvtf_06, "ucvtf d7, w15", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(sucvtf_07, "ucvtf s7, x15", 15, 7)
-GEN_ONEINT_ONEVEC_TEST(sucvtf_08, "ucvtf d7, x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(scvtf_s_w, "scvtf s7, w15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(scvtf_d_w, "scvtf d7, w15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(scvtf_s_x, "scvtf s7, x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(scvtf_d_x, "scvtf d7, x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(ucvtf_s_w, "ucvtf s7, w15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(ucvtf_d_w, "ucvtf d7, w15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(ucvtf_s_x, "ucvtf s7, x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(ucvtf_d_x, "ucvtf d7, x15", 15, 7)
-GEN_THREEVEC_TEST(fadd_d, "fadd d2, d11, d29", 2, 11, 29)
-GEN_THREEVEC_TEST(fadd_s, "fadd s2, s11, s29", 2, 11, 29)
-GEN_THREEVEC_TEST(fsub_d, "fsub d2, d11, d29", 2, 11, 29)
-GEN_THREEVEC_TEST(fsub_s, "fsub s2, s11, s29", 2, 11, 29)
-GEN_THREEVEC_TEST(fmul_d, "fmul d2, d11, d29", 2, 11, 29)
-GEN_THREEVEC_TEST(fmul_s, "fmul s2, s11, s29", 2, 11, 29)
-GEN_THREEVEC_TEST(fdiv_d, "fdiv d2, d11, d29", 2, 11, 29)
-GEN_THREEVEC_TEST(fdiv_s, "fdiv s2, s11, s29", 2, 11, 29)
-GEN_THREEVEC_TEST(fnmul_d, "fnmul d2, d11, d29", 2, 11, 29)
-GEN_THREEVEC_TEST(fnmul_s, "fnmul s2, s11, s29", 2, 11, 29)
+GEN_THREEVEC_TEST(fadd_d_d_d, "fadd d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fadd_s_s_s, "fadd s2, s11, s29", 2, 11, 29)
+GEN_THREEVEC_TEST(fsub_d_d_d, "fsub d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fsub_s_s_s, "fsub s2, s11, s29", 2, 11, 29)
+GEN_THREEVEC_TEST(fmul_d_d_d, "fmul d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fmul_s_s_s, "fmul s2, s11, s29", 2, 11, 29)
+GEN_THREEVEC_TEST(fdiv_d_d_d, "fdiv d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fdiv_s_s_s, "fdiv s2, s11, s29", 2, 11, 29)
+GEN_THREEVEC_TEST(fnmul_d_d_d, "fnmul d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fnmul_s_s_s, "fnmul s2, s11, s29", 2, 11, 29)
-GEN_THREEVEC_TEST(fabd_d, "fabd d2, d11, d29", 2, 11, 29)
-GEN_THREEVEC_TEST(fabd_s, "fabd s2, s11, s29", 2, 11, 29)
+GEN_THREEVEC_TEST(fabd_d_d_d, "fabd d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fabd_s_s_s, "fabd s2, s11, s29", 2, 11, 29)
-GEN_TWOVEC_TEST(fmov_d, "fmov d22, d23", 22, 23)
-GEN_TWOVEC_TEST(fmov_s, "fmov s22, s23", 22, 23)
-GEN_TWOVEC_TEST(fabs_d, "fabs d22, d23", 22, 23)
-GEN_TWOVEC_TEST(fabs_s, "fabs s22, s23", 22, 23)
-GEN_TWOVEC_TEST(fneg_d, "fneg d22, d23", 22, 23)
-GEN_TWOVEC_TEST(fneg_s, "fneg s22, s23", 22, 23)
-GEN_TWOVEC_TEST(fsqrt_d, "fsqrt d22, d23", 22, 23)
-GEN_TWOVEC_TEST(fsqrt_s, "fsqrt s22, s23", 22, 23)
+GEN_TWOVEC_TEST(fmov_d_d, "fmov d22, d23", 22, 23)
+GEN_TWOVEC_TEST(fmov_s_s, "fmov s22, s23", 22, 23)
+GEN_TWOVEC_TEST(fabs_d_d, "fabs d22, d23", 22, 23)
+GEN_TWOVEC_TEST(fabs_s_s, "fabs s22, s23", 22, 23)
+GEN_TWOVEC_TEST(fneg_d_d, "fneg d22, d23", 22, 23)
+GEN_TWOVEC_TEST(fneg_s_s, "fneg s22, s23", 22, 23)
+GEN_TWOVEC_TEST(fsqrt_d_d, "fsqrt d22, d23", 22, 23)
+GEN_TWOVEC_TEST(fsqrt_s_s, "fsqrt s22, s23", 22, 23)
GEN_UNARY_TEST(fneg, 2d, 2d)
GEN_UNARY_TEST(fneg, 4s, 4s)
@@ -1190,21 +1208,21 @@
GEN_UNARY_TEST(fabs, 4s, 4s)
GEN_UNARY_TEST(fabs, 2s, 2s)
-GEN_BINARY_TEST(fcmeq, 2d)
-GEN_BINARY_TEST(fcmeq, 4s)
-GEN_BINARY_TEST(fcmeq, 2s)
-GEN_BINARY_TEST(fcmge, 2d)
-GEN_BINARY_TEST(fcmge, 4s)
-GEN_BINARY_TEST(fcmge, 2s)
-GEN_BINARY_TEST(fcmgt, 2d)
-GEN_BINARY_TEST(fcmgt, 4s)
-GEN_BINARY_TEST(fcmgt, 2s)
-GEN_BINARY_TEST(facge, 2d)
-GEN_BINARY_TEST(facge, 4s)
-GEN_BINARY_TEST(facge, 2s)
-GEN_BINARY_TEST(facgt, 2d)
-GEN_BINARY_TEST(facgt, 4s)
-GEN_BINARY_TEST(facgt, 2s)
+GEN_BINARY_TEST(fcmeq, 2d, 2d, 2d)
+GEN_BINARY_TEST(fcmeq, 4s, 4s, 4s)
+GEN_BINARY_TEST(fcmeq, 2s, 2s, 2s)
+GEN_BINARY_TEST(fcmge, 2d, 2d, 2d)
+GEN_BINARY_TEST(fcmge, 4s, 4s, 4s)
+GEN_BINARY_TEST(fcmge, 2s, 2s, 2s)
+GEN_BINARY_TEST(fcmgt, 2d, 2d, 2d)
+GEN_BINARY_TEST(fcmgt, 4s, 4s, 4s)
+GEN_BINARY_TEST(fcmgt, 2s, 2s, 2s)
+GEN_BINARY_TEST(facge, 2d, 2d, 2d)
+GEN_BINARY_TEST(facge, 4s, 4s, 4s)
+GEN_BINARY_TEST(facge, 2s, 2s, 2s)
+GEN_BINARY_TEST(facgt, 2d, 2d, 2d)
+GEN_BINARY_TEST(facgt, 4s, 4s, 4s)
+GEN_BINARY_TEST(facgt, 2s, 2s, 2s)
// Uses v15 as the first table entry
GEN_THREEVEC_TEST(
@@ -1280,515 +1298,1084 @@
"tbx v21.8b, {v15.16b, v16.16b, v17.16b, v18.16b}, v23.8b",
21, 15, 23)
-GEN_TWOVEC_TEST(cmge_zero_2d, "cmge v5.2d, v22.2d, #0", 5, 22)
-GEN_TWOVEC_TEST(cmge_zero_4s, "cmge v5.4s, v22.4s, #0", 5, 22)
-GEN_TWOVEC_TEST(cmge_zero_2s, "cmge v5.2s, v22.2s, #0", 5, 22)
-GEN_TWOVEC_TEST(cmge_zero_8h, "cmge v5.8h, v22.8h, #0", 5, 22)
-GEN_TWOVEC_TEST(cmge_zero_4h, "cmge v5.4h, v22.4h, #0", 5, 22)
-GEN_TWOVEC_TEST(cmge_zero_16b, "cmge v5.16b, v22.16b, #0", 5, 22)
-GEN_TWOVEC_TEST(cmge_zero_8b, "cmge v5.8b, v22.8b, #0", 5, 22)
+GEN_TWOVEC_TEST(cmge_zero_2d_2d, "cmge v5.2d, v22.2d, #0", 5, 22)
+GEN_TWOVEC_TEST(cmge_zero_4s_4s, "cmge v5.4s, v22.4s, #0", 5, 22)
+GEN_TWOVEC_TEST(cmge_zero_2s_2s, "cmge v5.2s, v22.2s, #0", 5, 22)
+GEN_TWOVEC_TEST(cmge_zero_8h_8h, "cmge v5.8h, v22.8h, #0", 5, 22)
+GEN_TWOVEC_TEST(cmge_zero_4h_4h, "cmge v5.4h, v22.4h, #0", 5, 22)
+GEN_TWOVEC_TEST(cmge_zero_16b_16b, "cmge v5.16b, v22.16b, #0", 5, 22)
+GEN_TWOVEC_TEST(cmge_zero_8b_8b, "cmge v5.8b, v22.8b, #0", 5, 22)
-GEN_TWOVEC_TEST(cmgt_zero_2d, "cmgt v5.2d, v22.2d, #0", 5, 22)
-GEN_TWOVEC_TEST(cmgt_zero_4s, "cmgt v5.4s, v22.4s, #0", 5, 22)
-GEN_TWOVEC_TEST(cmgt_zero_2s, "cmgt v5.2s, v22.2s, #0", 5, 22)
-GEN_TWOVEC_TEST(cmgt_zero_8h, "cmgt v5.8h, v22.8h, #0", 5, 22)
-GEN_TWOVEC_TEST(cmgt_zero_4h, "cmgt v5.4h, v22.4h, #0", 5, 22)
-GEN_TWOVEC_TEST(cmgt_zero_16b, "cmgt v5.16b, v22.16b, #0", 5, 22)
-GEN_TWOVEC_TEST(cmgt_zero_8b, "cmgt v5.8b, v22.8b, #0", 5, 22)
+GEN_TWOVEC_TEST(cmgt_zero_2d_2d, "cmgt v5.2d, v22.2d, #0", 5, 22)
+GEN_TWOVEC_TEST(cmgt_zero_4s_4s, "cmgt v5.4s, v22.4s, #0", 5, 22)
+GEN_TWOVEC_TEST(cmgt_zero_2s_2s, "cmgt v5.2s, v22.2s, #0", 5, 22)
+GEN_TWOVEC_TEST(cmgt_zero_8h_8h, "cmgt v5.8h, v22.8h, #0", 5, 22)
+GEN_TWOVEC_TEST(cmgt_zero_4h_4h, "cmgt v5.4h, v22.4h, #0", 5, 22)
+GEN_TWOVEC_TEST(cmgt_zero_16b_16b, "cmgt v5.16b, v22.16b, #0", 5, 22)
+GEN_TWOVEC_TEST(cmgt_zero_8b_8b, "cmgt v5.8b, v22.8b, #0", 5, 22)
-GEN_TWOVEC_TEST(cmle_zero_2d, "cmle v5.2d, v22.2d, #0", 5, 22)
-GEN_TWOVEC_TEST(cmle_zero_4s, "cmle v5.4s, v22.4s, #0", 5, 22)
-GEN_TWOVEC_TEST(cmle_zero_2s, "cmle v5.2s, v22.2s, #0", 5, 22)
-GEN_TWOVEC_TEST(cmle_zero_8h, "cmle v5.8h, v22.8h, #0", 5, 22)
-GEN_TWOVEC_TEST(cmle_zero_4h, "cmle v5.4h, v22.4h, #0", 5, 22)
-GEN_TWOVEC_TEST(cmle_zero_16b, "cmle v5.16b, v22.16b, #0", 5, 22)
-GEN_TWOVEC_TEST(cmle_zero_8b, "cmle v5.8b, v22.8b, #0", 5, 22)
+GEN_TWOVEC_TEST(cmle_zero_2d_2d, "cmle v5.2d, v22.2d, #0", 5, 22)
+GEN_TWOVEC_TEST(cmle_zero_4s_4s, "cmle v5.4s, v22.4s, #0", 5, 22)
+GEN_TWOVEC_TEST(cmle_zero_2s_2s, "cmle v5.2s, v22.2s, #0", 5, 22)
+GEN_TWOVEC_TEST(cmle_zero_8h_8h, "cmle v5.8h, v22.8h, #0", 5, 22)
+GEN_TWOVEC_TEST(cmle_zero_4h_4h, "cmle v5.4h, v22.4h, #0", 5, 22)
+GEN_TWOVEC_TEST(cmle_zero_16b_16b, "cmle v5.16b, v22.16b, #0", 5, 22)
+GEN_TWOVEC_TEST(cmle_zero_8b_8b, "cmle v5.8b, v22.8b, #0", 5, 22)
-GEN_TWOVEC_TEST(cmeq_zero_2d, "cmeq v5.2d, v22.2d, #0", 5, 22)
-GEN_TWOVEC_TEST(cmeq_zero_4s, "cmeq v5.4s, v22.4s, #0", 5, 22)
-GEN_TWOVEC_TEST(cmeq_zero_2s, "cmeq v5.2s, v22.2s, #0", 5, 22)
-GEN_TWOVEC_TEST(cmeq_zero_8h, "cmeq v5.8h, v22.8h, #0", 5, 22)
-GEN_TWOVEC_TEST(cmeq_zero_4h, "cmeq v5.4h, v22.4h, #0", 5, 22)
-GEN_TWOVEC_TEST(cmeq_zero_16b, "cmeq v5.16b, v22.16b, #0", 5, 22)
-GEN_TWOVEC_TEST(cmeq_zero_8b, "cmeq v5.8b, v22.8b, #0", 5, 22)
+GEN_TWOVEC_TEST(cmeq_zero_2d_2d, "cmeq v5.2d, v22.2d, #0", 5, 22)
+GEN_TWOVEC_TEST(cmeq_zero_4s_4s, "cmeq v5.4s, v22.4s, #0", 5, 22)
+GEN_TWOVEC_TEST(cmeq_zero_2s_2s, "cmeq v5.2s, v22.2s, #0", 5, 22)
+GEN_TWOVEC_TEST(cmeq_zero_8h_8h, "cmeq v5.8h, v22.8h, #0", 5, 22)
+GEN_TWOVEC_TEST(cmeq_zero_4h_4h, "cmeq v5.4h, v22.4h, #0", 5, 22)
+GEN_TWOVEC_TEST(cmeq_zero_16b_16b, "cmeq v5.16b, v22.16b, #0", 5, 22)
+GEN_TWOVEC_TEST(cmeq_zero_8b_8b, "cmeq v5.8b, v22.8b, #0", 5, 22)
-GEN_TWOVEC_TEST(cmlt_zero_2d, "cmlt v5.2d, v22.2d, #0", 5, 22)
-GEN_TWOVEC_TEST(cmlt_zero_4s, "cmlt v5.4s, v22.4s, #0", 5, 22)
-GEN_TWOVEC_TEST(cmlt_zero_2s, "cmlt v5.2s, v22.2s, #0", 5, 22)
-GEN_TWOVEC_TEST(cmlt_zero_8h, "cmlt v5.8h, v22.8h, #0", 5, 22)
-GEN_TWOVEC_TEST(cmlt_zero_4h, "cmlt v5.4h, v22.4h, #0", 5, 22)
-GEN_TWOVEC_TEST(cmlt_zero_16b, "cmlt v5.16b, v22.16b, #0", 5, 22)
-GEN_TWOVEC_TEST(cmlt_zero_8b, "cmlt v5.8b, v22.8b, #0", 5, 22)
+GEN_TWOVEC_TEST(cmlt_zero_2d_2d, "cmlt v5.2d, v22.2d, #0", 5, 22)
+GEN_TWOVEC_TEST(cmlt_zero_4s_4s, "cmlt v5.4s, v22.4s, #0", 5, 22)
+GEN_TWOVEC_TEST(cmlt_zero_2s_2s, "cmlt v5.2s, v22.2s, #0", 5, 22)
+GEN_TWOVEC_TEST(cmlt_zero_8h_8h, "cmlt v5.8h, v22.8h, #0", 5, 22)
+GEN_TWOVEC_TEST(cmlt_zero_4h_4h, "cmlt v5.4h, v22.4h, #0", 5, 22)
+GEN_TWOVEC_TEST(cmlt_zero_16b_16b, "cmlt v5.16b, v22.16b, #0", 5, 22)
+GEN_TWOVEC_TEST(cmlt_zero_8b_8b, "cmlt v5.8b, v22.8b, #0", 5, 22)
+/* ---------------------------------------------------------------- */
+/* -- main() -- */
+/* ---------------------------------------------------------------- */
-/* IMPORTANT: keep the tests in here in the same order as the
- implementations are in guest_arm64_toIR.c. */
int main ( void )
{
assert(sizeof(V128) == 16);
- printf("BEGIN: FMOV (general)\n");
- test_fmov_gen_01();
- test_fmov_gen_02();
- test_fmov_gen_03();
- test_fmov_gen_04();
- test_fmov_gen_05();
- test_fmov_gen_06();
- printf("END: FMOV (general)\n\n");
+ // ======================== FP ========================
- printf("BEGIN: FMOV (scalar, immediate)\n");
- test_fmov_scalar_imm_01();
- test_fmov_scalar_imm_02();
- test_fmov_scalar_imm_03();
- test_fmov_scalar_imm_04();
- test_fmov_scalar_imm_05();
- test_fmov_scalar_imm_06();
- printf("END: FMOV (scalar, immediate)\n\n");
+ // fabs d,s
+ // fabs 2d,4s,2s
+ test_fabs_d_d(TyDF);
+ test_fabs_s_s(TySF);
+ test_fabs_2d_2d(TyDF);
+ test_fabs_4s_4s(TySF);
+ test_fabs_2s_2s(TySF);
+ test_fneg_2d_2d(TyDF);
+ test_fneg_4s_4s(TySF);
+ test_fneg_2s_2s(TySF);
- printf("BEGIN: {FMOV,MOVI} (vector, immediate)\n");
- test_movi_vector_imm_01();
- test_movi_vector_imm_02();
- test_movi_vector_imm_03();
- test_movi_vector_imm_04();
- test_movi_vector_imm_05();
- test_movi_vector_imm_06();
- printf("END: {FMOV,MOVI} (vector, immediate)\n\n");
+ // fneg d,s
+ // fneg 2d,4s,2s
+ test_fneg_d_d(TyDF);
+ test_fneg_s_s(TySF);
- printf("BEGIN: {S,U}CVTF (scalar, integer)\n");
- test_sucvtf_01();
- test_sucvtf_02();
- test_sucvtf_03();
- test_sucvtf_04();
- test_sucvtf_05();
- test_sucvtf_06();
- test_sucvtf_07();
- test_sucvtf_08();
- printf("END: {S,U}CVTF (scalar, integer)\n\n");
+ // fsqrt d,s
+ // fsqrt 2d,4s,2s
+ test_fsqrt_d_d(TyDF);
+ test_fsqrt_s_s(TySF);
- printf("BEGIN: F{ADD,SUB,MUL,DIV,NMUL} (scalar)\n");
- test_fadd_d();
- test_fadd_s();
- test_fsub_d();
- test_fsub_s();
- test_fmul_d();
- test_fmul_s();
- test_fdiv_d();
- test_fdiv_s();
- test_fnmul_d();
- test_fnmul_s();
- printf("END: F{ADD,SUB,MUL,DIV,NMUL} (scalar)\n\n");
+ // fadd d,s
+ // fsub d,s
+ test_fadd_d_d_d(TyDF);
+ test_fadd_s_s_s(TySF);
+ test_fsub_d_d_d(TyDF);
+ test_fsub_s_s_s(TySF);
- printf("BEGIN: F{MOV,ABS,NEG,SQRT} D/D or S/S\n");
- test_fmov_d();
- test_fmov_s();
- test_fabs_d();
- test_fabs_s();
- test_fneg_d();
- test_fneg_s();
- test_fsqrt_d();
- test_fsqrt_s();
- printf("END: F{MOV,ABS,NEG,SQRT} D/D or S/S\n\n");
+ // fadd 2d,4s,2s
+ // fsub 2d,4s,2s
+ test_fadd_2d_2d_2d(TyDF);
+ test_fadd_4s_4s_4s(TySF);
+ test_fadd_2s_2s_2s(TySF);
+ test_fsub_2d_2d_2d(TyDF);
+ test_fsub_4s_4s_4s(TySF);
+ test_fsub_2s_2s_2s(TySF);
- printf("BEGIN: F{ABS,NEG} (vector)\n");
- test_fabs_2d_2d();
- test_fabs_4s_4s();
- test_fabs_2s_2s();
- test_fneg_2d_2d();
- test_fneg_4s_4s();
- test_fneg_2s_2s();
- printf("END: F{ABS,NEG} (vector)\n\n");
+ // fabd d,s
+ // fabd 2d,4s,2s
+ test_fabd_d_d_d(TyDF);
+ test_fabd_s_s_s(TySF);
+ test_fabd_2d_2d_2d(TyDF);
+ test_fabd_4s_4s_4s(TySF);
+ test_fabd_2s_2s_2s(TySF);
- printf("FCMP,FCMPE MISSING\n\n");
+ // faddp d,s (floating add pair)
+ // faddp 2d,4s,2s
- printf("F{N}M{ADD,SUB} MISSING\n\n");
+ // fccmp d,s (floating point conditional quiet compare)
+ // fccmpe d,s (floating point conditional signaling compare)
- printf("FCVT{N,P,M,Z}{S,U} (scalar, integer) MISSING\n\n");
+ // fcmeq d,s
+ // fcmge d,s
+ // fcmgt d,s
+ // facgt d,s (floating abs compare GT)
+ // facge d,s (floating abs compare GE)
- printf("FRINT{I,M,P,Z} (scalar) MISSING\n\n");
+ // fcmeq 2d,4s,2s
+ // fcmge 2d,4s,2s
+ // fcmgt 2d,4s,2s
+ // facge 2d,4s,2s
+ // facgt 2d,4s,2s
+ test_fcmeq_2d_2d_2d(TyDF);
+ test_fcmeq_4s_4s_4s(TySF);
+ test_fcmeq_2s_2s_2s(TySF);
+ test_fcmge_2d_2d_2d(TyDF);
+ test_fcmge_4s_4s_4s(TySF);
+ test_fcmge_2s_2s_2s(TySF);
+ test_fcmgt_2d_2d_2d(TyDF);
+ test_fcmgt_4s_4s_4s(TySF);
+ test_fcmgt_2s_2s_2s(TySF);
+ test_facge_2d_2d_2d(TyDF);
+ test_facge_4s_4s_4s(TySF);
+ test_facge_2s_2s_2s(TySF);
+ test_facgt_2d_2d_2d(TyDF);
+ test_facgt_4s_4s_4s(TySF);
+ test_facgt_2s_2s_2s(TySF);
- printf("FCVT (scalar) MISSING\n\n");
+ // fcmeq_z d,s
+ // fcmge_z d,s
+ // fcmgt_z d,s
+ // fcmle_z d,s
+ // fcmlt_z d,s
- printf("BEGIN: FABD (scalar) MISSING\n");
- test_fabd_d();
- test_fabd_s();
- printf("END: FABD (scalar) MISSING\n\n");
+ // fcmeq_z 2d,4s,2s
+ // fcmge_z 2d,4s,2s
+ // fcmgt_z 2d,4s,2s
+ // fcmle_z 2d,4s,2s
+ // fcmlt_z 2d,4s,2s
- printf("{S,U}CVTF (vector, integer) MISSING\n\n");
+ // fcmp_z d,s
+ // fcmpe_z d,s
+ // fcmp d,s (floating point quiet, set flags)
+ // fcmpe d,s (floating point signaling, set flags)
- printf("BEGIN: F{ADD,SUB,MUL,DIV,MLA,MLS,ABD} (vector)\n");
- test_fadd_2d();
- test_fadd_4s();
- test_fadd_2s();
- test_fsub_2d();
- test_fsub_4s();
- test_fsub_2s();
- test_fmul_2d();
- test_fmul_4s();
- test_fmul_2s();
- test_fdiv_2d();
- test_fdiv_4s();
- test_fdiv_2s();
- test_fmla_2d();
- test_fmla_4s();
- test_fmla_2s();
- test_fmls_2d();
- test_fmls_4s();
- test_fmls_2s();
- test_fabd_2d();
- test_fabd_4s();
- test_fabd_2s();
- printf("END: F{ADD,SUB,MUL,DIV,MLA,MLS,ABD} (vector)\n\n");
+ // fcsel d,s (fp cond select)
- printf("BEGIN: FCM{EQ,GE,GT}, FAC{GE,GT} (vector)\n");
- test_fcmeq_2d();
- test_fcmeq_4s();
- test_fcmeq_2s();
- test_fcmge_2d();
- test_fcmge_4s();
- test_fcmge_2s();
- test_fcmgt_2d();
- test_fcmgt_4s();
- test_fcmgt_2s();
- test_facge_2d();
- test_facge_4s();
- test_facge_2s();
- test_facgt_2d();
- test_facgt_4s();
- test_facgt_2s();
- printf("END: FCM{EQ,GE,GT}, FAC{GE,GT} (vector)\n");
+ // fdiv d,s
+ // fdiv 2d,4s,2s
+ test_fdiv_d_d_d(TyDF);
+ test_fdiv_s_s_s(TySF);
+ test_fdiv_2d_2d_2d(TyDF);
+ test_fdiv_4s_4s_4s(TySF);
+ test_fdiv_2s_2s_2s(TySF);
- printf("BEGIN: FCVTN (MISSING 16F <- 32F cases)\n");
- test_fcvtn_01();
- test_fcvtn_02();
- printf("END: FCVTN (MISSING 16F <- 32F cases)\n\n");
+ // fmadd d,s
+ // fnmadd d,s
+ // fmsub d,s
+ // fnmsub d,s
- printf("BEGIN: ADD/SUB (vector)\n");
- test_add_2d();
- test_add_4s();
- test_add_2s();
- test_add_8h();
- test_add_4h();
- test_add_16b();
- test_add_8b();
- test_sub_2d();
- test_sub_4s();
- test_sub_2s();
- test_sub_8h();
- test_sub_4h();
- test_sub_16b();
- test_sub_8b();
- printf("END: ADD/SUB (vector)\n\n");
+ // fnmul d,s
+ test_fnmul_d_d_d(TyDF);
+ test_fnmul_s_s_s(TySF);
- printf("BEGIN: ADD/SUB (scalar)\n");
- test_add_d_d_d();
- test_sub_d_d_d();
- printf("END: ADD/SUB (scalar)\n\n");
+ // fmax d,s
+ // fmin d,s
+ // fmaxnm d,s ("max number")
+ // fminnm d,s
- printf("BEGIN: MUL/PMUL/MLA/MLS (vector)\n");
- test_mul_4s();
- test_mul_2s();
- test_mul_8h();
- test_mul_4h();
- test_mul_16b();
- test_mul_8b();
- test_mla_4s();
- test_mla_2s();
- test_mla_8h();
- test_mla_4h();
- test_mla_16b();
- test_mla_8b();
- test_mls_4s();
- test_mls_2s();
- test_mls_8h();
- test_mls_4h();
- test_mls_16b();
- test_mls_8b();
- printf("END: MUL/PMUL/MLA/MLS (vector) (MISSING PMUL)\n\n");
+ // fmax 2d,4s,2s
+ // fmin 2d,4s,2s
+ // fmaxnm 2d,4s,2s
+ // fminnm 2d,4s,2s
- printf("BEGIN: {S,U}{MIN,MAX} (vector)\n");
- test_umax_4s();
- test_umax_8h();
- test_umax_4h();
- test_umax_16b();
- test_umax_8b();
- test_umin_4s();
- test_umin_8h();
- test_umin_4h();
- test_umin_16b();
- test_umin_8b();
- test_smax_4s();
- test_smax_8h();
- test_smax_4h();
- test_smax_16b();
- test_smax_8b();
- test_smin_4s();
- test_smin_8h();
- test_smin_4h();
- test_smin_16b();
- test_smin_8b();
- printf("END: {S,U}{MIN,MAX} (vector)\n\n");
+ // fmaxnmp d_2d,s_2s ("max number pairwise")
+ // fminnmp d_2d,s_2s
- printf("BEGIN: {S,U}{MIN,MAX}V\n");
- test_UMINV();
+ // fmaxnmp 2d,4s,2s
+ // fminnmp 2d,4s,2s
+
+ // fmaxnmv s_4s (maxnum across vector)
+ // fminnmv s_4s
+
+ // fmaxp d_2d,s_2s (max of a pair)
+ // fminp d_2d,s_2s (min of a pair)
+
+ // fmaxp 2d,4s,2s (max pairwise)
+ // fminp 2d,4s,2s
+
+ // fmaxv s_4s (max across vector)
+ // fminv s_4s
+
+ // fmla 2d,4s,2s
+ // fmls 2d,4s,2s
+ test_fmla_2d_2d_2d(TyDF);
+ test_fmla_4s_4s_4s(TySF);
+ test_fmla_2s_2s_2s(TySF);
+ test_fmls_2d_2d_2d(TyDF);
+ test_fmls_4s_4s_4s(TySF);
+ test_fmls_2s_2s_2s(TySF);
+
+ // fmla d_d_d[],s_s_s[] (by element)
+ // fmls d_d_d[],s_s_s[] (by element)
+
+ // fmla 2d_2d_d[],4s_4s_s[],2s_2s_s[]
+ // fmls 2d_2d_d[],4s_4s_s[],2s_2s_s[]
+
+ // fmov 2d,4s,2s #imm (part of the MOVI/MVNI/ORR/BIC imm group)
+ // INCOMPLETE
+ test_fmov_2d_imm_01(TyD);
+ test_fmov_2d_imm_02(TyD);
+ test_fmov_2d_imm_03(TyD);
+ if (0) test_fmov_4s_imm_01(TyS);
+ if (0) test_fmov_4s_imm_02(TyS);
+ if (0) test_fmov_4s_imm_03(TyS);
+ if (0) test_fmov_2s_imm_01(TyS);
+ if (0) test_fmov_2s_imm_02(TyS);
+ if (0) test_fmov_2s_imm_03(TyS);
+
+ // fmov d_d,s_s
+ test_fmov_d_d(TyDF);
+ test_fmov_s_s(TySF);
+
+ // fmov s_w,w_s,d_x,d[1]_x,x_d,x_d[1]
+ test_fmov_s_w(TyS);
+ test_fmov_d_x(TyD);
+ test_fmov_d1_x(TyD);
+ test_fmov_w_s(TyS);
+ test_fmov_x_d(TyD);
+ test_fmov_x_d1(TyD);
+
+ // fmov d,s #imm
+ test_fmov_d_imm_01(TyNONE);
+ test_fmov_d_imm_02(TyNONE);
+ test_fmov_d_imm_03(TyNONE);
+ test_fmov_s_imm_01(TyNONE);
+ test_fmov_s_imm_02(TyNONE);
+ test_fmov_s_imm_03(TyNONE);
+
+ // fmul d_d_d[],s_s_s[]
+ // fmul 2d_2d_d[],4s_4s_s[],2s_2s_s[]
+
+ // fmul 2d,4s,2s
+ // fmul d,s
+ test_fmul_d_d_d(TyDF);
+ test_fmul_s_s_s(TySF);
+ test_fmul_2d_2d_2d(TyDF);
+ test_fmul_4s_4s_4s(TySF);
+ test_fmul_2s_2s_2s(TySF);
+
+ // fmulx d_d_d[],s_s_s[]
+ // fmulx 2d_2d_d[],4s_4s_s[],2s_2s_s[]
+
+ // fmulx d,s
+ // fmulx 2d,4s,2s
+
+ // frecpe d,s (recip estimate)
+ // frecpe 2d,4s,2s
+
+ // frecps d,s (recip step)
+ // frecps 2d,4s,2s
+
+ // frecpx d,s (recip exponent)
+
+ // frinta d,s
+ // frinti d,s
+ // frintm d,s
+ // frintn d,s
+ // frintp d,s
+ // frintx d,s
+ // frintz d,s
+
+ // frinta 2d,4s,2s (round to integral, nearest away)
+ // frinti 2d,4s,2s (round to integral, per FPCR)
+ // frintm 2d,4s,2s (round to integral, minus inf)
+ // frintn 2d,4s,2s (round to integral, nearest, to even)
+ // frintp 2d,4s,2s (round to integral, plus inf)
+ // frintx 2d,4s,2s (round to integral exact, per FPCR)
+ // frintz 2d,4s,2s (round to integral, zero)
+
+ // frsqrte d,s (est)
+ // frsqrte 2d,4s,2s
+
+ // frsqrts d,s (step)
+ // frsqrts 2d,4s,2s
+
+ // ======================== CONV ========================
+
+ // fcvt s_h,d_h,h_s,d_s,h_d,s_d (fp convert, scalar)
+
+ // fcvtl{2} 4s/4h, 4s/8h, 2d/2s, 2d/4s (float convert to longer form)
+
+ // fcvtn{2} 4h/4s, 8h/4s, 2s/2d, 4s/2d (float convert to narrower form)
+ // INCOMPLETE
+ test_fcvtn_2s_2d(TyDF);
+ test_fcvtn_4s_2d(TyDF);
+
+ // fcvtas d,s (fcvt to signed int, nearest, ties away)
+ // fcvtau d,s (fcvt to unsigned int, nearest, ties away)
+ // fcvtas 2d,4s,2s
+ // fcvtau 2d,4s,2s
+ // fcvtas w_s,x_s,w_d,x_d
+ // fcvtau w_s,x_s,w_d,x_d
+
+ // fcvtms d,s (fcvt to signed int, minus inf)
+ // fcvtmu d,s (fcvt to unsigned int, minus inf)
+ // fcvtms 2d,4s,2s
+ // fcvtmu 2d,4s,2s
+ // fcvtms w_s,x_s,w_d,x_d
+ // fcvtmu w_s,x_s,w_d,x_d
+
+ // fcvtns d,s (fcvt to signed int, nearest)
+ // fcvtnu d,s (fcvt to unsigned int, nearest)
+ // fcvtns 2d,4s,2s
+ // fcvtnu 2d,4s,2s
+ // fcvtns w_s,x_s,w_d,x_d
+ // fcvtnu w_s,x_s,w_d,x_d
+
+ // fcvtps d,s (fcvt to signed int, plus inf)
+ // fcvtpu d,s (fcvt to unsigned int, plus inf)
+ // fcvtps 2d,4s,2s
+ // fcvtpu 2d,4s,2s
+ // fcvtps w_s,x_s,w_d,x_d
+ // fcvtpu w_s,x_s,w_d,x_d
+
+ // fcvtzs d,s (fcvt to signed integer, to zero)
+ // fcvtzu d,s (fcvt to unsigned integer, to zero)
+ // fcvtzs 2d,4s,2s
+ // fcvtzu 2d,4s,2s
+ // fcvtzs w_s,x_s,w_d,x_d
+ // fcvtzu w_s,x_s,w_d,x_d
+
+ // fcvtzs d,s (fcvt to signed fixedpt, to zero) (w/ #fbits)
+ // fcvtzu d,s (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
+ // fcvtzs 2d,4s,2s
+ // fcvtzu 2d,4s,2s
+ // fcvtzs w_s,x_s,w_d,x_d (fcvt to signed fixedpt, to zero) (w/ #fbits)
+ // fcvtzu w_s,x_s,w_d,x_d (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
+
+ // fcvtxn s_d (fcvt to lower prec narrow, rounding to odd)
+ // fcvtxn 2s_2d,4s_2d
+
+ // scvtf d,s _#fbits
+ // ucvtf d,s _#fbits
+
+ // scvtf 2d,4s,2s _#fbits
+ // ucvtf 2d,4s,2s _#fbits
+
+ // scvtf d,s
+ // ucvtf d,s
+
+ // scvtf 2d,4s,2s
+ // ucvtf 2d,4s,2s
+
+ // scvtf s_w, d_w, s_x, d_x, _#fbits
+ // ucvtf s_w, d_w, s_x, d_x, _#fbits
+
+ // scvtf s_w, d_w, s_x, d_x
+ // ucvtf s_w, d_w, s_x, d_x
+ test_scvtf_s_w(TyS);
+ test_scvtf_d_w(TyS);
+ test_scvtf_s_x(TyD);
+ test_scvtf_d_x(TyD);
+ test_ucvtf_s_w(TyS);
+ test_ucvtf_d_w(TyS);
+ test_ucvtf_s_x(TyD);
+ test_ucvtf_d_x(TyD);
+
+ // ======================== INT ========================
+
+ // abs d
+ // neg d
+
+ // abs 2d,4s,2s,8h,4h,16b,8b
+ // neg 2d,4s,2s,8h,4h,16b,8b
+ test_neg_2d_2d(TyD);
+ test_neg_4s_4s(TyS);
+ test_neg_2s_2s(TyS);
+ test_neg_8h_8h(TyH);
+ test_neg_4h_4h(TyH);
+ test_neg_16b_16b(TyB);
+ test_neg_8b_8b(TyB);
+
+ // add d
+ // sub d
+ test_add_d_d_d(TyD);
+ test_sub_d_d_d(TyD);
+
+ // add 2d,4s,2s,8h,4h,16b,8b
+ // sub 2d,4s,2s,8h,4h,16b,8b
+ test_add_2d_2d_2d(TyD);
+ test_add_4s_4s_4s(TyS);
+ test_add_2s_2s_2s(TyS);
+ test_add_8h_8h_8h(TyH);
+ test_add_4h_4h_4h(TyH);
+ test_add_16b_16b_16b(TyB);
+ test_add_8b_8b_8b(TyB);
+ test_sub_2d_2d_2d(TyD);
+ test_sub_4s_4s_4s(TyS);
+ test_sub_2s_2s_2s(TyS);
+ test_sub_8h_8h_8h(TyH);
+ test_sub_4h_4h_4h(TyH);
+ test_sub_16b_16b_16b(TyB);
+ test_sub_8b_8b_8b(TyB);
+
+ // addhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+ // subhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+ // raddhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+ // rsubhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+
+ // addp d (add pairs, across)
+ // addp 2d,4s,2s,8h,4h,16b,8b
+ // addv 4s,8h,4h,16b,8b (reduce across vector)
+
+ // and 16b,8b
+ // bic 16b,8b (vector,reg) (bit clear)
+ // orn 16b,8b
+ // orr 16b,8b
+ test_and_16b_16b_16b(TyB);
+ test_and_8b_8b_8b(TyB);
+ test_bic_16b_16b_16b(TyB);
+ test_bic_8b_8b_8b(TyB);
+ test_orr_16b_16b_16b(TyB);
+ test_orr_8b_8b_8b(TyB);
+ test_orn_16b_16b_16b(TyB);
+ test_orn_8b_8b_8b(TyB);
+
+ // orr 8h,4h #imm8, LSL #0 or 8
+ // orr 4s,2s #imm8, LSL #0, 8, 16 or 24
+ // bic 8h,4h #imm8, LSL #0 or 8
+ // bic 4s,2s #imm8, LSL #0, 8, 16 or 24
+ // also movi, mvni
+
+ // bif 16b,8b (vector) (bit insert if false)
+ // bit 16b,8b (vector) (bit insert if true)
+ // bsl 16b,8b (vector) (bit select)
+ // eor 16b,8b (vector)
+ test_bif_16b_16b_16b(TyB);
+ test_bif_8b_8b_8b(TyB);
+ test_bit_16b_16b_16b(TyB);
+ test_bit_8b_8b_8b(TyB);
+ test_bsl_16b_16b_16b(TyB);
+ test_bsl_8b_8b_8b(TyB);
+ test_eor_16b_16b_16b(TyB);
+ test_eor_8b_8b_8b(TyB);
+
+ // cls 4s,2s,8h,4h,16b,8b (count leading sign bits)
+ // clz 4s,2s,8h,4h,16b,8b (count leading zero bits)
+
+ // cmeq d
+ // cmge d
+ // cmgt d
+ // cmhi d
+ // cmhs d
+ // cmtst d
+
+ // cmeq 2d,4s,2s,8h,4h,16b,8b
+ // cmge 2d,4s,2s,8h,4h,16b,8b
+ // cmgt 2d,4s,2s,8h,4h,16b,8b
+ // cmhi 2d,4s,2s,8h,4h,16b,8b
+ // cmhs 2d,4s,2s,8h,4h,16b,8b
+ // cmtst 2d,4s,2s,8h,4h,16b,8b
+ test_cmeq_2d_2d_2d(TyD);
+ test_cmeq_4s_4s_4s(TyS);
+ test_cmeq_2s_2s_2s(TyS);
+ test_cmeq_8h_8h_8h(TyH);
+ test_cmeq_4h_4h_4h(TyH);
+ test_cmeq_16b_16b_16b(TyB);
+ test_cmeq_8b_8b_8b(TyB);
+ test_cmge_2d_2d_2d(TyD);
+ test_cmge_4s_4s_4s(TyS);
+ test_cmge_2s_2s_2s(TyS);
+ test_cmge_8h_8h_8h(TyH);
+ test_cmge_4h_4h_4h(TyH);
+ test_cmge_16b_16b_16b(TyB);
+ test_cmge_8b_8b_8b(TyB);
+ test_cmgt_2d_2d_2d(TyD);
+ test_cmgt_4s_4s_4s(TyS);
+ test_cmgt_2s_2s_2s(TyS);
+ test_cmgt_8h_8h_8h(TyH);
+ test_cmgt_4h_4h_4h(TyH);
+ test_cmgt_16b_16b_16b(TyB);
+ test_cmgt_8b_8b_8b(TyB);
+ test_cmhi_2d_2d_2d(TyD);
+ test_cmhi_4s_4s_4s(TyS);
+ test_cmhi_2s_2s_2s(TyS);
+ test_cmhi_8h_8h_8h(TyH);
+ test_cmhi_4h_4h_4h(TyH);
+ test_cmhi_16b_16b_16b(TyB);
+ test_cmhi_8b_8b_8b(TyB);
+ test_cmhs_2d_2d_2d(TyD);
+ test_cmhs_4s_4s_4s(TyS);
+ test_cmhs_2s_2s_2s(TyS);
+ test_cmhs_8h_8h_8h(TyH);
+ test_cmhs_4h_4h_4h(TyH);
+ test_cmhs_16b_16b_16b(TyB);
+ test_cmhs_8b_8b_8b(TyB);
+ test_cmtst_2d_2d_2d(TyD);
+ test_cmtst_4s_4s_4s(TyS);
+ test_cmtst_2s_2s_2s(TyS);
+ test_cmtst_8h_8h_8h(TyH);
+ test_cmtst_4h_4h_4h(TyH);
+ test_cmtst_16b_16b_16b(TyB);
+ test_cmtst_8b_8b_8b(TyB);
+
+ // cmeq_z d
+ // cmge_z d
+ // cmgt_z d
+ // cmle_z d
+ // cmlt_z d
+
+ // cmeq_z 2d,4s,2s,8h,4h,16b,8b
+ // cmge_z 2d,4s,2s,8h,4h,16b,8b
+ // cmgt_z 2d,4s,2s,8h,4h,16b,8b
+ // cmle_z 2d,4s,2s,8h,4h,16b,8b
+ // cmlt_z 2d,4s,2s,8h,4h,16b,8b
+ test_cmeq_zero_2d_2d(TyD);
+ test_cmeq_zero_4s_4s(TyS);
+ test_cmeq_zero_2s_2s(TyS);
+ test_cmeq_zero_8h_8h(TyH);
+ test_cmeq_zero_4h_4h(TyH);
+ test_cmeq_zero_16b_16b(TyB);
+ test_cmeq_zero_8b_8b(TyB);
+ test_cmge_zero_2d_2d(TyD);
+ test_cmge_zero_4s_4s(TyS);
+ test_cmge_zero_2s_2s(TyS);
+ test_cmge_zero_8h_8h(TyH);
+ test_cmge_zero_4h_4h(TyH);
+ test_cmge_zero_16b_16b(TyB);
+ test_cmge_zero_8b_8b(TyB);
+ test_cmgt_zero_2d_2d(TyD);
+ test_cmgt_zero_4s_4s(TyS);
+ test_cmgt_zero_2s_2s(TyS);
+ test_cmgt_zero_8h_8h(TyH);
+ test_cmgt_zero_4h_4h(TyH);
+ test_cmgt_zero_16b_16b(TyB);
+ test_cmgt_zero_8b_8b(TyB);
+ test_cmle_zero_2d_2d(TyD);
+ test_cmle_zero_4s_4s(TyS);
+ test_cmle_zero_2s_2s(TyS);
+ test_cmle_zero_8h_8h(TyH);
+ test_cmle_zero_4h_4h(TyH);
+ test_cmle_zero_16b_16b(TyB);
+ test_cmle_zero_8b_8b(TyB);
+ test_cmlt_zero_2d_2d(TyD);
+ test_cmlt_zero_4s_4s(TyS);
+ test_cmlt_zero_2s_2s(TyS);
+ test_cmlt_zero_8h_8h(TyH);
+ test_cmlt_zero_4h_4h(TyH);
+ test_cmlt_zero_16b_16b(TyB);
+ test_cmlt_zero_8b_8b(TyB);
+
+ // cnt 16b,8b (population count per byte)
+
+ // dup d,s,h,b (vec elem to scalar)
+ // dup 2d,4s,2s,8h,4h,16b,8b (vec elem to vector)
+ // dup 2d,4s,2s,8h,4h,16b,8b (general reg to vector)
+
+ // ext 16b,8b,#imm4 (concat 2 vectors, then slice)
+
+ // ins d[]_d[],s[]_s[],h[]_h[],b[]_b[]
+
+ // ins d[]_x, s[]_w, h[]_w, b[]_w
+ test_INS_general();
+
+ // mla 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
+ // mls 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
+
+ // mla 4s,2s,8h,4h,16b,8b
+ // mls 4s,2s,8h,4h,16b,8b
+ test_mla_4s_4s_4s(TyS);
+ test_mla_2s_2s_2s(TyS);
+ test_mla_8h_8h_8h(TyH);
+ test_mla_4h_4h_4h(TyH);
+ test_mla_16b_16b_16b(TyB);
+ test_mla_8b_8b_8b(TyB);
+ test_mls_4s_4s_4s(TyS);
+ test_mls_2s_2s_2s(TyS);
+ test_mls_8h_8h_8h(TyH);
+ test_mls_4h_4h_4h(TyH);
+ test_mls_16b_16b_16b(TyB);
+ test_mls_8b_8b_8b(TyB);
+
+ // movi 16b,8b #imm8, LSL #0
+ // movi 8h,4h #imm8, LSL #0 or 8
+ // movi 4s,2s #imm8, LSL #0, 8, 16, 24
+ // movi 4s,2s #imm8, MSL #8 or 16
+ // movi d, #imm64
+ // movi 2d, #imm64
+
+ // mul 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
+
+ // mul 4s,2s,8h,4h,16b,8b
+ test_mul_4s_4s_4s(TyS);
+ test_mul_2s_2s_2s(TyS);
+ test_mul_8h_8h_8h(TyH);
+ test_mul_4h_4h_4h(TyH);
+ test_mul_16b_16b_16b(TyB);
+ test_mul_8b_8b_8b(TyB);
+
+ // mvni 8h,4h #imm8, LSL #0 or 8
+ // mvni 4s,2s #imm8, LSL #0, 8, 16, 24
+ // mvni 4s,2s #imm8, MSL #8 or 16
+
+ // not 16b,8b
+
+ // pmul 16b,8b
+
+ // pmull{2} 8h_8b_8b,8h_16b_16b,1q_1d_1d,1d_2d_2d
+
+ // rbit 16b,8b
+ // rev16 16b,8b
+ // rev32 16b,8b,8h,4h
+ // rev64 16b,8b,8h,4h,4s,2s
+
+ // saba 16b,8b,8h,4h,4s,2s
+ // uaba 16b,8b,8h,4h,4s,2s
+
+ // sabal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ // uabal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+ // sabd 16b,8b,8h,4h,4s,2s
+ // uabd 16b,8b,8h,4h,4s,2s
+
+ // sabdl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ // uabdl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+ // sadalp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+ // uadalp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+
+ // saddl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ // uaddl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ // ssubl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ // usubl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+ // saddlp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+ // uaddlp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+
+ // saddlv h_16b/8b, s_8h/4h, d_4s
+ // uaddlv h_16b/8b, s_8h/4h, d_4s
+
+ // saddw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+ // uaddw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+ // ssubw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+ // usubw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+
+ // shadd 16b,8b,8h,4h,4s,2s
+ // uhadd 16b,8b,8h,4h,4s,2s
+ // shsub 16b,8b,8h,4h,4s,2s
+ // uhsub 16b,8b,8h,4h,4s,2s
+
+ // shll{2} 8h_8b/16b_#8, 4s_4h/8h_#16, 2d_2s/4s_#32
+
+ // shrn{2} 2s/4s_2d, 8h/4h_4s, 2s/4s_2d, #imm in 1 .. elem_bits
+ // rshrn{2} 2s/4s_2d, 8h/4h_4s, 2s/4s_2d, #imm in 1 .. elem_bits
+
+ // sli d_#imm
+ // sri d_#imm
+
+ // sli 2d,4s,2s,8h,4h,16b,8b _#imm
+ // sri 2d,4s,2s,8h,4h,16b,8b _#imm
+
+ // smax 4s,2s,8h,4h,16b,8b
+ // umax 4s,2s,8h,4h,16b,8b
+ // smin 4s,2s,8h,4h,16b,8b
+ // umin 4s,2s,8h,4h,16b,8b
+ test_smax_4s_4s_4s(TyS);
+ test_smax_2s_2s_2s(TyS);
+ test_smax_8h_8h_8h(TyH);
+ test_smax_4h_4h_4h(TyH);
+ test_smax_16b_16b_16b(TyB);
+ test_smax_8b_8b_8b(TyB);
+ test_umax_4s_4s_4s(TyS);
+ test_umax_2s_2s_2s(TyS);
+ test_umax_8h_8h_8h(TyH);
+ test_umax_4h_4h_4h(TyH);
+ test_umax_16b_16b_16b(TyB);
+ test_umax_8b_8b_8b(TyB);
+ test_smin_4s_4s_4s(TyS);
+ test_smin_2s_2s_2s(TyS);
+ test_smin_8h_8h_8h(TyH);
+ test_smin_4h_4h_4h(TyH);
+ test_smin_16b_16b_16b(TyB);
+ test_smin_8b_8b_8b(TyB);
+ test_umin_4s_4s_4s(TyS);
+ test_umin_2s_2s_2s(TyS);
+ test_umin_8h_8h_8h(TyH);
+ test_umin_4h_4h_4h(TyH);
+ test_umin_16b_16b_16b(TyB);
+ test_umin_8b_8b_8b(TyB);
+
+ // smaxp 4s,2s,8h,4h,16b,8b
+ // umaxp 4s,2s,8h,4h,16b,8b
+ // sminp 4s,2s,8h,4h,16b,8b
+ // uminp 4s,2s,8h,4h,16b,8b
+
+ // smaxv s_4s,h_8h,h_4h,b_16b,b_8b
+ // umaxv s_4s,h_8h,h_4h,b_16b,b_8b
+ // sminv s_4s,h_8h,h_4h,b_16b,b_8b
+ // uminv s_4s,h_8h,h_4h,b_16b,b_8b
+ test_SMAXV();
test_UMAXV();
test_SMINV();
- test_SMAXV();
- printf("END: {S,U}{MIN,MAX}V\n\n");
+ test_UMINV();
- printf("BEGIN: {AND,BIC,ORR,ORN} (vector)\n");
- test_and_16b();
- test_and_8b();
- test_bic_16b();
- test_bic_8b();
- test_orr_16b();
- test_orr_8b();
- test_orn_16b();
- test_orn_8b();
- printf("END: {AND,BIC,ORR,ORN} (vector)\n\n");
+ // smlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ // umlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ // smlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ // umlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ // smull{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ // umull{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
- printf("BEGIN: CM{EQ,HI,HS,GE,GT,TST,LE,LT} (vector)\n\n");
- test_cmeq_2d();
- test_cmeq_4s();
- test_cmeq_2s();
- test_cmeq_8h();
- test_cmeq_4h();
- test_cmeq_16b();
- test_cmeq_8b();
- test_cmtst_2d();
- test_cmtst_4s();
- test_cmtst_2s();
- test_cmtst_8h();
- test_cmtst_4h();
- test_cmtst_16b();
- test_cmtst_8b();
- test_cmhi_2d();
- test_cmhi_4s();
- test_cmhi_2s();
- test_cmhi_8h();
- test_cmhi_4h();
- test_cmhi_16b();
- test_cmhi_8b();
- test_cmgt_2d();
- test_cmgt_4s();
- test_cmgt_2s();
- test_cmgt_8h();
- test_cmgt_4h();
- test_cmgt_16b();
- test_cmgt_8b();
- test_cmhs_2d();
- test_cmhs_4s();
- test_cmhs_2s();
- test_cmhs_8h();
- test_cmhs_4h();
- test_cmhs_16b();
- test_cmhs_8b();
- test_cmge_2d();
- test_cmge_4s();
- test_cmge_2s();
- test_cmge_8h();
- test_cmge_4h();
- test_cmge_16b();
- test_cmge_8b();
- test_cmge_zero_2d();
- test_cmge_zero_4s();
- test_cmge_zero_2s();
- test_cmge_zero_8h();
- test_cmge_zero_4h();
- test_cmge_zero_16b();
- test_cmge_zero_8b();
- test_cmgt_zero_2d();
- test_cmgt_zero_4s();
- test_cmgt_zero_2s();
- test_cmgt_zero_8h();
- test_cmgt_zero_4h();
- test_cmgt_zero_16b();
- test_cmgt_zero_8b();
- test_cmle_zero_2d();
- test_cmle_zero_4s();
- test_cmle_zero_2s();
- test_cmle_zero_8h();
- test_cmle_zero_4h();
- test_cmle_zero_16b();
- test_cmle_zero_8b();
- test_cmeq_zero_2d();
- test_cmeq_zero_4s();
- test_cmeq_zero_2s();
- test_cmeq_zero_8h();
- test_cmeq_zero_4h();
- test_cmeq_zero_16b();
- test_cmeq_zero_8b();
- test_cmlt_zero_2d();
- test_cmlt_zero_4s();
- test_cmlt_zero_2s();
- test_cmlt_zero_8h();
- test_cmlt_zero_4h();
- test_cmlt_zero_16b();
- test_cmlt_zero_8b();
- printf("END: CM{EQ,HI,HS,GE,GT,TST,LE,LT} (vector)\n\n");
+ // smlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ // umlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ // smlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ // umlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ // smull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ // umull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
- printf("BEGIN: {EOR,BSL,BIT,BIF} (vector)\n");
- test_eor_16b();
- test_eor_8b();
- test_bsl_16b();
- test_bsl_8b();
- test_bit_16b();
- test_bit_8b();
- test_bif_16b();
- test_bif_8b();
- printf("END: {EOR,BSL,BIT,BIF} (vector)\n\n");
+ // smov w_b[], w_h[], x_b[], x_h[], x_s[]
+ // umov w_b[], w_h[], x_b[], x_h[], x_s[]
+ // INCOMPLETE
+ test_umov_x_d0(TyD);
+ test_umov_x_d1(TyD);
+ test_umov_w_s0(TyS);
+ test_umov_w_s3(TyS);
+ test_umov_w_h0(TyH);
+ test_umov_w_h7(TyH);
+ test_umov_w_b0(TyB);
+ test_umov_w_b15(TyB);
+ test_smov_x_s0(TyS);
+ test_smov_x_s3(TyS);
+ test_smov_x_h0(TyH);
+ test_smov_x_h7(TyH);
+ test_smov_w_h0(TyH);
+ test_smov_w_h7(TyH);
+ test_smov_x_b0(TyB);
+ test_smov_x_b15(TyB);
+ test_smov_w_b0(TyB);
+ test_smov_w_b15(TyB);
- printf("BEGIN: {USHR,SSHR,SHL} (vector, immediate)\n");
- test_ushr_2d_2d_1();
- test_ushr_2d_2d_13();
- test_ushr_2d_2d_63();
- test_sshr_2d_2d_1();
- test_sshr_2d_2d_13();
- test_sshr_2d_2d_63();
- test_shl_2d_2d_1();
- test_shl_2d_2d_13();
- test_shl_2d_2d_63();
+ // sqabs d,s,h,b
+ // sqneg d,s,h,b
- test_ushr_4s_4s_1();
- test_ushr_4s_4s_13();
- test_ushr_4s_4s_31();
- test_sshr_4s_4s_1();
- test_sshr_4s_4s_13();
- test_sshr_4s_4s_31();
- test_shl_4s_4s_1();
- test_shl_4s_4s_13();
- test_shl_4s_4s_31();
+ // sqabs 2d,4s,2s,8h,4h,16b,8b
+ // sqneg 2d,4s,2s,8h,4h,16b,8b
- test_ushr_2s_2s_1();
- test_ushr_2s_2s_13();
- test_ushr_2s_2s_31();
- test_sshr_2s_2s_1();
- test_sshr_2s_2s_13();
- test_sshr_2s_2s_31();
- test_shl_2s_2s_1();
- test_shl_2s_2s_13();
- test_shl_2s_2s_31();
+ // sqadd d,s,h,b
+ // uqadd d,s,h,b
+ // sqsub d,s,h,b
+ // uqsub d,s,h,b
- test_ushr_8h_8h_1();
- test_ushr_8h_8h_13();
- test_ushr_8h_8h_15();
- test_sshr_8h_8h_1();
- test_sshr_8h_8h_13();
- test_sshr_8h_8h_15();
- test_shl_8h_8h_1();
- test_shl_8h_8h_13();
- test_shl_8h_8h_15();
+ // sqadd 2d,4s,2s,8h,4h,16b,8b
+ // uqadd 2d,4s,2s,8h,4h,16b,8b
+ // sqsub 2d,4s,2s,8h,4h,16b,8b
+ // uqsub 2d,4s,2s,8h,4h,16b,8b
- test_ushr_4h_4h_1();
- test_ushr_4h_4h_13();
- test_ushr_4h_4h_15();
- test_sshr_4h_4h_1();
- test_sshr_4h_4h_13();
- test_sshr_4h_4h_15();
- test_shl_4h_4h_1();
- test_shl_4h_4h_13();
- test_shl_4h_4h_15();
+ // sqdmlal d_s_s[], s_h_h[]
+ // sqdmlsl d_s_s[], s_h_h[]
+ // sqdmull d_s_s[], s_h_h[]
- test_ushr_16b_16b_1();
- test_ushr_16b_16b_7();
- test_sshr_16b_16b_1();
- test_sshr_16b_16b_7();
- test_shl_16b_16b_1();
- test_shl_16b_16b_7();
+ // sqdmlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ // sqdmlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ // sqdmull{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
- test_ushr_8b_8b_1();
- test_ushr_8b_8b_7();
- test_sshr_8b_8b_1();
- test_sshr_8b_8b_7();
- test_shl_8b_8b_1();
- test_shl_8b_8b_7();
- printf("END: {USHR,SSHR,SHL} (vector, immediate)\n\n");
+ // sqdmlal d_s_s, s_h_h
+ // sqdmlsl d_s_s, s_h_h
+ // sqdmull d_s_s, s_h_h
- printf("BEGIN: {U,S}SHLL{,2}\n");
- test_ushll_2d_2s_0();
- test_ushll_2d_2s_15();
- test_ushll_2d_2s_31();
- test_ushll2_2d_4s_0();
- test_ushll2_2d_4s_15();
- test_ushll2_2d_4s_31();
- test_sshll_2d_2s_0();
- test_sshll_2d_2s_15();
- test_sshll_2d_2s_31();
- test_sshll2_2d_4s_0();
- test_sshll2_2d_4s_15();
- test_sshll2_2d_4s_31();
- printf("END: {U,S}SHLL{,2} (MISSING h_b and s_h versions)\n\n");
+ // sqdmlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
+ // sqdmlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
+ // sqdmull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
- printf("BEGIN: XTN{,2}\n");
- test_xtn_2s_2d();
- test_xtn2_4s_2d();
- test_xtn_4h_4s();
- test_xtn2_8h_4s();
- test_xtn_8b_8h();
- test_xtn2_16b_8h();
- printf("END: XTN{,2}\n\n");
+ // sqdmulh s_s_s[], h_h_h[]
+ // sqrdmulh s_s_s[], h_h_h[]
- printf("DUP (element, vector) COMPLETELY MISSING\n\n");
+ // sqdmulh 4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
+ // sqrdmulh 4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
- printf("DUP (general, vector) COMPLETELY MISSING\n\n");
+ // sqdmulh h,s
+ // sqrdmulh h,s
- printf("BEGIN: {S,U}MOV\n");
- test_umov_01();
- test_umov_02();
- test_umov_03();
- test_umov_04();
- test_umov_05();
- test_umov_06();
- test_umov_07();
- test_umov_08();
- test_smov_01();
- test_smov_02();
- test_smov_03();
- test_smov_04();
- test_smov_05();
- test_smov_06();
- test_smov_07();
- test_smov_08();
- test_smov_09();
- test_smov_10();
- printf("END: {S,U}MOV\n\n");
+ // sqdmulh 4s,2s,8h,4h
+ // sqrdmulh 4s,2s,8h,4h
- printf("BEGIN: INS (general)\n");
- test_INS_general();
- printf("END: INS (general)\n\n");
+ // sqshl d,s,h,b
+ // uqshl d,s,h,b
+ // sqrshl d,s,h,b
+ // uqrshl d,s,h,b
- printf("BEGIN: NEG (vector)\n");
- test_neg_2d_2d();
- test_neg_4s_4s();
- test_neg_2s_2s();
- test_neg_8h_8h();
- test_neg_4h_4h();
- test_neg_16b_16b();
- test_neg_8b_8b();
- printf("END: NEG (vector)\n\n");
+ // sqshl 2d,4s,2s,8h,4h,16b,8b
+ // uqshl 2d,4s,2s,8h,4h,16b,8b
+ // sqrshl 2d,4s,2s,8h,4h,16b,8b
+ // uqrshl 2d,4s,2s,8h,4h,16b,8b
- printf("BEGIN: TBL, TBX\n");
- test_tbl_16b_1reg();
- test_tbl_16b_2reg();
- test_tbl_16b_3reg();
- test_tbl_16b_4reg();
- test_tbl_8b_1reg();
- test_tbl_8b_2reg();
- test_tbl_8b_3reg();
- test_tbl_8b_4reg();
- test_tbx_16b_1reg();
- test_tbx_16b_2reg();
- test_tbx_16b_3reg();
- test_tbx_16b_4reg();
- test_tbx_8b_1reg();
- test_tbx_8b_2reg();
- test_tbx_8b_3reg();
- test_tbx_8b_4reg();
- printf("END: TBL, TBX\n");
+ // sqrshrn s_d, h_s, b_h #imm
+ // uqrshrn s_d, h_s, b_h #imm
+ // sqshrn s_d, h_s, b_h #imm
+ // uqshrn s_d, h_s, b_h #imm
+
+ // sqrshrun s_d, h_s, b_h #imm
+ // sqshrun s_d, h_s, b_h #imm
+
+ // sqrshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+ // uqrshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+ // sqshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+ // uqshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+
+ // sqrshrun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+ // sqshrun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+
+ // sqshl d,s,h,b _#imm
+ // uqshl d,s,h,b _#imm
+ // sqshlu d,s,h,b _#imm
+
+ // sqshl 2d,4s,2s,8h,4h,16b,8b _#imm
+ // uqshl 2d,4s,2s,8h,4h,16b,8b _#imm
+ // sqshlu 2d,4s,2s,8h,4h,16b,8b _#imm
+
+ // sqxtn s_d,h_s,b_h
+ // uqxtn s_d,h_s,b_h
+ // sqxtun s_d,h_s,b_h
+
+ // sqxtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+ // uqxtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+ // sqxtun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+
+ // srhadd 4s,2s,8h,4h,16b,8b
+ // urhadd 4s,2s,8h,4h,16b,8b
+
+ // sshl (reg) d
+ // ushl (reg) d
+ // sshl (reg) 2d,4s,2s,8h,4h,16b,8b
+ // ushl (reg) 2d,4s,2s,8h,4h,16b,8b
+
+ // shl (imm) d
+ // sshr (imm) d
+ // ushr (imm) d
+
+ // shl (imm) 16b,8b,8h,4h,4s,2s,2d
+ // sshr (imm) 2d,4s,2s,8h,4h,16b,8b
+ // ushr (imm) 2d,4s,2s,8h,4h,16b,8b
+ test_shl_2d_2d_1(TyD);
+ test_shl_2d_2d_13(TyD);
+ test_shl_2d_2d_63(TyD);
+ test_shl_4s_4s_1(TyS);
+ test_shl_4s_4s_13(TyS);
+ test_shl_4s_4s_31(TyS);
+ test_shl_2s_2s_1(TyS);
+ test_shl_2s_2s_13(TyS);
+ test_shl_2s_2s_31(TyS);
+ test_shl_8h_8h_1(TyH);
+ test_shl_8h_8h_13(TyH);
+ test_shl_8h_8h_15(TyH);
+ test_shl_4h_4h_1(TyH);
+ test_shl_4h_4h_13(TyH);
+ test_shl_4h_4h_15(TyH);
+ test_shl_16b_16b_1(TyB);
+ test_shl_16b_16b_7(TyB);
+ test_shl_8b_8b_1(TyB);
+ test_shl_8b_8b_7(TyB);
+ test_sshr_2d_2d_1(TyD);
+ test_sshr_2d_2d_13(TyD);
+ test_sshr_2d_2d_63(TyD);
+ test_sshr_4s_4s_1(TyS);
+ test_sshr_4s_4s_13(TyS);
+ test_sshr_4s_4s_31(TyS);
+ test_sshr_2s_2s_1(TyS);
+ test_sshr_2s_2s_13(TyS);
+ test_sshr_2s_2s_31(TyS);
+ test_sshr_8h_8h_1(TyH);
+ test_sshr_8h_8h_13(TyH);
+ test_sshr_8h_8h_15(TyH);
+ test_sshr_4h_4h_1(TyH);
+ test_sshr_4h_4h_13(TyH);
+ test_sshr_4h_4h_15(TyH);
+ test_sshr_16b_16b_1(TyB);
+ test_sshr_16b_16b_7(TyB);
+ test_sshr_8b_8b_1(TyB);
+ test_sshr_8b_8b_7(TyB);
+ test_ushr_2d_2d_1(TyD);
+ test_ushr_2d_2d_13(TyD);
+ test_ushr_2d_2d_63(TyD);
+ test_ushr_4s_4s_1(TyS);
+ test_ushr_4s_4s_13(TyS);
+ test_ushr_4s_4s_31(TyS);
+ test_ushr_2s_2s_1(TyS);
+ test_ushr_2s_2s_13(TyS);
+ test_ushr_2s_2s_31(TyS);
+ test_ushr_8h_8h_1(TyH);
+ test_ushr_8h_8h_13(TyH);
+ test_ushr_8h_8h_15(TyH);
+ test_ushr_4h_4h_1(TyH);
+ test_ushr_4h_4h_13(TyH);
+ test_ushr_4h_4h_15(TyH);
+ test_ushr_16b_16b_1(TyB);
+ test_ushr_16b_16b_7(TyB);
+ test_ushr_8b_8b_1(TyB);
+ test_ushr_8b_8b_7(TyB);
+
+ // ssra (imm) d
+ // usra (imm) d
+ // ssra (imm) 2d,4s,2s,8h,4h,16b,8b
+ // usra (imm) 2d,4s,2s,8h,4h,16b,8b
+
+ // srshl (reg) d
+ // urshl (reg) d
+ // srshl (reg) 2d,4s,2s,8h,4h,16b,8b
+ // urshl (reg) 2d,4s,2s,8h,4h,16b,8b
+
+ // srshr (imm) d
+ // urshr (imm) d
+ // srshr (imm) 2d,4s,2s,8h,4h,16b,8b
+ // urshr (imm) 2d,4s,2s,8h,4h,16b,8b
+
+ // srsra (imm) d
+ // ursra (imm) d
+ // srsra (imm) 2d,4s,2s,8h,4h,16b,8b
+ // ursra (imm) 2d,4s,2s,8h,4h,16b,8b
+
+ // sshll{2} (imm) 2d_2s/4s, 4s_4h/8h, 8h_8b/16b
+ // ushll{2} (imm) 2d_2s/4s, 4s_4h/8h, 8h_8b/16b
+ // INCOMPLETE
+ test_sshll_2d_2s_0(TyS);
+ test_sshll_2d_2s_15(TyS);
+ test_sshll_2d_2s_31(TyS);
+ test_sshll2_2d_4s_0(TyS);
+ test_sshll2_2d_4s_15(TyS);
+ test_sshll2_2d_4s_31(TyS);
+ test_ushll_2d_2s_0(TyS);
+ test_ushll_2d_2s_15(TyS);
+ test_ushll_2d_2s_31(TyS);
+ test_ushll2_2d_4s_0(TyS);
+ test_ushll2_2d_4s_15(TyS);
+ test_ushll2_2d_4s_31(TyS);
+
+ // suqadd d,s,h,b
+ // suqadd 2d,4s,2s,8h,4h,16b,8b
+
+ // tbl 8b_{16b}_8b, 16b_{16b}_16b
+ // tbl 8b_{16b,16b}_8b, 16b_{16b,16b}_16b
+ // tbl 8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
+ // tbl 8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
+ test_tbl_16b_1reg(TyB);
+ test_tbl_16b_2reg(TyB);
+ test_tbl_16b_3reg(TyB);
+ test_tbl_16b_4reg(TyB);
+ test_tbl_8b_1reg(TyB);
+ test_tbl_8b_2reg(TyB);
+ test_tbl_8b_3reg(TyB);
+ test_tbl_8b_4reg(TyB);
+
+ // tbx 8b_{16b}_8b, 16b_{16b}_16b
+ // tbx 8b_{16b,16b}_8b, 16b_{16b,16b}_16b
+ // tbx 8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
+ // tbx 8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
+ test_tbx_16b_1reg(TyB);
+ test_tbx_16b_2reg(TyB);
+ test_tbx_16b_3reg(TyB);
+ test_tbx_16b_4reg(TyB);
+ test_tbx_8b_1reg(TyB);
+ test_tbx_8b_2reg(TyB);
+ test_tbx_8b_3reg(TyB);
+ test_tbx_8b_4reg(TyB);
+
+ // trn1 2d,4s,2s,8h,4h,16b,8b
+ // trn2 2d,4s,2s,8h,4h,16b,8b
+
+ // urecpe 4s,2s
+
+ // ursqrte 4s,2s
+
+ // usqadd d,s,h,b
+ // usqadd 2d,4s,2s,8h,4h,16b,8b
+
+ // uzp1 2d,4s,2s,8h,4h,16b,8b
+ // uzp2 2d,4s,2s,8h,4h,16b,8b
+
+ // xtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+ test_xtn_2s_2d(TyD);
+ test_xtn2_4s_2d(TyD);
+ test_xtn_4h_4s(TyS);
+ test_xtn2_8h_4s(TyS);
+ test_xtn_8b_8h(TyH);
+ test_xtn2_16b_8h(TyH);
+
+ // zip1 2d,4s,2s,8h,4h,16b,8b
+ // zip2 2d,4s,2s,8h,4h,16b,8b
+
+ // ======================== MEM ========================
+
+ // ld1 (multiple 1-element structures to 1/2/3/4 regs)
+ // ld1 (single 1-element structure to one lane of 1 reg)
+ // ld1r (single 1-element structure and rep to all lanes of 1 reg)
+
+ // ld2 (multiple 2-element structures to 2 regs)
+ // ld2 (single 2-element structure to one lane of 2 regs)
+ // ld2r (single 2-element structure and rep to all lanes of 2 regs)
+
+ // ld3 (multiple 3-element structures to 3 regs)
+ // ld3 (single 3-element structure to one lane of 3 regs)
+ // ld3r (single 3-element structure and rep to all lanes of 3 regs)
+
+ // ld4 (multiple 4-element structures to 4 regs)
+ // ld4 (single 4-element structure to one lane of 4 regs)
+ // ld4r (single 4-element structure and rep to all lanes of 4 regs)
+
+ // ldnp q_q_addr,d_d_addr,s_s_addr (load pair w/ non-temporal hint)
+ // addr = reg + uimm7 * reg_size
+
+ // ldp q_q_addr,d_d_addr,s_s_addr (load pair)
+ // addr = [Xn|SP],#imm or [Xn|SP,#imm]! or [Xn|SP,#imm]
+
+ // ldr q,d,s,h,b from addr
+ // addr = [Xn|SP],#imm or [Xn|SP,#imm]! or [Xn|SP,#imm]
+
+ // ldr q,d,s from pc+#imm19
+
+ // ldr q,d,s,h,b from addr
+ // addr = [Xn|SP, R <extend> <shift>]
+
+ // ldur q,d,s,h,b from addr
+ // addr = [Xn|SP,#imm] (unscaled offset)
+
+ // st1 (multiple 1-element structures from 1/2/3/4 regs)
+ // st1 (single 1-element structure for 1 lane of 1 reg)
+
+ // st2 (multiple 2-element structures from 2 regs)
+ // st2 (single 2-element structure from 1 lane of 2 regs)
+
+ // st3 (multiple 3-element structures from 3 regs)
+ // st3 (single 3-element structure from 1 lane of 3 regs)
+
+ // st4 (multiple 4-element structures from 4 regs)
+ // st4 (single 4-element structure from one lane of 4 regs)
+
+ // stnp q_q_addr, d_d_addr, s_s_addr
+ // addr = [Xn|SP, #imm]
+
+ // stp q_q_addr, d_d_addr, s_s_addr
+ // addr = [Xn|SP], #imm or [Xn|SP, #imm]! or [Xn|SP, #imm]
+
+ // str q,d,s,h,b_addr
+ // addr = [Xn|SP], #simm or [Xn|SP, #simm]! or [Xn|SP, #pimm]
+
+ // str q,d,s,h,b_addr
+ // addr = [Xn|SP, R <extend> <shift>]
+
+ // stur q,d,s,h,b_addr
+ // addr = [Xn|SP,#imm] (unscaled offset)
+
+ // ======================== CRYPTO ========================
+
+ // aesd 16b (aes single round decryption)
+ // aese 16b (aes single round encryption)
+ // aesimc 16b (aes inverse mix columns)
+ // aesmc 16b (aes mix columns)
+
+ // sha1c q_s_4s
+ // sha1h s_s
+ // sha1m q_s_4s
+ // sha1p q_s_4s
+ // sha1su0 4s_4s_4s
+ // sha1su1 4s_4s
+
+ // sha256h2 q_q_4s
+ // sha256h q_q_4s
+ // sha256su0 4s_4s
+ // sha256su1 4s_4s_4s
return 0;
}
+
+/* ---------------------------------------------------------------- */
+/* -- Alphabetical list of insns -- */
+/* ---------------------------------------------------------------- */
/*
abs d
abs 2d,4s,2s,8h,4h,16b,8b
@@ -2501,3 +3088,660 @@
zip1 2d,4s,2s,8h,4h,16b,8b
zip2 2d,4s,2s,8h,4h,16b,8b
*/
+
+
+/* ---------------------------------------------------------------- */
+/* -- List of insns, grouped somewhat by laneage configuration -- */
+/* ---------------------------------------------------------------- */
+/*
+ ======================== FP ========================
+
+ fabs d,s
+ fabs 2d,4s,2s
+
+ fneg d,s
+ fneg 2d,4s,2s
+
+ fsqrt d,s
+ fsqrt 2d,4s,2s
+
+ fadd d,s
+ fsub d,s
+
+ fadd 2d,4s,2s
+ fsub 2d,4s,2s
+
+ fabd d,s
+ fabd 2d,4s,2s
+
+ faddp d,s (floating add pair)
+ faddp 2d,4s,2s
+
+ fccmp d,s (floating point conditional quiet compare)
+ fccmpe d,s (floating point conditional signaling compare)
+
+ fcmeq d,s
+ fcmge d,s
+ fcmgt d,s
+ facgt d,s (floating abs compare GT)
+ facge d,s (floating abs compare GE)
+
+ fcmeq 2d,4s,2s
+ fcmge 2d,4s,2s
+ fcmgt 2d,4s,2s
+ facge 2d,4s,2s
+ facgt 2d,4s,2s
+
+ fcmeq_z d,s
+ fcmge_z d,s
+ fcmgt_z d,s
+ fcmle_z d,s
+ fcmlt_z d,s
+
+ fcmeq_z 2d,4s,2s
+ fcmge_z 2d,4s,2s
+ fcmgt_z 2d,4s,2s
+ fcmle_z 2d,4s,2s
+ fcmlt_z 2d,4s,2s
+
+ fcmp_z d,s
+ fcmpe_z d,s
+ fcmp d,s (floating point quiet, set flags)
+ fcmpe d,s (floating point signaling, set flags)
+
+ fcsel d,s (fp cond select)
+
+ fdiv d,s
+ fdiv 2d,4s,2s
+
+ fmadd d,s
+ fnmadd d,s
+ fmsub d,s
+ fnmsub d,s
+
+ fnmul d,s
+
+ fmax d,s
+ fmin d,s
+ fmaxnm d,s ("max number")
+ fminnm d,s
+
+ fmax 2d,4s,2s
+ fmin 2d,4s,2s
+ fmaxnm 2d,4s,2s
+ fminnm 2d,4s,2s
+
+ fmaxnmp d_2d,s_2s ("max number pairwise")
+ fminnmp d_2d,s_2s
+
+ fmaxnmp 2d,4s,2s
+ fminnmp 2d,4s,2s
+
+ fmaxnmv s_4s (maxnum across vector)
+ fminnmv s_4s
+
+ fmaxp d_2d,s_2s (max of a pair)
+ fminp d_2d,s_2s (min of a pair)
+
+ fmaxp 2d,4s,2s (max pairwise)
+ fminp 2d,4s,2s
+
+ fmaxv s_4s (max across vector)
+ fminv s_4s
+
+ fmla 2d,4s,2s
+ fmls 2d,4s,2s
+
+ fmla d_d_d[],s_s_s[] (by element)
+ fmls d_d_d[],s_s_s[] (by element)
+
+ fmla 2d_2d_d[],4s_4s_s[],2s_2s_s[]
+ fmls 2d_2d_d[],4s_4s_s[],2s_2s_s[]
+
+ fmov 2d,4s,2s #imm (part of the MOVI/MVNI/ORR/BIC imm group)
+
+ fmov d_d,s_s
+
+ fmov s_w,w_s,d_x,d[1]_x,x_d,x_d[1]
+
+ fmov d,s #imm
+
+ fmul d_d_d[],s_s_s[]
+ fmul 2d_2d_d[],4s_4s_s[],2s_2s_s[]
+
+ fmul 2d,4s,2s
+ fmul d,s
+
+ fmulx d_d_d[],s_s_s[]
+ fmulx 2d_2d_d[],4s_4s_s[],2s_2s_s[]
+
+ fmulx d,s
+ fmulx 2d,4s,2s
+
+ frecpe d,s (recip estimate)
+ frecpe 2d,4s,2s
+
+ frecps d,s (recip step)
+ frecps 2d,4s,2s
+
+ frecpx d,s (recip exponent)
+
+ frinta d,s
+ frinti d,s
+ frintm d,s
+ frintn d,s
+ frintp d,s
+ frintx d,s
+ frintz d,s
+
+ frinta 2d,4s,2s (round to integral, nearest away)
+ frinti 2d,4s,2s (round to integral, per FPCR)
+ frintm 2d,4s,2s (round to integral, minus inf)
+ frintn 2d,4s,2s (round to integral, nearest, to even)
+ frintp 2d,4s,2s (round to integral, plus inf)
+ frintx 2d,4s,2s (round to integral exact, per FPCR)
+ frintz 2d,4s,2s (round to integral, zero)
+
+ frsqrte d,s (est)
+ frsqrte 2d,4s,2s
+
+ frsqrts d,s (step)
+ frsqrts 2d,4s,2s
+
+ ======================== CONV ========================
+
+ fcvt s_h,d_h,h_s,d_s,h_d,s_d (fp convert, scalar)
+
+ fcvtl{2} 4s/4h, 4s/8h, 2d/2s, 2d/4s (float convert to longer form)
+
+ fcvtn{2} 4h/4s, 8h/4s, 2s/2d, 4s/2d (float convert to narrower form)
+
+ fcvtas d,s (fcvt to signed int, nearest, ties away)
+ fcvtau d,s (fcvt to unsigned int, nearest, ties away)
+ fcvtas 2d,4s,2s
+ fcvtau 2d,4s,2s
+ fcvtas w_s,x_s,w_d,x_d
+ fcvtau w_s,x_s,w_d,x_d
+
+ fcvtms d,s (fcvt to signed int, minus inf)
+ fcvtmu d,s (fcvt to unsigned int, minus inf)
+ fcvtms 2d,4s,2s
+ fcvtmu 2d,4s,2s
+ fcvtms w_s,x_s,w_d,x_d
+ fcvtmu w_s,x_s,w_d,x_d
+
+ fcvtns d,s (fcvt to signed int, nearest)
+ fcvtnu d,s (fcvt to unsigned int, nearest)
+ fcvtns 2d,4s,2s
+ fcvtnu 2d,4s,2s
+ fcvtns w_s,x_s,w_d,x_d
+ fcvtnu w_s,x_s,w_d,x_d
+
+ fcvtps d,s (fcvt to signed int, plus inf)
+ fcvtpu d,s (fcvt to unsigned int, plus inf)
+ fcvtps 2d,4s,2s
+ fcvtpu 2d,4s,2s
+ fcvtps w_s,x_s,w_d,x_d
+ fcvtpu w_s,x_s,w_d,x_d
+
+ fcvtzs d,s (fcvt to signed integer, to zero)
+ fcvtzu d,s (fcvt to unsigned integer, to zero)
+ fcvtzs 2d,4s,2s
+ fcvtzu 2d,4s,2s
+ fcvtzs w_s,x_s,w_d,x_d
+ fcvtzu w_s,x_s,w_d,x_d
+
+ fcvtzs d,s (fcvt to signed fixedpt, to zero) (w/ #fbits)
+ fcvtzu d,s (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
+ fcvtzs 2d,4s,2s
+ fcvtzu 2d,4s,2s
+ fcvtzs w_s,x_s,w_d,x_d (fcvt to signed fixedpt, to zero) (w/ #fbits)
+ fcvtzu w_s,x_s,w_d,x_d (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
+
+ fcvtxn s_d (fcvt to lower prec narrow, rounding to odd)
+ fcvtxn 2s_2d,4s_2d
+
+ scvtf d,s _#fbits
+ ucvtf d,s _#fbits
+
+ scvtf 2d,4s,2s _#fbits
+ ucvtf 2d,4s,2s _#fbits
+
+ scvtf d,s
+ ucvtf d,s
+
+ scvtf 2d,4s,2s
+ ucvtf 2d,4s,2s
+
+ scvtf s_w, d_w, s_x, d_x, _#fbits
+ ucvtf s_w, d_w, s_x, d_x, _#fbits
+
+ scvtf s_w, d_w, s_x, d_x
+ ucvtf s_w, d_w, s_x, d_x
+
+ ======================== INT ========================
+
+ abs d
+ neg d
+
+ abs 2d,4s,2s,8h,4h,16b,8b
+ neg 2d,4s,2s,8h,4h,16b,8b
+
+ add d
+ sub d
+
+ add 2d,4s,2s,8h,4h,16b,8b
+ sub 2d,4s,2s,8h,4h,16b,8b
+
+ addhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+ subhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+ raddhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+ rsubhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+
+ addp d (add pairs, across)
+ addp 2d,4s,2s,8h,4h,16b,8b
+ addv 4s,8h,4h,16b,8b (reduce across vector)
+
+ and 16b,8b
+
+ orr 8h,4h #imm8, LSL #0 or 8
+ orr 4s,2s #imm8, LSL #0, 8, 16 or 24
+ bic 8h,4h #imm8, LSL #0 or 8
+ bic 4s,2s #imm8, LSL #0, 8, 16 or 24
+ also movi, mvni
+
+ bic 16b,8b (vector,reg) (bit clear)
+ bif 16b,8b (vector) (bit insert if false)
+ bit 16b,8b (vector) (bit insert if true)
+ bsl 16b,8b (vector) (bit select)
+
+ cls 4s,2s,8h,4h,16b,8b (count leading sign bits)
+ clz 4s,2s,8h,4h,16b,8b (count leading zero bits)
+
+ cmeq d
+ cmge d
+ cmgt d
+ cmhi d
+ cmhs d
+ cmtst d
+
+ cmeq 2d,4s,2s,8h,4h,16b,8b
+ cmge 2d,4s,2s,8h,4h,16b,8b
+ cmgt 2d,4s,2s,8h,4h,16b,8b
+ cmhi 2d,4s,2s,8h,4h,16b,8b
+ cmhs 2d,4s,2s,8h,4h,16b,8b
+ cmtst 2d,4s,2s,8h,4h,16b,8b
+
+ cmeq_z d
+ cmge_z d
+ cmgt_z d
+ cmle_z d
+ cmlt_z d
+
+ cmeq_z 2d,4s,2s,8h,4h,16b,8b
+ cmge_z 2d,4s,2s,8h,4h,16b,8b
+ cmgt_z 2d,4s,2s,8h,4h,16b,8b
+ cmle_z 2d,4s,2s,8h,4h,16b,8b
+ cmlt_z 2d,4s,2s,8h,4h,16b,8b
+
+ cnt 16b,8b (population count per byte)
+
+ dup d,s,h,b (vec elem to scalar)
+ dup 2d,4s,2s,8h,4h,16b,8b (vec elem to vector)
+ dup 2d,4s,2s,8h,4h,16b,8b (general reg to vector)
+
+ eor 16b,8b (vector)
+ ext 16b,8b,#imm4 (concat 2 vectors, then slice)
+
+ ins d[]_d[],s[]_s[],h[]_h[],b[]_b[]
+
+ ins d[]_x, s[]_w, h[]_w, b[]_w
+
+ mla 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
+ mla 4s,2s,8h,4h,16b,8b
+
+ mls 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
+ mls 4s,2s,8h,4h,16b,8b
+
+ movi 16b,8b #imm8, LSL #0
+ movi 8h,4h #imm8, LSL #0 or 8
+ movi 4s,2s #imm8, LSL #0, 8, 16, 24
+ movi 4s,2s #imm8, MSL #8 or 16
+ movi d, #imm64
+ movi 2d, #imm64
+
+ mul 4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
+ mul 4s,2s,8h,4h,16b,8b
+
+ mvni 8h,4h #imm8, LSL #0 or 8
+ mvni 4s,2s #imm8, LSL #0, 8, 16, 24
+ mvni 4s,2s #imm8, MSL #8 or 16
+
+ not 16b,8b
+
+ orn 16b,8b
+ orr 16b,8b
+
+ pmul 16b,8b
+
+ pmull{2} 8h_8b_8b,8h_16b_16b,1q_1d_1d,1d_2d_2d
+
+ rbit 16b,8b
+ rev16 16b,8b
+ rev32 16b,8b,8h,4h
+ rev64 16b,8b,8h,4h,4s,2s
+
+ saba 16b,8b,8h,4h,4s,2s
+ uaba 16b,8b,8h,4h,4s,2s
+
+ sabal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ uabal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+ sabd 16b,8b,8h,4h,4s,2s
+ uabd 16b,8b,8h,4h,4s,2s
+
+ sabdl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ uabdl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+ sadalp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+ uadalp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+
+ saddl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ uaddl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ ssubl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ usubl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+ saddlp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+ uaddlp 4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+
+ saddlv h_16b/8b, s_8h/4h, d_4s
+ uaddlv h_16b/8b, s_8h/4h, d_4s
+
+ saddw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+ uaddw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+ ssubw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+ usubw{2} 8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+
+ shadd 16b,8b,8h,4h,4s,2s
+ uhadd 16b,8b,8h,4h,4s,2s
+ shsub 16b,8b,8h,4h,4s,2s
+ uhsub 16b,8b,8h,4h,4s,2s
+
+ shl d_#imm
+ shl 16b,8b,8h,4h,4s,2s,2d _#imm
+
+ shll{2} 8h_8b/16b_#8, 4s_4h/8h_#16, 2d_2s/4s_#32
+
+ shrn{2} 2s/4s_2d, 8h/4h_4s, 2s/4s_2d, #imm in 1 .. elem_bits
+ rshrn{2} 2s/4s_2d, 8h/4h_4s, 2s/4s_2d, #imm in 1 .. elem_bits
+
+ sli d_#imm
+ sri d_#imm
+
+ sli 2d,4s,2s,8h,4h,16b,8b _#imm
+ sri 2d,4s,2s,8h,4h,16b,8b _#imm
+
+ smax 4s,2s,8h,4h,16b,8b
+ umax 4s,2s,8h,4h,16b,8b
+ smin 4s,2s,8h,4h,16b,8b
+ umin 4s,2s,8h,4h,16b,8b
+
+ smaxp 4s,2s,8h,4h,16b,8b
+ umaxp 4s,2s,8h,4h,16b,8b
+ sminp 4s,2s,8h,4h,16b,8b
+ uminp 4s,2s,8h,4h,16b,8b
+
+ smaxv s_4s,h_8h,h_4h,b_16b,b_8b
+ umaxv s_4s,h_8h,h_4h,b_16b,b_8b
+ sminv s_4s,h_8h,h_4h,b_16b,b_8b
+ uminv s_4s,h_8h,h_4h,b_16b,b_8b
+
+ smlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ umlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ smlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ umlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ smull{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ umull{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+
+ smlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ umlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ smlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ umlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ smull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+ umull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+ smov w_b[], w_h[], x_b[], x_h[], x_s[]
+ umov w_b[], w_h[], x_b[], x_h[], x_s[]
+
+ sqabs d,s,h,b
+ sqneg d,s,h,b
+
+ sqabs 2d,4s,2s,8h,4h,16b,8b
+ sqneg 2d,4s,2s,8h,4h,16b,8b
+
+ sqadd d,s,h,b
+ uqadd d,s,h,b
+ sqsub d,s,h,b
+ uqsub d,s,h,b
+
+ sqadd 2d,4s,2s,8h,4h,16b,8b
+ uqadd 2d,4s,2s,8h,4h,16b,8b
+ sqsub 2d,4s,2s,8h,4h,16b,8b
+ uqsub 2d,4s,2s,8h,4h,16b,8b
+
+ sqdmlal d_s_s[], s_h_h[]
+ sqdmlsl d_s_s[], s_h_h[]
+ sqdmull d_s_s[], s_h_h[]
+
+ sqdmlal{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ sqdmlsl{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+ sqdmull{2} 2d_2s/4s_s[], 4s_4h/8h_h[]
+
+ sqdmlal d_s_s, s_h_h
+ sqdmlsl d_s_s, s_h_h
+ sqdmull d_s_s, s_h_h
+
+ sqdmlal{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
+ sqdmlsl{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
+ sqdmull{2} 2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
+
+ sqdmulh s_s_s[], h_h_h[]
+ sqrdmulh s_s_s[], h_h_h[]
+
+ sqdmulh 4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
+ sqrdmulh 4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
+
+ sqdmulh h,s
+ sqrdmulh h,s
+
+ sqdmulh 4s,2s,8h,4h
+ sqrdmulh 4s,2s,8h,4h
+
+ sqshl d,s,h,b
+ uqshl d,s,h,b
+ sqrshl d,s,h,b
+ uqrshl d,s,h,b
+
+ sqshl 2d,4s,2s,8h,4h,16b,8b
+ uqshl 2d,4s,2s,8h,4h,16b,8b
+ sqrshl 2d,4s,2s,8h,4h,16b,8b
+ uqrshl 2d,4s,2s,8h,4h,16b,8b
+
+ sqrshrn s_d, h_s, b_h #imm
+ uqrshrn s_d, h_s, b_h #imm
+ sqshrn s_d, h_s, b_h #imm
+ uqshrn s_d, h_s, b_h #imm
+
+ sqrshrun s_d, h_s, b_h #imm
+ sqshrun s_d, h_s, b_h #imm
+
+ sqrshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+ uqrshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+ sqshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+ uqshrn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+
+ sqrshrun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+ sqshrun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h, #imm
+
+ sqshl d,s,h,b _#imm
+ uqshl d,s,h,b _#imm
+ sqshlu d,s,h,b _#imm
+
+ sqshl 2d,4s,2s,8h,4h,16b,8b _#imm
+ uqshl 2d,4s,2s,8h,4h,16b,8b _#imm
+ sqshlu 2d,4s,2s,8h,4h,16b,8b _#imm
+
+ sqxtn s_d,h_s,b_h
+ uqxtn s_d,h_s,b_h
+ sqxtun s_d,h_s,b_h
+
+ sqxtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+ uqxtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+ sqxtun{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+
+ srhadd 4s,2s,8h,4h,16b,8b
+ urhadd 4s,2s,8h,4h,16b,8b
+
+ sshl (reg) d
+ ushl (reg) d
+ sshr (imm) d
+ ushr (imm) d
+ ssra (imm) d
+ usra (imm) d
+
+ srshl (reg) d
+ urshl (reg) d
+ srshr (imm) d
+ urshr (imm) d
+ srsra (imm) d
+ ursra (imm) d
+
+ sshl 2d,4s,2s,8h,4h,16b,8b
+ ushl 2d,4s,2s,8h,4h,16b,8b
+ sshr 2d,4s,2s,8h,4h,16b,8b
+ ushr 2d,4s,2s,8h,4h,16b,8b
+ ssra 2d,4s,2s,8h,4h,16b,8b
+ usra 2d,4s,2s,8h,4h,16b,8b
+
+ srshl 2d,4s,2s,8h,4h,16b,8b
+ urshl 2d,4s,2s,8h,4h,16b,8b
+ srshr 2d,4s,2s,8h,4h,16b,8b
+ urshr 2d,4s,2s,8h,4h,16b,8b
+ srsra 2d,4s,2s,8h,4h,16b,8b
+ ursra 2d,4s,2s,8h,4h,16b,8b
+
+   sshll{2} (imm)  2d_2s/4s, 4s_4h/8h, 8h_8b/16b
+   ushll{2} (imm)  2d_2s/4s, 4s_4h/8h, 8h_8b/16b
+
+ suqadd d,s,h,b
+ suqadd 2d,4s,2s,8h,4h,16b,8b
+
+ tbl 8b_{16b}_8b, 16b_{16b}_16b
+ tbl 8b_{16b,16b}_8b, 16b_{16b,16b}_16b
+ tbl 8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
+ tbl 8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
+
+ tbx 8b_{16b}_8b, 16b_{16b}_16b
+ tbx 8b_{16b,16b}_8b, 16b_{16b,16b}_16b
+ tbx 8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
+ tbx 8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
+
+ trn1 2d,4s,2s,8h,4h,16b,8b
+ trn2 2d,4s,2s,8h,4h,16b,8b
+
+ urecpe 4s,2s
+
+ ursqrte 4s,2s
+
+ usqadd d,s,h,b
+ usqadd 2d,4s,2s,8h,4h,16b,8b
+
+ uzp1 2d,4s,2s,8h,4h,16b,8b
+ uzp2 2d,4s,2s,8h,4h,16b,8b
+
+ xtn{2} 2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+
+ zip1 2d,4s,2s,8h,4h,16b,8b
+ zip2 2d,4s,2s,8h,4h,16b,8b
+
+ ======================== MEM ========================
+
+ ld1 (multiple 1-element structures to 1/2/3/4 regs)
+ ld1 (single 1-element structure to one lane of 1 reg)
+ ld1r (single 1-element structure and rep to all lanes of 1 reg)
+
+ ld2 (multiple 2-element structures to 2 regs)
+ ld2 (single 2-element structure to one lane of 2 regs)
+ ld2r (single 2-element structure and rep to all lanes of 2 regs)
+
+ ld3 (multiple 3-element structures to 3 regs)
+ ld3 (single 3-element structure to one lane of 3 regs)
+ ld3r (single 3-element structure and rep to all lanes of 3 regs)
+
+ ld4 (multiple 4-element structures to 4 regs)
+ ld4 (single 4-element structure to one lane of 4 regs)
+ ld4r (single 4-element structure and rep to all lanes of 4 regs)
+
+ ldnp q_q_addr,d_d_addr,s_s_addr (load pair w/ non-temporal hint)
+ addr = reg + uimm7 * reg_size
+
+ ldp q_q_addr,d_d_addr,s_s_addr (load pair)
+ addr = [Xn|SP],#imm or [Xn|SP,#imm]! or [Xn|SP,#imm]
+
+ ldr q,d,s,h,b from addr
+ addr = [Xn|SP],#imm or [Xn|SP,#imm]! or [Xn|SP,#imm]
+
+ ldr q,d,s from pc+#imm19
+
+ ldr q,d,s,h,b from addr
+        addr = [Xn|SP, R <extend> <shift>]
+
+ ldur q,d,s,h,b from addr
+ addr = [Xn|SP,#imm] (unscaled offset)
+
+ st1 (multiple 1-element structures from 1/2/3/4 regs)
+   st1 (single 1-element structure from 1 lane of 1 reg)
+
+ st2 (multiple 2-element structures from 2 regs)
+ st2 (single 2-element structure from 1 lane of 2 regs)
+
+ st3 (multiple 3-element structures from 3 regs)
+ st3 (single 3-element structure from 1 lane of 3 regs)
+
+ st4 (multiple 4-element structures from 4 regs)
+ st4 (single 4-element structure from one lane of 4 regs)
+
+ stnp q_q_addr, d_d_addr, s_s_addr
+ addr = [Xn|SP, #imm]
+
+ stp q_q_addr, d_d_addr, s_s_addr
+ addr = [Xn|SP], #imm or [Xn|SP, #imm]! or [Xn|SP, #imm]
+
+ str q,d,s,h,b_addr
+ addr = [Xn|SP], #simm or [Xn|SP, #simm]! or [Xn|SP, #pimm]
+
+ str q,d,s,h,b_addr
+        addr = [Xn|SP, R <extend> <shift>]
+
+ stur q,d,s,h,b_addr
+ addr = [Xn|SP,#imm] (unscaled offset)
+
+ ======================== CRYPTO ========================
+
+ aesd 16b (aes single round decryption)
+ aese 16b (aes single round encryption)
+ aesimc 16b (aes inverse mix columns)
+ aesmc 16b (aes mix columns)
+
+ sha1c q_s_4s
+ sha1h s_s
+ sha1m q_s_4s
+ sha1p q_s_4s
+ sha1su0 4s_4s_4s
+ sha1su1 4s_4s
+
+ sha256h2 q_q_4s
+ sha256h q_q_4s
+ sha256su0 4s_4s
+ sha256su1 4s_4s_4s
+*/