Add many test cases from arm64.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13848 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/none/tests/arm64/test_arm64_fp_and_simd.c b/none/tests/arm64/test_arm64_fp_and_simd.c
index 062b729..a3f5bc4 100644
--- a/none/tests/arm64/test_arm64_fp_and_simd.c
+++ b/none/tests/arm64/test_arm64_fp_and_simd.c
@@ -3,6 +3,7 @@
 #include <assert.h>
 #include <malloc.h>  // memalign
 #include <string.h>  // memset
+#include <math.h>    // isnormal
 
 typedef  unsigned char           UChar;
 typedef  unsigned short int      UShort;
@@ -16,15 +17,20 @@
 #define True  ((Bool)1)
 
 
+#define ITERS 1
+
+
 union _V128 {
-   UChar  b[16];
-   UShort h[8];
-   UInt   i[4];
-   ULong  d[2];
+   UChar  u8[16];
+   UShort u16[8];
+   UInt   u32[4];
+   ULong  u64[2];
+   float  f32[4];
+   double f64[2];
 };
 typedef  union _V128   V128;
 
-static UChar randUChar ( void )
+static inline UChar randUChar ( void )
 {
    static UInt seed = 80021;
    seed = 1103515245 * seed + 12345;
@@ -41,18 +47,32 @@
    return r;
 }
 
+/* Generates a random V128.  Ensures that that it contains normalised
+   FP numbers when viewed as either F32x4 or F64x2, so that it is
+   reasonable to use in FP test cases. */
 static void randV128 ( V128* v )
 {
+   static UInt nCalls = 0, nIters = 0;
    Int i;
-   for (i = 0; i < 16; i++)
-      v->b[i] = randUChar();
+   nCalls++;
+   while (1) {
+      nIters++;
+      for (i = 0; i < 16; i++) {
+         v->u8[i] = randUChar();
+      }
+      if (isnormal(v->f32[0]) && isnormal(v->f32[1]) && isnormal(v->f32[2])
+          && isnormal(v->f32[3]) && isnormal(v->f64[0]) && isnormal(v->f64[1]))
+        break;
+   }
+   if (0 == (nCalls & 0xFF))
+      printf("randV128: %u calls, %u iters\n", nCalls, nIters);
 }
 
 static void showV128 ( V128* v )
 {
    Int i;
    for (i = 15; i >= 0; i--)
-      printf("%02x", (Int)v->b[i]);
+      printf("%02x", (Int)v->u8[i]);
 }
 
 __attribute__((unused))
@@ -76,6 +96,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "uminv s8, v7.4s   ; "
@@ -92,6 +113,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "uminv h8, v7.8h   ; "
@@ -108,6 +130,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "uminv h8, v7.4h   ; "
@@ -124,6 +147,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "uminv b8, v7.16b   ; "
@@ -140,6 +164,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "uminv b8, v7.8b   ; "
@@ -164,6 +189,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "umaxv s8, v7.4s   ; "
@@ -180,6 +206,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "umaxv h8, v7.8h   ; "
@@ -196,6 +223,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "umaxv h8, v7.4h   ; "
@@ -212,6 +240,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "umaxv b8, v7.16b   ; "
@@ -228,6 +257,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "umaxv b8, v7.8b   ; "
@@ -249,7 +279,7 @@
   /* -- D[0..1] -- */
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -257,12 +287,12 @@
      "str q7, [%0, #32] "
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
-  printf("INS v7.d[0],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  printf("INS v7.u64[0],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -271,13 +301,13 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.d[1],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   /* -- S[0..3] -- */
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -286,11 +316,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.s[0],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -299,11 +329,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.s[1],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -312,11 +342,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.s[2],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -325,13 +355,13 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.s[3],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   /* -- H[0..7] -- */
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -340,11 +370,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.h[0],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -353,11 +383,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.h[1],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -366,11 +396,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.h[2],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -379,11 +409,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.h[3],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -392,11 +422,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.h[4],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -405,11 +435,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.h[5],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -418,11 +448,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.h[6],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -431,13 +461,13 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.h[7],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   /* -- B[0,15] -- */
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -446,11 +476,11 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.b[0],x19  ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 
   memset(&block, 0x55, sizeof(block));
-  block[1].d[0] = randULong();
+  block[1].u64[0] = randULong();
   __asm__ __volatile__(
      "ldr q7, [%0, #0]   ; "
      "ldr x19, [%0, #16] ; "
@@ -459,7 +489,7 @@
      : : "r"(&block[0]) : "memory", "x19", "v7"
   );
   printf("INS v7.b[15],x19 ");
-  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[0]); printf("  %016llx  ", block[1].u64[0]);
   showV128(&block[2]); printf("\n");
 }
 
@@ -475,6 +505,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "sminv s8, v7.4s   ; "
@@ -491,6 +522,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "sminv h8, v7.8h   ; "
@@ -507,6 +539,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "sminv h8, v7.4h   ; "
@@ -523,6 +556,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "sminv b8, v7.16b   ; "
@@ -539,6 +573,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "sminv b8, v7.8b   ; "
@@ -563,6 +598,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "smaxv s8, v7.4s   ; "
@@ -579,6 +615,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "smaxv h8, v7.8h   ; "
@@ -595,6 +632,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "smaxv h8, v7.4h   ; "
@@ -611,6 +649,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "smaxv b8, v7.16b   ; "
@@ -627,6 +666,7 @@
   for (i = 0; i < 10; i++) {
     memset(&block, 0x55, sizeof(block));
     randV128(&block[0]);
+    randV128(&block[1]);
     __asm__ __volatile__(
        "ldr   q7, [%0, #0]   ; "
        "smaxv b8, v7.8b   ; "
@@ -640,8 +680,6 @@
 
 }
 
-#define ITERS 100
-
 /* Note this also sets the destination register to a known value (0x55..55)
    since it can sometimes be an input to the instruction too. */
 #define GEN_BINARY_TEST(INSN,SUFFIX) \
@@ -653,6 +691,7 @@
         memset(block, 0x55, sizeof(block)); \
         randV128(&block[0]); \
         randV128(&block[1]); \
+        randV128(&block[2]); \
         __asm__ __volatile__( \
            "ldr   q7, [%0, #0]   ; " \
            "ldr   q8, [%0, #16]   ; " \
@@ -679,6 +718,7 @@
         V128 block[2]; \
         memset(block, 0x55, sizeof(block)); \
         randV128(&block[0]); \
+        randV128(&block[1]); \
         __asm__ __volatile__( \
            "ldr   q7, [%0, #0]   ; " \
            "ldr   q8, [%0, #16]   ; " \
@@ -702,6 +742,7 @@
         V128 block[2]; \
         memset(block, 0x55, sizeof(block)); \
         randV128(&block[0]); \
+        randV128(&block[1]); \
         __asm__ __volatile__( \
            "ldr   q7, [%0, #0]   ; " \
            "ldr   q8, [%0, #16]   ; " \
@@ -855,6 +896,9 @@
 GEN_SHIFT_TEST(sshr, 2d, 2d, 1)
 GEN_SHIFT_TEST(sshr, 2d, 2d, 13)
 GEN_SHIFT_TEST(sshr, 2d, 2d, 63)
+GEN_SHIFT_TEST(shl,  2d, 2d, 1)
+GEN_SHIFT_TEST(shl,  2d, 2d, 13)
+GEN_SHIFT_TEST(shl,  2d, 2d, 63)
 
 GEN_SHIFT_TEST(ushr, 4s, 4s, 1)
 GEN_SHIFT_TEST(ushr, 4s, 4s, 13)
@@ -862,6 +906,9 @@
 GEN_SHIFT_TEST(sshr, 4s, 4s, 1)
 GEN_SHIFT_TEST(sshr, 4s, 4s, 13)
 GEN_SHIFT_TEST(sshr, 4s, 4s, 31)
+GEN_SHIFT_TEST(shl,  4s, 4s, 1)
+GEN_SHIFT_TEST(shl,  4s, 4s, 13)
+GEN_SHIFT_TEST(shl,  4s, 4s, 31)
 
 GEN_SHIFT_TEST(ushr, 2s, 2s, 1)
 GEN_SHIFT_TEST(ushr, 2s, 2s, 13)
@@ -869,6 +916,9 @@
 GEN_SHIFT_TEST(sshr, 2s, 2s, 1)
 GEN_SHIFT_TEST(sshr, 2s, 2s, 13)
 GEN_SHIFT_TEST(sshr, 2s, 2s, 31)
+GEN_SHIFT_TEST(shl,  2s, 2s, 1)
+GEN_SHIFT_TEST(shl,  2s, 2s, 13)
+GEN_SHIFT_TEST(shl,  2s, 2s, 31)
 
 GEN_SHIFT_TEST(ushr, 8h, 8h, 1)
 GEN_SHIFT_TEST(ushr, 8h, 8h, 13)
@@ -876,6 +926,9 @@
 GEN_SHIFT_TEST(sshr, 8h, 8h, 1)
 GEN_SHIFT_TEST(sshr, 8h, 8h, 13)
 GEN_SHIFT_TEST(sshr, 8h, 8h, 15)
+GEN_SHIFT_TEST(shl,  8h, 8h, 1)
+GEN_SHIFT_TEST(shl,  8h, 8h, 13)
+GEN_SHIFT_TEST(shl,  8h, 8h, 15)
 
 GEN_SHIFT_TEST(ushr, 4h, 4h, 1)
 GEN_SHIFT_TEST(ushr, 4h, 4h, 13)
@@ -883,16 +936,23 @@
 GEN_SHIFT_TEST(sshr, 4h, 4h, 1)
 GEN_SHIFT_TEST(sshr, 4h, 4h, 13)
 GEN_SHIFT_TEST(sshr, 4h, 4h, 15)
+GEN_SHIFT_TEST(shl,  4h, 4h, 1)
+GEN_SHIFT_TEST(shl,  4h, 4h, 13)
+GEN_SHIFT_TEST(shl,  4h, 4h, 15)
 
 GEN_SHIFT_TEST(ushr, 16b, 16b, 1)
 GEN_SHIFT_TEST(ushr, 16b, 16b, 7)
 GEN_SHIFT_TEST(sshr, 16b, 16b, 1)
 GEN_SHIFT_TEST(sshr, 16b, 16b, 7)
+GEN_SHIFT_TEST(shl,  16b, 16b, 1)
+GEN_SHIFT_TEST(shl,  16b, 16b, 7)
 
 GEN_SHIFT_TEST(ushr, 8b, 8b, 1)
 GEN_SHIFT_TEST(ushr, 8b, 8b, 7)
 GEN_SHIFT_TEST(sshr, 8b, 8b, 1)
 GEN_SHIFT_TEST(sshr, 8b, 8b, 7)
+GEN_SHIFT_TEST(shl,  8b, 8b, 1)
+GEN_SHIFT_TEST(shl,  8b, 8b, 7)
 
 GEN_SHIFT_TEST(ushll,  2d, 2s, 0)
 GEN_SHIFT_TEST(ushll,  2d, 2s, 15)
@@ -913,27 +973,342 @@
 GEN_UNARY_TEST(xtn,  4h, 4s)
 GEN_UNARY_TEST(xtn2, 8h, 4s)
 
+
+/* Generate a test that involves one integer reg and one vector reg,
+   with no bias as towards which is input or output. */
+#define GEN_ONEINT_ONEVEC_TEST(TESTNAME,INSN,INTREGNO,VECREGNO) \
+  __attribute__((noinline)) \
+  static void test_##TESTNAME ( void ) { \
+     Int i; \
+     for (i = 0; i < ITERS; i++) { \
+        V128 block[4]; \
+        memset(block, 0x55, sizeof(block)); \
+        randV128(&block[0]); \
+        randV128(&block[1]); \
+        randV128(&block[2]); \
+        randV128(&block[3]); \
+        __asm__ __volatile__( \
+           "ldr   q"#VECREGNO", [%0, #0]  ; " \
+           "ldr   x"#INTREGNO", [%0, #16] ; " \
+           INSN " ; " \
+           "str   q"#VECREGNO", [%0, #32] ; " \
+           "str   x"#INTREGNO", [%0, #48] ; " \
+           : : "r"(&block[0]) : "memory", "v"#VECREGNO, "x"#INTREGNO \
+        ); \
+        printf(INSN   "   "); \
+        showV128(&block[0]); printf("  "); \
+        showV128(&block[1]); printf("  "); \
+        showV128(&block[2]); printf("  "); \
+        showV128(&block[3]); printf("\n"); \
+     } \
+  }
+
+GEN_ONEINT_ONEVEC_TEST(umov_01, "umov x9, v10.d[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_02, "umov x9, v10.d[1]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_03, "umov w9, v10.s[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_04, "umov w9, v10.s[3]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_05, "umov w9, v10.h[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_06, "umov w9, v10.h[7]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_07, "umov w9, v10.b[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(umov_08, "umov w9, v10.b[15]", 9, 10)
+
+GEN_ONEINT_ONEVEC_TEST(smov_01, "smov x9, v10.s[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_02, "smov x9, v10.s[3]", 9, 10)
+
+GEN_ONEINT_ONEVEC_TEST(smov_03, "smov x9, v10.h[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_04, "smov x9, v10.h[7]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_05, "smov w9, v10.h[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_06, "smov w9, v10.h[7]", 9, 10)
+
+GEN_ONEINT_ONEVEC_TEST(smov_07, "smov x9, v10.b[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_08, "smov x9, v10.b[15]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_09, "smov w9, v10.b[0]", 9, 10)
+GEN_ONEINT_ONEVEC_TEST(smov_10, "smov w9, v10.b[15]", 9, 10)
+
+/* Generate a test that involves two vector regs,
+   with no bias as towards which is input or output. */
+#define GEN_TWOVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO) \
+  __attribute__((noinline)) \
+  static void test_##TESTNAME ( void ) { \
+     Int i; \
+     for (i = 0; i < ITERS; i++) { \
+        V128 block[4]; \
+        memset(block, 0x55, sizeof(block)); \
+        randV128(&block[0]); \
+        randV128(&block[1]); \
+        randV128(&block[2]); \
+        randV128(&block[3]); \
+        __asm__ __volatile__( \
+           "ldr   q"#VECREG1NO", [%0, #0]  ; " \
+           "ldr   q"#VECREG2NO", [%0, #16] ; " \
+           INSN " ; " \
+           "str   q"#VECREG1NO", [%0, #32] ; " \
+           "str   q"#VECREG2NO", [%0, #48] ; " \
+           : : "r"(&block[0]) : "memory", "v"#VECREG1NO, "v"#VECREG2NO \
+        ); \
+        printf(INSN   "   "); \
+        showV128(&block[0]); printf("  "); \
+        showV128(&block[1]); printf("  "); \
+        showV128(&block[2]); printf("  "); \
+        showV128(&block[3]); printf("\n"); \
+     } \
+  }
+
+GEN_TWOVEC_TEST(fcvtn_01, "fcvtn  v22.2s, v23.2d", 22, 23)
+GEN_TWOVEC_TEST(fcvtn_02, "fcvtn2 v22.4s, v23.2d", 22, 23)
+
+GEN_UNARY_TEST(neg, 2d, 2d)
+GEN_UNARY_TEST(neg, 4s, 4s)
+GEN_UNARY_TEST(neg, 2s, 2s)
+GEN_UNARY_TEST(neg, 8h, 8h)
+GEN_UNARY_TEST(neg, 4h, 4h)
+GEN_UNARY_TEST(neg, 16b, 16b)
+GEN_UNARY_TEST(neg, 8b,  8b)
+GEN_BINARY_TEST(fadd, 2d)
+GEN_BINARY_TEST(fadd, 4s)
+GEN_BINARY_TEST(fadd, 2s)
+GEN_BINARY_TEST(fsub, 2d)
+GEN_BINARY_TEST(fsub, 4s)
+GEN_BINARY_TEST(fsub, 2s)
+GEN_BINARY_TEST(fmul, 2d)
+GEN_BINARY_TEST(fmul, 4s)
+GEN_BINARY_TEST(fmul, 2s)
+GEN_BINARY_TEST(fdiv, 2d)
+GEN_BINARY_TEST(fdiv, 4s)
+GEN_BINARY_TEST(fdiv, 2s)
+GEN_BINARY_TEST(fmla, 2d)
+GEN_BINARY_TEST(fmla, 4s)
+GEN_BINARY_TEST(fmla, 2s)
+GEN_BINARY_TEST(fmls, 2d)
+GEN_BINARY_TEST(fmls, 4s)
+GEN_BINARY_TEST(fmls, 2s)
+GEN_BINARY_TEST(fabd, 2d)
+GEN_BINARY_TEST(fabd, 4s)
+GEN_BINARY_TEST(fabd, 2s)
+
+/* Generate a test that involves three vector regs,
+   with no bias as towards which is input or output. */
+#define GEN_THREEVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO,VECREG3NO)  \
+  __attribute__((noinline)) \
+  static void test_##TESTNAME ( void ) { \
+     Int i; \
+     for (i = 0; i < ITERS; i++) { \
+        V128 block[6]; \
+        memset(block, 0x55, sizeof(block)); \
+        randV128(&block[0]); \
+        randV128(&block[1]); \
+        randV128(&block[2]); \
+        randV128(&block[3]); \
+        randV128(&block[4]); \
+        randV128(&block[5]); \
+        __asm__ __volatile__( \
+           "ldr   q"#VECREG1NO", [%0, #0]  ; " \
+           "ldr   q"#VECREG2NO", [%0, #16] ; " \
+           "ldr   q"#VECREG3NO", [%0, #32] ; " \
+           INSN " ; " \
+           "str   q"#VECREG1NO", [%0, #48] ; " \
+           "str   q"#VECREG2NO", [%0, #64] ; " \
+           "str   q"#VECREG3NO", [%0, #80] ; " \
+           : : "r"(&block[0]) : "memory", "v"#VECREG1NO, "v"#VECREG2NO, "v"#VECREG3NO \
+        ); \
+        printf(INSN   "   "); \
+        showV128(&block[0]); printf("  "); \
+        showV128(&block[1]); printf("  "); \
+        showV128(&block[2]); printf("  "); \
+        showV128(&block[3]); printf("  "); \
+        showV128(&block[4]); printf("  "); \
+        showV128(&block[5]); printf("\n"); \
+     } \
+  }
+
+GEN_THREEVEC_TEST(add_d_d_d, "add d21, d22, d23", 21, 22, 23)
+GEN_THREEVEC_TEST(sub_d_d_d, "sub d21, d22, d23", 21, 22, 23)
+
+/* overkill -- don't need two vecs, only one */
+GEN_TWOVEC_TEST(fmov_scalar_imm_01, "fmov d22, #0.125", 22, 23)
+GEN_TWOVEC_TEST(fmov_scalar_imm_02, "fmov d22, #-4.0",  22, 23)
+GEN_TWOVEC_TEST(fmov_scalar_imm_03, "fmov d22, #1.0",   22, 23)
+GEN_TWOVEC_TEST(fmov_scalar_imm_04, "fmov s22, #0.125", 22, 23)
+GEN_TWOVEC_TEST(fmov_scalar_imm_05, "fmov s22, #-4.0",  22, 23)
+GEN_TWOVEC_TEST(fmov_scalar_imm_06, "fmov s22, #-1.0",   22, 23)
+
+GEN_ONEINT_ONEVEC_TEST(fmov_gen_01, "fmov s7,      w15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_gen_02, "fmov d7,      x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_gen_03, "fmov v7.d[1], x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_gen_04, "fmov w15,      s7", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_gen_05, "fmov x15,      d7", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(fmov_gen_06, "fmov x15, v7.d[1]", 15, 7)
+
+GEN_TWOVEC_TEST(movi_vector_imm_01, "fmov d22,    #0.125", 22, 23)
+GEN_TWOVEC_TEST(movi_vector_imm_02, "fmov d22,    #-4.0",  22, 23)
+GEN_TWOVEC_TEST(movi_vector_imm_03, "fmov d22,    #1.0",   22, 23)
+GEN_TWOVEC_TEST(movi_vector_imm_04, "fmov v22.2d, #0.125", 22, 23)
+GEN_TWOVEC_TEST(movi_vector_imm_05, "fmov v22.2d, #-4.0",  22, 23)
+GEN_TWOVEC_TEST(movi_vector_imm_06, "fmov v22.2d, #1.0",   22, 23)
+
+GEN_ONEINT_ONEVEC_TEST(sucvtf_01, "scvtf s7, w15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(sucvtf_02, "scvtf d7, w15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(sucvtf_03, "scvtf s7, x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(sucvtf_04, "scvtf d7, x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(sucvtf_05, "ucvtf s7, w15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(sucvtf_06, "ucvtf d7, w15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(sucvtf_07, "ucvtf s7, x15", 15, 7)
+GEN_ONEINT_ONEVEC_TEST(sucvtf_08, "ucvtf d7, x15", 15, 7)
+
+GEN_THREEVEC_TEST(fadd_d,  "fadd d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fadd_s,  "fadd s2, s11, s29", 2, 11, 29)
+GEN_THREEVEC_TEST(fsub_d,  "fsub d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fsub_s,  "fsub s2, s11, s29", 2, 11, 29)
+GEN_THREEVEC_TEST(fmul_d,  "fmul d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fmul_s,  "fmul s2, s11, s29", 2, 11, 29)
+GEN_THREEVEC_TEST(fdiv_d,  "fdiv d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fdiv_s,  "fdiv s2, s11, s29", 2, 11, 29)
+GEN_THREEVEC_TEST(fnmul_d, "fnmul d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fnmul_s, "fnmul s2, s11, s29", 2, 11, 29)
+
+GEN_THREEVEC_TEST(fabd_d,  "fabd d2, d11, d29", 2, 11, 29)
+GEN_THREEVEC_TEST(fabd_s,  "fabd s2, s11, s29", 2, 11, 29)
+
+GEN_TWOVEC_TEST(fmov_d,  "fmov d22, d23",   22, 23)
+GEN_TWOVEC_TEST(fmov_s,  "fmov s22, s23",   22, 23)
+GEN_TWOVEC_TEST(fabs_d,  "fabs d22, d23",   22, 23)
+GEN_TWOVEC_TEST(fabs_s,  "fabs s22, s23",   22, 23)
+GEN_TWOVEC_TEST(fneg_d,  "fneg d22, d23",   22, 23)
+GEN_TWOVEC_TEST(fneg_s,  "fneg s22, s23",   22, 23)
+GEN_TWOVEC_TEST(fsqrt_d, "fsqrt d22, d23",   22, 23)
+GEN_TWOVEC_TEST(fsqrt_s, "fsqrt s22, s23",   22, 23)
+
+GEN_UNARY_TEST(fneg, 2d, 2d)
+GEN_UNARY_TEST(fneg, 4s, 4s)
+GEN_UNARY_TEST(fneg, 2s, 2s)
+GEN_UNARY_TEST(fabs, 2d, 2d)
+GEN_UNARY_TEST(fabs, 4s, 4s)
+GEN_UNARY_TEST(fabs, 2s, 2s)
+
+/* IMPORTANT: keep the tests in here in the same order as the
+   implementations are in guest_arm64_toIR.c. */
 int main ( void )
 {
    assert(sizeof(V128) == 16);
 
-   printf("FMOV (general) MISSING\n");
-   printf("FMOV (scalar, immediate) MISSING\n");
-   printf("{FMOV,MOVI} (vector, immediate) MISSING\n");
-   printf("{S,U}CVTF (scalar, integer) MISSING\n");
-   printf("F{ADD,SUB,MUL,DIV,NMUL} (scalar) MISSING\n");
-   printf("F{MOV,ABS,NEG,SQRT} D/D or S/S MISSING\n");
-   printf("F{ABS,NEG} (vector) MISSING\n");
-   printf("FCMP,FCMPE MISSING\n");
-   printf("F{N}M{ADD,SUB} MISSING\n");
-   printf("FCVT{N,P,M,Z}{S,U} (scalar, integer) MISSING\n");
-   printf("FRINT{I,M,P,Z} (scalar) MISSING\n");
-   printf("FCVT (scalar) MISSING\n");
-   printf("FABD (scalar) MISSING\n");
-   printf("{S,U}CVTF (vector, integer) MISSING\n");
-   printf("F{ADD,SUB,MUL,DIV,MLA,MLS} (vector) MISSING\n");
+   printf("BEGIN: FMOV (general)\n");
+   test_fmov_gen_01();
+   test_fmov_gen_02();
+   test_fmov_gen_03();
+   test_fmov_gen_04();
+   test_fmov_gen_05();
+   test_fmov_gen_06();
+   printf("END:   FMOV (general)\n\n");
 
-   printf("ADD/SUB (vector) MISSING\n");
+   printf("BEGIN: FMOV (scalar, immediate)\n");
+   test_fmov_scalar_imm_01();
+   test_fmov_scalar_imm_02();
+   test_fmov_scalar_imm_03();
+   test_fmov_scalar_imm_04();
+   test_fmov_scalar_imm_05();
+   test_fmov_scalar_imm_06();
+   printf("END:   FMOV (scalar, immediate)\n\n");
+
+   printf("BEGIN: {FMOV,MOVI} (vector, immediate)\n");
+   test_movi_vector_imm_01();
+   test_movi_vector_imm_02();
+   test_movi_vector_imm_03();
+   test_movi_vector_imm_04();
+   test_movi_vector_imm_05();
+   test_movi_vector_imm_06();
+   printf("END:   {FMOV,MOVI} (vector, immediate)\n\n");
+
+   printf("BEGIN: {S,U}CVTF (scalar, integer)\n");
+   test_sucvtf_01();
+   test_sucvtf_02();
+   test_sucvtf_03();
+   test_sucvtf_04();
+   //test_sucvtf_05();
+   test_sucvtf_06();
+   test_sucvtf_07();
+   test_sucvtf_08();
+   printf("END:   {S,U}CVTF (scalar, integer) (MISSING 1 case of 8)\n\n");
+
+   printf("BEGIN: F{ADD,SUB,MUL,DIV,NMUL} (scalar)\n");
+   test_fadd_d();
+   test_fadd_s();
+   test_fsub_d();
+   test_fsub_s();
+   test_fmul_d();
+   test_fmul_s();
+   test_fdiv_d();
+   test_fdiv_s();
+   test_fnmul_d();
+   test_fnmul_s();
+   printf("END:   F{ADD,SUB,MUL,DIV,NMUL} (scalar)\n\n");
+
+   printf("BEGIN: F{MOV,ABS,NEG,SQRT} D/D or S/S\n");
+   test_fmov_d();
+   test_fmov_s();
+   test_fabs_d();
+   test_fabs_s();
+   test_fneg_d();
+   test_fneg_s();
+   test_fsqrt_d();
+   test_fsqrt_s();
+   printf("END:   F{MOV,ABS,NEG,SQRT} D/D or S/S\n\n");
+
+   printf("BEGIN: F{ABS,NEG} (vector)\n");
+   test_fabs_2d_2d();
+   //test_fabs_4s_4s();
+   //test_fabs_2s_2s();
+   test_fneg_2d_2d();
+   //test_fneg_4s_4s();
+   //test_fneg_2s_2s();
+   printf("END:   F{ABS,NEG} (vector) (MISSING 4s/2s cases)\n\n");
+
+   printf("FCMP,FCMPE MISSING\n\n");
+
+   printf("F{N}M{ADD,SUB} MISSING\n\n");
+
+   printf("FCVT{N,P,M,Z}{S,U} (scalar, integer) MISSING\n\n");
+
+   printf("FRINT{I,M,P,Z} (scalar) MISSING\n\n");
+
+   printf("FCVT (scalar) MISSING\n\n");
+
+   printf("BEGIN: FABD (scalar) MISSING\n");
+   test_fabd_d();
+   test_fabd_s();
+   printf("END:   FABD (scalar) MISSING\n\n");
+
+   printf("{S,U}CVTF (vector, integer) MISSING\n\n");
+
+   printf("BEGIN: F{ADD,SUB,MUL,DIV,MLA,MLS,ABD} (vector)\n");
+   test_fadd_2d();
+   test_fadd_4s();
+   test_fadd_2s();
+   test_fsub_2d();
+   test_fsub_4s();
+   test_fsub_2s();
+   test_fmul_2d();
+   test_fmul_4s();
+   test_fmul_2s();
+   test_fdiv_2d();
+   test_fdiv_4s();
+   test_fdiv_2s();
+   test_fmla_2d();
+   test_fmla_4s();
+   test_fmla_2s();
+   test_fmls_2d();
+   test_fmls_4s();
+   test_fmls_2s();
+   test_fabd_2d();
+   //test_fabd_4s();
+   //test_fabd_2s();
+   printf("END:   F{ADD,SUB,MUL,DIV,MLA,MLS,ABD} (vector) (MISSING fabd 2s/4s)\n\n");
+
+   printf("BEGIN: FCVTN (MISSING 16F <- 32F cases)\n");
+   test_fcvtn_01();
+   test_fcvtn_02();
+   printf("END:   FCVTN (MISSING 16F <- 32F cases)\n\n");
+
+   printf("BEGIN: ADD/SUB (vector)\n");
    test_add_2d();
    test_add_4s();
    test_add_2s();
@@ -948,9 +1323,14 @@
    test_sub_4h();
    //test_sub_16b();
    //test_sub_8b();
+   printf("END:   ADD/SUB (vector) (MISSING b16/b8 cases)\n\n");
 
-   printf("ADD/SUB (scalar) MISSING\n");
+   printf("BEGIN: ADD/SUB (scalar)\n");
+   test_add_d_d_d();
+   test_sub_d_d_d();
+   printf("END:   ADD/SUB (scalar)\n\n");
 
+   printf("BEGIN: MUL/PMUL/MLA/MLS (vector)\n");
    test_mul_4s();
    test_mul_2s();
    test_mul_8h();
@@ -969,8 +1349,9 @@
    test_mls_4h();
    //test_mls_16b();
    //test_mls_8b();
-   printf("MUL/PMUL/MLA/MLS (vector) (partly MISSING)\n");
+   printf("END:   MUL/PMUL/MLA/MLS (vector) (partly MISSING)\n\n");
 
+   printf("BEGIN: {S,U}{MIN,MAX} (vector)\n");
    test_umax_4s();
    test_umax_8h();
    test_umax_4h();
@@ -991,12 +1372,16 @@
    test_smin_4h();
    test_smin_16b();
    test_smin_8b();
+   printf("END:   {S,U}{MIN,MAX} (vector)\n\n");
 
+   printf("BEGIN: {S,U}{MIN,MAX}V\n");
    test_UMINV();
    test_UMAXV();
    test_SMINV();
    test_SMAXV();
+   printf("END:   {S,U}{MIN,MAX}V\n\n");
 
+   printf("BEGIN: {AND,BIC,ORR,ORN} (vector)\n");
    test_and_16b();
    test_and_8b();
    test_bic_16b();
@@ -1005,7 +1390,9 @@
    test_orr_8b();
    test_orn_16b();
    test_orn_8b();
+   printf("END:   {AND,BIC,ORR,ORN} (vector)\n\n");
 
+   printf("BEGIN: CM{EQ,HI,HS,GE,GT,TST,LE,LT} (vector)\n\n");
    test_cmeq_2d();
 #if 0
    test_cmeq_4s();
@@ -1050,8 +1437,10 @@
    test_cmge_16b();
    test_cmge_8b();
 #endif
-   printf("CM{EQ,HI,HS,GE,GT,TST,LE,LT} (vector) (w/zero cases MISSING)\n");
+   printf("END:   CM{EQ,HI,HS,GE,GT,TST,LE,LT} (vector) "
+          "(w/zero and many other cases MISSING)\n\n");
 
+   printf("BEGIN: {EOR,BSL,BIT,BIF} (vector)\n");
    test_eor_16b();
    test_eor_8b();
    test_bsl_16b();
@@ -1060,7 +1449,9 @@
    test_bit_8b();
    test_bif_16b();
    test_bif_8b();
+   printf("END:   {EOR,BSL,BIT,BIF} (vector)\n\n");
 
+   printf("BEGIN: {USHR,SSHR,SHL} (vector, immediate)\n");
    test_ushr_2d_2d_1();
    test_ushr_2d_2d_13();
    test_ushr_2d_2d_63();
@@ -1068,41 +1459,68 @@
    test_sshr_2d_2d_13();
    test_sshr_2d_2d_63();
 #if 0
+   test_shl_2d_2d_1();
+   test_shl_2d_2d_13();
+   test_shl_2d_2d_63();
+
    test_ushr_4s_4s_1();
    test_ushr_4s_4s_13();
    test_ushr_4s_4s_31();
    test_sshr_4s_4s_1();
    test_sshr_4s_4s_13();
    test_sshr_4s_4s_31();
+#endif
+   test_shl_4s_4s_1();
+   test_shl_4s_4s_13();
+   test_shl_4s_4s_31();
+#if 0
    test_ushr_2s_2s_1();
    test_ushr_2s_2s_13();
    test_ushr_2s_2s_31();
    test_sshr_2s_2s_1();
    test_sshr_2s_2s_13();
    test_sshr_2s_2s_31();
+   test_shl_2s_2s_1();
+   test_shl_2s_2s_13();
+   test_shl_2s_2s_31();
+
    test_ushr_8h_8h_1();
    test_ushr_8h_8h_13();
    test_ushr_8h_8h_15();
    test_sshr_8h_8h_1();
    test_sshr_8h_8h_13();
    test_sshr_8h_8h_15();
+   test_shl_8h_8h_1();
+   test_shl_8h_8h_13();
+   test_shl_8h_8h_15();
+
    test_ushr_4h_4h_1();
    test_ushr_4h_4h_13();
    test_ushr_4h_4h_15();
    test_sshr_4h_4h_1();
    test_sshr_4h_4h_13();
    test_sshr_4h_4h_15();
+   test_shl_4h_4h_1();
+   test_shl_4h_4h_13();
+   test_shl_4h_4h_15();
+
    test_ushr_16b_16b_1();
    test_ushr_16b_16b_7();
    test_sshr_16b_16b_1();
    test_sshr_16b_16b_7();
+   test_shl_16b_16b_1();
+   test_shl_16b_16b_7();
+
    test_ushr_8b_8b_1();
    test_ushr_8b_8b_7();
    test_sshr_8b_8b_1();
    test_sshr_8b_8b_7();
+   test_shl_8b_8b_1();
+   test_shl_8b_8b_7();
 #endif
+   printf("END:   {USHR,SSHR,SHL} (vector, immediate) (many cases MISSING)\n\n");
 
-   printf("{U,S}SHLL{,2} (MISSING h_b and s_h versions)\n");
+   printf("BEGIN: {U,S}SHLL{,2}\n");
    test_ushll_2d_2s_0();
    test_ushll_2d_2s_15();
    test_ushll_2d_2s_31();
@@ -1115,19 +1533,53 @@
    test_sshll2_2d_4s_0();
    test_sshll2_2d_4s_15();
    test_sshll2_2d_4s_31();
-   printf("{U,S}SHLL{,2} (MISSING h_b and s_h versions)\n");
+   printf("END:   {U,S}SHLL{,2} (MISSING h_b and s_h versions)\n\n");
 
+   printf("BEGIN: XTN{,2}\n");
    test_xtn_2s_2d();
    test_xtn2_4s_2d();
    test_xtn_4h_4s();
    test_xtn2_8h_4s();
-   printf("XTN{,2} (MISSING b_h versions)\n");
+   printf("END:   XTN{,2} (MISSING b_h versions)\n\n");
 
-   printf("DUP (element, vector) MISSING\n");
-   printf("DUP (general, vector) MISSING\n");
-   printf("{S,U}MOV MISSING\n");
+   printf("DUP (element, vector) COMPLETELY MISSING\n\n");
 
+   printf("DUP (general, vector) COMPLETELY MISSING\n\n");
+
+   printf("BEGIN: {S,U}MOV\n");
+   test_umov_01();
+   test_umov_02();
+   test_umov_03();
+   test_umov_04();
+   test_umov_05();
+   test_umov_06();
+   test_umov_07();
+   test_umov_08();
+   test_smov_01();
+   test_smov_02();
+   test_smov_03();
+   test_smov_04();
+   test_smov_05();
+   test_smov_06();
+   test_smov_07();
+   test_smov_08();
+   test_smov_09();
+   test_smov_10();
+   printf("END:   {S,U}MOV\n\n");
+
+   printf("BEGIN: INS (general)\n");
    test_INS_general();
+   printf("END:   INS (general)\n\n");
+
+   printf("BEGIN: NEG (vector)\n");
+   test_neg_2d_2d();
+   test_neg_4s_4s();
+   test_neg_2s_2s();
+   test_neg_8h_8h();
+   test_neg_4h_4h();
+   //test_neg_16b_16b();
+   //test_neg_8b_8b();
+   printf("END:   NEG (vector) (MISSING 8b/16b)\n\n");
 
    return 0;
 }
diff --git a/none/tests/arm64/test_arm64_int.c b/none/tests/arm64/test_arm64_int.c
index cf6ebcb..7010f54 100644
--- a/none/tests/arm64/test_arm64_int.c
+++ b/none/tests/arm64/test_arm64_int.c
@@ -10289,6 +10289,26 @@
 
 
 ////////////////////////////////////////////////////////////////
+printf("REV16\n");
+
+TESTINST2("rev16 x11,x23", 0xfd79baaee550b488, x11,x23,0);
+TESTINST2("rev16 x11,x23", 0xe861540945421773, x11,x23,0);
+TESTINST2("rev16 x11,x23", 0x9a1140d0fd1dbf6c, x11,x23,0);
+
+TESTINST2("rev16 w11,w23", 0xfd79baaee550b488, x11,x23,0);
+TESTINST2("rev16 w11,w23", 0xe861540945421773, x11,x23,0);
+TESTINST2("rev16 w11,w23", 0x9a1140d0fd1dbf6c, x11,x23,0);
+
+
+////////////////////////////////////////////////////////////////
+printf("REV32\n");
+
+TESTINST2("rev32 x11,x23", 0xfd79baaee550b488, x11,x23,0);
+TESTINST2("rev32 x11,x23", 0xe861540945421773, x11,x23,0);
+TESTINST2("rev32 x11,x23", 0x9a1140d0fd1dbf6c, x11,x23,0);
+
+
+////////////////////////////////////////////////////////////////
 printf("CLZ\n");
 
 TESTINST2("clz x17, x22", 0xFFFFFFFFFFFFFFFFULL, x17, x22, 0);