Add test cases for SIMD and FP instructions.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13846 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/none/tests/arm64/test_arm64_fp_and_simd.c b/none/tests/arm64/test_arm64_fp_and_simd.c
new file mode 100644
index 0000000..062b729
--- /dev/null
+++ b/none/tests/arm64/test_arm64_fp_and_simd.c
@@ -0,0 +1,1133 @@
+
+#include <stdio.h>
+#include <assert.h>
+#include <malloc.h>  // memalign
+#include <string.h>  // memset
+
+typedef  unsigned char           UChar;
+typedef  unsigned short int      UShort;
+typedef  unsigned int            UInt;
+typedef  signed int              Int;
+typedef  unsigned char           UChar;
+typedef  unsigned long long int  ULong;
+
+typedef  unsigned char           Bool;
+#define False ((Bool)0)
+#define True  ((Bool)1)
+
+
+union _V128 {
+   UChar  b[16];
+   UShort h[8];
+   UInt   i[4];
+   ULong  d[2];
+};
+typedef  union _V128   V128;
+
+static UChar randUChar ( void )
+{
+   static UInt seed = 80021;
+   seed = 1103515245 * seed + 12345;
+   return (seed >> 17) & 0xFF;
+}
+
+static ULong randULong ( void )
+{
+   Int i;
+   ULong r = 0;
+   for (i = 0; i < 8; i++) {
+      r = (r << 8) | (ULong)(0xFF & randUChar());
+   }
+   return r;
+}
+
+static void randV128 ( V128* v )
+{
+   Int i;
+   for (i = 0; i < 16; i++)
+      v->b[i] = randUChar();
+}
+
+static void showV128 ( V128* v )
+{
+   Int i;
+   for (i = 15; i >= 0; i--)
+      printf("%02x", (Int)v->b[i]);
+}
+
+__attribute__((unused))
+static void* memalign16(size_t szB)
+{
+   void* x;
+   x = memalign(16, szB);
+   assert(x);
+   assert(0 == ((16-1) & (unsigned long)x));
+   return x;
+}
+
+
+void test_UMINV ( void )
+{
+  int i;
+  V128 block[2];
+
+  /* -- 4s -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "uminv s8, v7.4s   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("UMINV v8, v7.4s  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 8h -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "uminv h8, v7.8h   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("UMINV h8, v7.8h  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 4h -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "uminv h8, v7.4h   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("UMINV h8, v7.4h  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 16b -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "uminv b8, v7.16b   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("UMINV b8, v7.16b  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 8b -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "uminv b8, v7.8b   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("UMINV b8, v7.8b  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+}
+
+
+void test_UMAXV ( void )
+{
+  int i;
+  V128 block[2];
+
+  /* -- 4s -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "umaxv s8, v7.4s   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("UMAXV v8, v7.4s  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 8h -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "umaxv h8, v7.8h   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("UMAXV h8, v7.8h  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 4h -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "umaxv h8, v7.4h   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("UMAXV h8, v7.4h  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 16b -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "umaxv b8, v7.16b   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("UMAXV b8, v7.16b  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 8b -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "umaxv b8, v7.8b   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("UMAXV b8, v7.8b  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+}
+
+
+void test_INS_general ( void )
+{
+  V128 block[3];
+
+  /* -- D[0..1] -- */
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.d[0], x19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.d[0],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.d[1], x19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.d[1],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  /* -- S[0..3] -- */
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.s[0], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.s[0],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.s[1], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.s[1],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.s[2], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.s[2],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.s[3], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.s[3],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  /* -- H[0..7] -- */
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.h[0], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.h[0],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.h[1], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.h[1],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.h[2], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.h[2],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.h[3], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.h[3],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.h[4], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.h[4],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.h[5], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.h[5],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.h[6], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.h[6],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.h[7], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.h[7],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  /* -- B[0,15] -- */
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.b[0], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.b[0],x19  ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+
+  memset(&block, 0x55, sizeof(block));
+  block[1].d[0] = randULong();
+  __asm__ __volatile__(
+     "ldr q7, [%0, #0]   ; "
+     "ldr x19, [%0, #16] ; "
+     "ins v7.b[15], w19   ; "
+     "str q7, [%0, #32] "
+     : : "r"(&block[0]) : "memory", "x19", "v7"
+  );
+  printf("INS v7.b[15],x19 ");
+  showV128(&block[0]); printf("  %016llx  ", block[1].d[0]);
+  showV128(&block[2]); printf("\n");
+}
+
+
+
+void test_SMINV ( void )
+{
+  int i;
+  V128 block[2];
+
+  /* -- 4s -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "sminv s8, v7.4s   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("SMINV v8, v7.4s  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 8h -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "sminv h8, v7.8h   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("SMINV h8, v7.8h  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 4h -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "sminv h8, v7.4h   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("SMINV h8, v7.4h  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 16b -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "sminv b8, v7.16b   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("SMINV b8, v7.16b  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 8b -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "sminv b8, v7.8b   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("SMINV b8, v7.8b  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+}
+
+
+void test_SMAXV ( void )
+{
+  int i;
+  V128 block[2];
+
+  /* -- 4s -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "smaxv s8, v7.4s   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("SMAXV v8, v7.4s  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 8h -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "smaxv h8, v7.8h   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("SMAXV h8, v7.8h  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 4h -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "smaxv h8, v7.4h   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("SMAXV h8, v7.4h  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 16b -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "smaxv b8, v7.16b   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("SMAXV b8, v7.16b  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+  /* -- 8b -- */
+
+  for (i = 0; i < 10; i++) {
+    memset(&block, 0x55, sizeof(block));
+    randV128(&block[0]);
+    __asm__ __volatile__(
+       "ldr   q7, [%0, #0]   ; "
+       "smaxv b8, v7.8b   ; "
+       "str   q8, [%0, #16] "
+       : : "r"(&block[0]) : "memory", "v7", "v8"
+                         );
+    printf("SMAXV b8, v7.8b  ");
+    showV128(&block[0]); printf("  ");
+    showV128(&block[1]); printf("\n");
+  }
+
+}
+
+#define ITERS 100
+
+/* Note this also sets the destination register to a known value (0x55..55)
+   since it can sometimes be an input to the instruction too. */
+#define GEN_BINARY_TEST(INSN,SUFFIX) \
+  __attribute__((noinline)) \
+  static void test_##INSN##_##SUFFIX ( void ) { \
+     Int i; \
+     for (i = 0; i < ITERS; i++) { \
+        V128 block[3]; \
+        memset(block, 0x55, sizeof(block)); \
+        randV128(&block[0]); \
+        randV128(&block[1]); \
+        __asm__ __volatile__( \
+           "ldr   q7, [%0, #0]   ; " \
+           "ldr   q8, [%0, #16]   ; " \
+           "ldr   q9, [%0, #32]   ; " \
+           #INSN " v9." #SUFFIX ", v7." #SUFFIX ", v8." #SUFFIX " ; " \
+           "str   q9, [%0, #32] " \
+           : : "r"(&block[0]) : "memory", "v7", "v8", "v9" \
+        ); \
+        printf(#INSN   " v9." #SUFFIX ", v7." #SUFFIX ", v8." #SUFFIX "  "); \
+        showV128(&block[0]); printf("  "); \
+        showV128(&block[1]); printf("  "); \
+        showV128(&block[2]); printf("\n"); \
+     } \
+  }
+
+
+/* Note this also sets the destination register to a known value (0x55..55)
+   since it can sometimes be an input to the instruction too. */
+#define GEN_SHIFT_TEST(INSN,SUFFIXD,SUFFIXN,AMOUNT) \
+  __attribute__((noinline)) \
+  static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##AMOUNT ( void ) { \
+     Int i; \
+     for (i = 0; i < ITERS; i++) { \
+        V128 block[2]; \
+        memset(block, 0x55, sizeof(block)); \
+        randV128(&block[0]); \
+        __asm__ __volatile__( \
+           "ldr   q7, [%0, #0]   ; " \
+           "ldr   q8, [%0, #16]   ; " \
+           #INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " ; " \
+           "str   q8, [%0, #16] " \
+           : : "r"(&block[0]) : "memory", "v7", "v8" \
+        ); \
+        printf(#INSN   " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT "  "); \
+        showV128(&block[0]); printf("  "); \
+        showV128(&block[1]); printf("\n"); \
+     } \
+  }
+
+/* Note this also sets the destination register to a known value (0x55..55)
+   since it can sometimes be an input to the instruction too. */
+#define GEN_UNARY_TEST(INSN,SUFFIXD,SUFFIXN) \
+  __attribute__((noinline)) \
+  static void test_##INSN##_##SUFFIXD##_##SUFFIXN ( void ) { \
+     Int i; \
+     for (i = 0; i < ITERS; i++) { \
+        V128 block[2]; \
+        memset(block, 0x55, sizeof(block)); \
+        randV128(&block[0]); \
+        __asm__ __volatile__( \
+           "ldr   q7, [%0, #0]   ; " \
+           "ldr   q8, [%0, #16]   ; " \
+           #INSN " v8." #SUFFIXD ", v7." #SUFFIXN " ; " \
+           "str   q8, [%0, #16] " \
+           : : "r"(&block[0]) : "memory", "v7", "v8" \
+        ); \
+        printf(#INSN   " v8." #SUFFIXD ", v7." #SUFFIXN); \
+        showV128(&block[0]); printf("  "); \
+        showV128(&block[1]); printf("\n"); \
+     } \
+  }
+
+
+GEN_BINARY_TEST(umax, 4s)
+GEN_BINARY_TEST(umax, 8h)
+GEN_BINARY_TEST(umax, 4h)
+GEN_BINARY_TEST(umax, 16b)
+GEN_BINARY_TEST(umax, 8b)
+
+GEN_BINARY_TEST(umin, 4s)
+GEN_BINARY_TEST(umin, 8h)
+GEN_BINARY_TEST(umin, 4h)
+GEN_BINARY_TEST(umin, 16b)
+GEN_BINARY_TEST(umin, 8b)
+
+GEN_BINARY_TEST(smax, 4s)
+GEN_BINARY_TEST(smax, 8h)
+GEN_BINARY_TEST(smax, 4h)
+GEN_BINARY_TEST(smax, 16b)
+GEN_BINARY_TEST(smax, 8b)
+
+GEN_BINARY_TEST(smin, 4s)
+GEN_BINARY_TEST(smin, 8h)
+GEN_BINARY_TEST(smin, 4h)
+GEN_BINARY_TEST(smin, 16b)
+GEN_BINARY_TEST(smin, 8b)
+
+GEN_BINARY_TEST(add, 2d)
+GEN_BINARY_TEST(add, 4s)
+GEN_BINARY_TEST(add, 2s)
+GEN_BINARY_TEST(add, 8h)
+GEN_BINARY_TEST(add, 4h)
+GEN_BINARY_TEST(add, 16b)
+GEN_BINARY_TEST(add, 8b)
+
+GEN_BINARY_TEST(sub, 2d)
+GEN_BINARY_TEST(sub, 4s)
+GEN_BINARY_TEST(sub, 2s)
+GEN_BINARY_TEST(sub, 8h)
+GEN_BINARY_TEST(sub, 4h)
+GEN_BINARY_TEST(sub, 16b)
+GEN_BINARY_TEST(sub, 8b)
+
+GEN_BINARY_TEST(mul, 4s)
+GEN_BINARY_TEST(mul, 2s)
+GEN_BINARY_TEST(mul, 8h)
+GEN_BINARY_TEST(mul, 4h)
+GEN_BINARY_TEST(mul, 16b)
+GEN_BINARY_TEST(mul, 8b)
+
+GEN_BINARY_TEST(mla, 4s)
+GEN_BINARY_TEST(mla, 2s)
+GEN_BINARY_TEST(mla, 8h)
+GEN_BINARY_TEST(mla, 4h)
+GEN_BINARY_TEST(mla, 16b)
+GEN_BINARY_TEST(mla, 8b)
+
+GEN_BINARY_TEST(mls, 4s)
+GEN_BINARY_TEST(mls, 2s)
+GEN_BINARY_TEST(mls, 8h)
+GEN_BINARY_TEST(mls, 4h)
+GEN_BINARY_TEST(mls, 16b)
+GEN_BINARY_TEST(mls, 8b)
+
+GEN_BINARY_TEST(and, 16b)
+GEN_BINARY_TEST(and, 8b)
+
+GEN_BINARY_TEST(bic, 16b)
+GEN_BINARY_TEST(bic, 8b)
+
+GEN_BINARY_TEST(orr, 16b)
+GEN_BINARY_TEST(orr, 8b)
+
+GEN_BINARY_TEST(orn, 16b)
+GEN_BINARY_TEST(orn, 8b)
+
+GEN_BINARY_TEST(eor, 16b)
+GEN_BINARY_TEST(eor, 8b)
+
+GEN_BINARY_TEST(bsl, 16b)
+GEN_BINARY_TEST(bsl, 8b)
+
+GEN_BINARY_TEST(bit, 16b)
+GEN_BINARY_TEST(bit, 8b)
+
+GEN_BINARY_TEST(bif, 16b)
+GEN_BINARY_TEST(bif, 8b)
+
+GEN_BINARY_TEST(cmeq, 2d)
+GEN_BINARY_TEST(cmeq, 4s)
+GEN_BINARY_TEST(cmeq, 2s)
+GEN_BINARY_TEST(cmeq, 8h)
+GEN_BINARY_TEST(cmeq, 4h)
+GEN_BINARY_TEST(cmeq, 16b)
+GEN_BINARY_TEST(cmeq, 8b)
+
+GEN_BINARY_TEST(cmtst, 2d)
+GEN_BINARY_TEST(cmtst, 4s)
+GEN_BINARY_TEST(cmtst, 2s)
+GEN_BINARY_TEST(cmtst, 8h)
+GEN_BINARY_TEST(cmtst, 4h)
+GEN_BINARY_TEST(cmtst, 16b)
+GEN_BINARY_TEST(cmtst, 8b)
+
+GEN_BINARY_TEST(cmhi, 2d)
+GEN_BINARY_TEST(cmhi, 4s)
+GEN_BINARY_TEST(cmhi, 2s)
+GEN_BINARY_TEST(cmhi, 8h)
+GEN_BINARY_TEST(cmhi, 4h)
+GEN_BINARY_TEST(cmhi, 16b)
+GEN_BINARY_TEST(cmhi, 8b)
+
+GEN_BINARY_TEST(cmgt, 2d)
+GEN_BINARY_TEST(cmgt, 4s)
+GEN_BINARY_TEST(cmgt, 2s)
+GEN_BINARY_TEST(cmgt, 8h)
+GEN_BINARY_TEST(cmgt, 4h)
+GEN_BINARY_TEST(cmgt, 16b)
+GEN_BINARY_TEST(cmgt, 8b)
+
+GEN_BINARY_TEST(cmhs, 2d)
+GEN_BINARY_TEST(cmhs, 4s)
+GEN_BINARY_TEST(cmhs, 2s)
+GEN_BINARY_TEST(cmhs, 8h)
+GEN_BINARY_TEST(cmhs, 4h)
+GEN_BINARY_TEST(cmhs, 16b)
+GEN_BINARY_TEST(cmhs, 8b)
+
+GEN_BINARY_TEST(cmge, 2d)
+GEN_BINARY_TEST(cmge, 4s)
+GEN_BINARY_TEST(cmge, 2s)
+GEN_BINARY_TEST(cmge, 8h)
+GEN_BINARY_TEST(cmge, 4h)
+GEN_BINARY_TEST(cmge, 16b)
+GEN_BINARY_TEST(cmge, 8b)
+
+GEN_SHIFT_TEST(ushr, 2d, 2d, 1)
+GEN_SHIFT_TEST(ushr, 2d, 2d, 13)
+GEN_SHIFT_TEST(ushr, 2d, 2d, 63)
+GEN_SHIFT_TEST(sshr, 2d, 2d, 1)
+GEN_SHIFT_TEST(sshr, 2d, 2d, 13)
+GEN_SHIFT_TEST(sshr, 2d, 2d, 63)
+
+GEN_SHIFT_TEST(ushr, 4s, 4s, 1)
+GEN_SHIFT_TEST(ushr, 4s, 4s, 13)
+GEN_SHIFT_TEST(ushr, 4s, 4s, 31)
+GEN_SHIFT_TEST(sshr, 4s, 4s, 1)
+GEN_SHIFT_TEST(sshr, 4s, 4s, 13)
+GEN_SHIFT_TEST(sshr, 4s, 4s, 31)
+
+GEN_SHIFT_TEST(ushr, 2s, 2s, 1)
+GEN_SHIFT_TEST(ushr, 2s, 2s, 13)
+GEN_SHIFT_TEST(ushr, 2s, 2s, 31)
+GEN_SHIFT_TEST(sshr, 2s, 2s, 1)
+GEN_SHIFT_TEST(sshr, 2s, 2s, 13)
+GEN_SHIFT_TEST(sshr, 2s, 2s, 31)
+
+GEN_SHIFT_TEST(ushr, 8h, 8h, 1)
+GEN_SHIFT_TEST(ushr, 8h, 8h, 13)
+GEN_SHIFT_TEST(ushr, 8h, 8h, 15)
+GEN_SHIFT_TEST(sshr, 8h, 8h, 1)
+GEN_SHIFT_TEST(sshr, 8h, 8h, 13)
+GEN_SHIFT_TEST(sshr, 8h, 8h, 15)
+
+GEN_SHIFT_TEST(ushr, 4h, 4h, 1)
+GEN_SHIFT_TEST(ushr, 4h, 4h, 13)
+GEN_SHIFT_TEST(ushr, 4h, 4h, 15)
+GEN_SHIFT_TEST(sshr, 4h, 4h, 1)
+GEN_SHIFT_TEST(sshr, 4h, 4h, 13)
+GEN_SHIFT_TEST(sshr, 4h, 4h, 15)
+
+GEN_SHIFT_TEST(ushr, 16b, 16b, 1)
+GEN_SHIFT_TEST(ushr, 16b, 16b, 7)
+GEN_SHIFT_TEST(sshr, 16b, 16b, 1)
+GEN_SHIFT_TEST(sshr, 16b, 16b, 7)
+
+GEN_SHIFT_TEST(ushr, 8b, 8b, 1)
+GEN_SHIFT_TEST(ushr, 8b, 8b, 7)
+GEN_SHIFT_TEST(sshr, 8b, 8b, 1)
+GEN_SHIFT_TEST(sshr, 8b, 8b, 7)
+
+GEN_SHIFT_TEST(ushll,  2d, 2s, 0)
+GEN_SHIFT_TEST(ushll,  2d, 2s, 15)
+GEN_SHIFT_TEST(ushll,  2d, 2s, 31)
+GEN_SHIFT_TEST(ushll2, 2d, 4s, 0)
+GEN_SHIFT_TEST(ushll2, 2d, 4s, 15)
+GEN_SHIFT_TEST(ushll2, 2d, 4s, 31)
+
+GEN_SHIFT_TEST(sshll,  2d, 2s, 0)
+GEN_SHIFT_TEST(sshll,  2d, 2s, 15)
+GEN_SHIFT_TEST(sshll,  2d, 2s, 31)
+GEN_SHIFT_TEST(sshll2, 2d, 4s, 0)
+GEN_SHIFT_TEST(sshll2, 2d, 4s, 15)
+GEN_SHIFT_TEST(sshll2, 2d, 4s, 31)
+
+GEN_UNARY_TEST(xtn,  2s, 2d)
+GEN_UNARY_TEST(xtn2, 4s, 2d)
+GEN_UNARY_TEST(xtn,  4h, 4s)
+GEN_UNARY_TEST(xtn2, 8h, 4s)
+
+int main ( void )
+{
+   assert(sizeof(V128) == 16);
+
+   printf("FMOV (general) MISSING\n");
+   printf("FMOV (scalar, immediate) MISSING\n");
+   printf("{FMOV,MOVI} (vector, immediate) MISSING\n");
+   printf("{S,U}CVTF (scalar, integer) MISSING\n");
+   printf("F{ADD,SUB,MUL,DIV,NMUL} (scalar) MISSING\n");
+   printf("F{MOV,ABS,NEG,SQRT} D/D or S/S MISSING\n");
+   printf("F{ABS,NEG} (vector) MISSING\n");
+   printf("FCMP,FCMPE MISSING\n");
+   printf("F{N}M{ADD,SUB} MISSING\n");
+   printf("FCVT{N,P,M,Z}{S,U} (scalar, integer) MISSING\n");
+   printf("FRINT{I,M,P,Z} (scalar) MISSING\n");
+   printf("FCVT (scalar) MISSING\n");
+   printf("FABD (scalar) MISSING\n");
+   printf("{S,U}CVTF (vector, integer) MISSING\n");
+   printf("F{ADD,SUB,MUL,DIV,MLA,MLS} (vector) MISSING\n");
+
+   printf("ADD/SUB (vector) MISSING\n");
+   test_add_2d();
+   test_add_4s();
+   test_add_2s();
+   test_add_8h();
+   test_add_4h();
+   //test_add_16b();
+   //test_add_8b();
+   test_sub_2d();
+   test_sub_4s();
+   test_sub_2s();
+   test_sub_8h();
+   test_sub_4h();
+   //test_sub_16b();
+   //test_sub_8b();
+
+   printf("ADD/SUB (scalar) MISSING\n");
+
+   test_mul_4s();
+   test_mul_2s();
+   test_mul_8h();
+   test_mul_4h();
+   //test_mul_16b();
+   //test_mul_8b();
+   test_mla_4s();
+   test_mla_2s();
+   test_mla_8h();
+   test_mla_4h();
+   //test_mla_16b();
+   //test_mla_8b();
+   test_mls_4s();
+   test_mls_2s();
+   test_mls_8h();
+   test_mls_4h();
+   //test_mls_16b();
+   //test_mls_8b();
+   printf("MUL/PMUL/MLA/MLS (vector) (partly MISSING)\n");
+
+   test_umax_4s();
+   test_umax_8h();
+   test_umax_4h();
+   test_umax_16b();
+   test_umax_8b();
+   test_umin_4s();
+   test_umin_8h();
+   test_umin_4h();
+   test_umin_16b();
+   test_umin_8b();
+   test_smax_4s();
+   test_smax_8h();
+   test_smax_4h();
+   test_smax_16b();
+   test_smax_8b();
+   test_smin_4s();
+   test_smin_8h();
+   test_smin_4h();
+   test_smin_16b();
+   test_smin_8b();
+
+   test_UMINV();
+   test_UMAXV();
+   test_SMINV();
+   test_SMAXV();
+
+   test_and_16b();
+   test_and_8b();
+   test_bic_16b();
+   test_bic_8b();
+   test_orr_16b();
+   test_orr_8b();
+   test_orn_16b();
+   test_orn_8b();
+
+   test_cmeq_2d();
+#if 0
+   test_cmeq_4s();
+   test_cmeq_2s();
+   test_cmeq_8h();
+   test_cmeq_4h();
+   test_cmeq_16b();
+   test_cmeq_8b();
+   test_cmtst_2d();
+   test_cmtst_4s();
+   test_cmtst_2s();
+   test_cmtst_8h();
+   test_cmtst_4h();
+   test_cmtst_16b();
+   test_cmtst_8b();
+   test_cmhi_2d();
+   test_cmhi_4s();
+   test_cmhi_2s();
+   test_cmhi_8h();
+   test_cmhi_4h();
+   test_cmhi_16b();
+   test_cmhi_8b();
+   test_cmgt_2d();
+   test_cmgt_4s();
+   test_cmgt_2s();
+   test_cmgt_8h();
+   test_cmgt_4h();
+   test_cmgt_16b();
+   test_cmgt_8b();
+   test_cmhs_2d();
+   test_cmhs_4s();
+   test_cmhs_2s();
+   test_cmhs_8h();
+   test_cmhs_4h();
+   test_cmhs_16b();
+   test_cmhs_8b();
+   test_cmge_2d();
+   test_cmge_4s();
+   test_cmge_2s();
+   test_cmge_8h();
+   test_cmge_4h();
+   test_cmge_16b();
+   test_cmge_8b();
+#endif
+   printf("CM{EQ,HI,HS,GE,GT,TST,LE,LT} (vector) (w/zero cases MISSING)\n");
+
+   test_eor_16b();
+   test_eor_8b();
+   test_bsl_16b();
+   test_bsl_8b();
+   test_bit_16b();
+   test_bit_8b();
+   test_bif_16b();
+   test_bif_8b();
+
+   test_ushr_2d_2d_1();
+   test_ushr_2d_2d_13();
+   test_ushr_2d_2d_63();
+   test_sshr_2d_2d_1();
+   test_sshr_2d_2d_13();
+   test_sshr_2d_2d_63();
+#if 0
+   test_ushr_4s_4s_1();
+   test_ushr_4s_4s_13();
+   test_ushr_4s_4s_31();
+   test_sshr_4s_4s_1();
+   test_sshr_4s_4s_13();
+   test_sshr_4s_4s_31();
+   test_ushr_2s_2s_1();
+   test_ushr_2s_2s_13();
+   test_ushr_2s_2s_31();
+   test_sshr_2s_2s_1();
+   test_sshr_2s_2s_13();
+   test_sshr_2s_2s_31();
+   test_ushr_8h_8h_1();
+   test_ushr_8h_8h_13();
+   test_ushr_8h_8h_15();
+   test_sshr_8h_8h_1();
+   test_sshr_8h_8h_13();
+   test_sshr_8h_8h_15();
+   test_ushr_4h_4h_1();
+   test_ushr_4h_4h_13();
+   test_ushr_4h_4h_15();
+   test_sshr_4h_4h_1();
+   test_sshr_4h_4h_13();
+   test_sshr_4h_4h_15();
+   test_ushr_16b_16b_1();
+   test_ushr_16b_16b_7();
+   test_sshr_16b_16b_1();
+   test_sshr_16b_16b_7();
+   test_ushr_8b_8b_1();
+   test_ushr_8b_8b_7();
+   test_sshr_8b_8b_1();
+   test_sshr_8b_8b_7();
+#endif
+
+   printf("{U,S}SHLL{,2} (MISSING h_b and s_h versions)\n");
+   test_ushll_2d_2s_0();
+   test_ushll_2d_2s_15();
+   test_ushll_2d_2s_31();
+   test_ushll2_2d_4s_0();
+   test_ushll2_2d_4s_15();
+   test_ushll2_2d_4s_31();
+   test_sshll_2d_2s_0();
+   test_sshll_2d_2s_15();
+   test_sshll_2d_2s_31();
+   test_sshll2_2d_4s_0();
+   test_sshll2_2d_4s_15();
+   test_sshll2_2d_4s_31();
+   printf("{U,S}SHLL{,2} (MISSING h_b and s_h versions)\n");
+
+   test_xtn_2s_2d();
+   test_xtn2_4s_2d();
+   test_xtn_4h_4s();
+   test_xtn2_8h_4s();
+   printf("XTN{,2} (MISSING b_h versions)\n");
+
+   printf("DUP (element, vector) MISSING\n");
+   printf("DUP (general, vector) MISSING\n");
+   printf("{S,U}MOV MISSING\n");
+
+   test_INS_general();
+
+   return 0;
+}