Add test cases for 16 bit PCMPxSTRx variants.  See #293754.
(Eliot Moss, moss@cs.umass.edu)


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@12389 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/none/tests/amd64/Makefile.am b/none/tests/amd64/Makefile.am
index a9f6e54..ac69bff 100644
--- a/none/tests/amd64/Makefile.am
+++ b/none/tests/amd64/Makefile.am
@@ -55,8 +55,12 @@
 	nibz_bennee_mmap.vgtest \
 	pcmpstr64.stderr.exp pcmpstr64.stdout.exp \
 	pcmpstr64.vgtest \
+	pcmpstr64w.stderr.exp pcmpstr64w.stdout.exp \
+	pcmpstr64w.vgtest \
 	pcmpxstrx64.stderr.exp pcmpxstrx64.stdout.exp \
 	pcmpxstrx64.vgtest \
+	pcmpxstrx64w.stderr.exp pcmpxstrx64w.stdout.exp \
+	pcmpxstrx64w.vgtest \
 	rcl-amd64.vgtest rcl-amd64.stdout.exp rcl-amd64.stderr.exp \
 	redundantRexW.vgtest redundantRexW.stdout.exp \
 	redundantRexW.stderr.exp \
@@ -92,7 +96,9 @@
  check_PROGRAMS += lzcnt64
 endif
 if BUILD_SSE42_TESTS
- check_PROGRAMS += pcmpstr64 pcmpxstrx64 sse4-64 crc32 aes
+ check_PROGRAMS += \
+	pcmpstr64 pcmpxstrx64 sse4-64 crc32 aes \
+	pcmpstr64w pcmpxstrx64w
 endif
 
 # DDD: these need to be made to work on Darwin like the x86/ ones were.
diff --git a/none/tests/amd64/pcmpstr64w.c b/none/tests/amd64/pcmpstr64w.c
new file mode 100644
index 0000000..7f408fc
--- /dev/null
+++ b/none/tests/amd64/pcmpstr64w.c
@@ -0,0 +1,1269 @@
+
+/* Tests in detail the core arithmetic for the 16-bit-character forms
+   of pcmp{e,i}str{i,m}, using pcmpistri to drive it.  Does not check
+   the e-vs-i or i-vs-m aspect. */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+typedef  unsigned int   UInt;
+typedef  signed int     Int;
+typedef  unsigned char  UChar;
+typedef  unsigned short UShort;
+typedef  unsigned long long int ULong;
+typedef  UChar          Bool;   /* truth value; only False/True below are used */
+#define False ((Bool)0)
+#define True  ((Bool)1)
+
+/* One vector operand, viewable as 16 bytes, 8 shorts or 4 words. */
+typedef
+   union {
+      UChar  uChar[16];
+      UShort uShort[8];
+      UInt   uInt[4];
+      UInt   w32[4];     /* second name for the same four words as uInt */
+   }
+   V128;
+
+#define SHIFT_O   11   /* bit index of the O flag in the flags word */
+#define SHIFT_S   7    /* ... S flag */
+#define SHIFT_Z   6    /* ... Z flag */
+#define SHIFT_A   4    /* ... A flag */
+#define SHIFT_C   0    /* ... C flag */
+#define SHIFT_P   2    /* ... P flag */
+/* One-bit masks for the positions above; all six OR'd together give 0x8D5. */
+#define MASK_O    (1ULL << SHIFT_O)
+#define MASK_S    (1ULL << SHIFT_S)
+#define MASK_Z    (1ULL << SHIFT_Z)
+#define MASK_A    (1ULL << SHIFT_A)
+#define MASK_C    (1ULL << SHIFT_C)
+#define MASK_P    (1ULL << SHIFT_P)
+
+/* Count the leading zero bits of a 32-bit value; clz32(0) yields 32. */
+UInt clz32 ( UInt x )
+{
+   Int y, m, n;
+   y = -(x >> 16);       // negative iff the top 16 bits of x are nonzero
+   m = (y >> 16) & 16;   // 16 in that case, else 0 (needs arithmetic >> on Int)
+   n = 16 - m;
+   x = x >> m;           // x now fits in the low 16 bits
+   y = x - 0x100;
+   m = (y >> 16) & 8;    // 8 iff bits 15:8 are all zero
+   n = n + m;
+   x = x << m;
+   y = x - 0x1000;
+   m = (y >> 16) & 4;    // 4 iff bits 15:12 are all zero
+   n = n + m;
+   x = x << m;
+   y = x - 0x4000;
+   m = (y >> 16) & 2;    // 2 iff bits 15:14 are all zero
+   n = n + m;
+   x = x << m;
+   y = x >> 14;          // y is now 0, 1, 2 or 3
+   m = y & ~(y >> 1);    // m is 0, 1, 2 or 2 respectively
+   return n + 2 - m;
+}
+/* Count the trailing zero bits of x; the result is 32 when x == 0. */
+UInt ctz32 ( UInt x )
+{
+   return 32 - clz32(~x & (x - 1));
+}
+
+void expand ( V128* dst, char* summary )
+{
+   /* Hex digit k (from the right) becomes byte k, both nibbles == digit. */
+   Int pos;
+   assert( strlen(summary) == 16 );
+   for (pos = 15; pos >= 0; pos--) {
+      UChar digit  = summary[pos];
+      UChar nibble = 0;
+      if      ('0' <= digit && digit <= '9') nibble = digit - '0';
+      else if ('A' <= digit && digit <= 'F') nibble = digit - 'A' + 10;
+      else if ('a' <= digit && digit <= 'f') nibble = digit - 'a' + 10;
+      else assert(0);
+      assert(nibble < 16);
+      nibble = (nibble << 4) | nibble;
+      assert(nibble < 256);
+      dst->uChar[15 - pos] = nibble;
+   }
+}
+
+void try_istri ( char* which,
+                 UInt(*h_fn)(V128*,V128*),
+                 UInt(*s_fn)(V128*,V128*),
+                 char* summL, char* summR )
+{
+   V128 lhs, rhs;   /* the same expanded operands feed both functions */
+   assert(strlen(which) == 2);
+   expand(&lhs, summL);
+   expand(&rhs, summR);
+   UInt hw = h_fn(&lhs, &rhs);
+   UInt sw = s_fn(&lhs, &rhs);
+   printf("istri %s  %s %s -> %08x %08x %s\n", which, summL, summR,
+          hw, sw, (hw != sw) ? "!!!!" : "");   /* "!!!!" marks a mismatch */
+}
+
+UInt zmask_from_V128 ( V128* arg )
+{
+   /* Bit k of the result is set iff 16-bit element k of *arg is zero. */
+   UInt lane, mask = 0;
+   for (lane = 0; lane < 8; lane++)
+      if (arg->uShort[lane] == 0) mask |= 1u << lane;
+   return mask;
+}
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       GENERAL                        //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+
+/* Given partial results from a 16-bit pcmpXstrX operation (intRes1,
+   basically), generate an I-format output value (an index, stored in
+   resV->w32[0]) and also the new OSZACP flags.  */
+static
+void PCMPxSTRx_WRK_gen_output_fmt_I_wide ( /*OUT*/V128* resV,
+					   /*OUT*/UInt* resOSZACP,
+					   UInt intRes1,
+					   UInt zmaskL, UInt zmaskR,
+					   UInt validL,
+					   UInt pol, UInt idx )
+{
+   assert((pol >> 2) == 0);   // pol is a 2-bit field
+   assert((idx >> 1) == 0);   // idx is a single bit
+
+   UInt intRes2 = 0;
+   switch (pol) {
+      case 0: intRes2 = intRes1;          break; // pol +
+      case 1: intRes2 = ~intRes1;         break; // pol -
+      case 2: intRes2 = intRes1;          break; // pol m+
+      case 3: intRes2 = intRes1 ^ validL; break; // pol m-
+   }
+   intRes2 &= 0xFF;   // only the 8 per-element result bits are meaningful
+
+   // generate I-format output (an index in ECX)
+   // generate ecx value; 8 is used when no result bit is set
+   UInt newECX = 0;
+   if (idx) {
+     // index of ms-1-bit
+     newECX = intRes2 == 0 ? 8 : (31 - clz32(intRes2));
+   } else {
+     // index of ls-1-bit
+     newECX = intRes2 == 0 ? 8 : ctz32(intRes2);
+   }
+
+   resV->w32[0] = newECX;
+   resV->w32[1] = 0;
+   resV->w32[2] = 0;
+   resV->w32[3] = 0;
+
+   // generate new flags, common to all ISTRI and ISTRM cases
+   *resOSZACP    // A, P are zero
+     = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+     | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+     | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
+     | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
+}
+
+/* Compute result and new OSZACP flags for the PCMP{E,I}STR{I,M}
+   variants on 16-bit characters.
+
+   Only the xSTRI (index) form of the result is generated here: the
+   new ECX value is placed in the low 32 bits of *resV and the top
+   96 bits are zeroed.  NOTE: isxSTRM is accepted but is not
+   consulted anywhere in the body.
+
+   For all variants, the new OSZACP value is placed at *resOSZACP.
+
+   argLV and argRV are the vector args.  The caller must prepare an
+   8-bit mask for each, zmaskL and zmaskR.  For ISTRx variants bit i
+   must be 1 iff 16-bit element i of the respective arg is zero.  For
+   ESTRx variants this is derived from the explicit length indication,
+   and must be 0 in all places except at the bit index corresponding
+   to the valid length (0 .. 8).  If the valid length is 8 then the
+   mask must be all zeroes.  In all cases, bits 31:8 must be zero.
+
+   imm8 is the original immediate from the instruction; it must be
+   below 0x80 and only the explicitly listed (validated) values are
+   accepted.
+
+   If the given imm8 case can be handled, the return value is True.
+   If not, False is returned, and neither *resV nor *resOSZACP are
+   altered.
+*/
+
+Bool pcmpXstrX_WRK_wide ( /*OUT*/V128* resV,
+			  /*OUT*/UInt* resOSZACP,
+			  V128* argLV,  V128* argRV,
+			  UInt zmaskL, UInt zmaskR,
+			  UInt imm8,   Bool isxSTRM )
+{
+   assert(imm8 < 0x80);
+   assert((zmaskL >> 8) == 0);
+   assert((zmaskR >> 8) == 0);
+
+   /* Explicitly reject any imm8 values that haven't been validated,
+      even if they would probably work.  Life is too short to have
+      unvalidated cases in the code base. */
+   switch (imm8) {
+      case 0x01:
+      case 0x03: case 0x09: case 0x0B: case 0x0D: case 0x13:
+      case 0x1B: case 0x39: case 0x3B: case 0x45: case 0x4B:
+         break;
+      default:
+         return False;
+   }
+
+   UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
+   UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
+   UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
+   UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
+
+   /*----------------------------------------*/
+   /*-- strcmp on wide data                --*/
+   /*----------------------------------------*/
+
+   if (agg == 2/*equal each, aka strcmp*/
+       && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
+      Int    i;
+      UShort* argL = (UShort*)argLV;
+      UShort* argR = (UShort*)argRV;
+      UInt boolResII = 0;
+      for (i = 7; i >= 0; i--) {
+         UShort cL  = argL[i];
+         UShort cR  = argR[i];
+         boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
+      }
+      UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
+      UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
+
+      // do invalidation, common to all equal-each cases
+      UInt intRes1
+         = (boolResII & validL & validR)  // if both valid, use cmpres
+           | (~ (validL | validR));       // if both invalid, force 1
+                                          // else force 0
+      intRes1 &= 0xFF;
+
+      // generate I-format output
+      PCMPxSTRx_WRK_gen_output_fmt_I_wide(
+         resV, resOSZACP,
+         intRes1, zmaskL, zmaskR, validL, pol, idx
+      );
+
+      return True;
+   }
+
+   /*----------------------------------------*/
+   /*-- set membership on wide data        --*/
+   /*----------------------------------------*/
+
+   if (agg == 0/*equal any, aka find chars in a set*/
+       && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
+      /* argL: the string,  argR: charset */
+      UInt   si, ci;
+      UShort* argL    = (UShort*)argLV;
+      UShort* argR    = (UShort*)argRV;
+      UInt   boolRes = 0;
+      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
+      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
+
+      for (si = 0; si < 8; si++) {
+         if ((validL & (1 << si)) == 0)
+            // run off the end of the string.
+            break;
+         UInt m = 0;
+         for (ci = 0; ci < 8; ci++) {
+            if ((validR & (1 << ci)) == 0) break;
+            if (argR[ci] == argL[si]) { m = 1; break; }
+         }
+         boolRes |= (m << si);
+      }
+
+      // boolRes is "pre-invalidated"
+      UInt intRes1 = boolRes & 0xFF;
+
+      // generate I-format output
+      PCMPxSTRx_WRK_gen_output_fmt_I_wide(
+         resV, resOSZACP,
+         intRes1, zmaskL, zmaskR, validL, pol, idx
+      );
+
+      return True;
+   }
+
+   /*----------------------------------------*/
+   /*-- substring search on wide data      --*/
+   /*----------------------------------------*/
+
+   if (agg == 3/*equal ordered, aka substring search*/
+       && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
+
+      /* argL: haystack,  argR: needle */
+      UInt   ni, hi;
+      UShort* argL    = (UShort*)argLV;
+      UShort* argR    = (UShort*)argRV;
+      UInt   boolRes = 0;
+      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
+      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
+      for (hi = 0; hi < 8; hi++) {
+         if ((validL & (1 << hi)) == 0)
+            // run off the end of the haystack
+            break;
+         UInt m = 1;
+         for (ni = 0; ni < 8; ni++) {
+            if ((validR & (1 << ni)) == 0) break;
+            UInt i = ni + hi;
+            if (i >= 8) break;
+            if (argL[i] != argR[ni]) { m = 0; break; }
+         }
+         boolRes |= (m << hi);
+      }
+
+      // boolRes is "pre-invalidated"
+      UInt intRes1 = boolRes & 0xFF;
+
+      // generate I-format output
+      PCMPxSTRx_WRK_gen_output_fmt_I_wide(
+         resV, resOSZACP,
+         intRes1, zmaskL, zmaskR, validL, pol, idx
+      );
+
+      return True;
+   }
+
+   /*----------------------------------------*/
+   /*-- ranges, unsigned wide data         --*/
+   /*----------------------------------------*/
+
+   if (agg == 1/*ranges*/
+       && fmt == 1/*uw*/) {
+
+      /* argL: string,  argR: range-pairs */
+      UInt   ri, si;
+      UShort* argL    = (UShort*)argLV;
+      UShort* argR    = (UShort*)argRV;
+      UInt   boolRes = 0;
+      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
+      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
+      for (si = 0; si < 8; si++) {
+         if ((validL & (1 << si)) == 0)
+            // run off the end of the string
+            break;
+         UInt m = 0;
+         for (ri = 0; ri < 8; ri += 2) {
+            if ((validR & (3 << ri)) != (3 << ri)) break;
+            if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
+               m = 1; break;
+            }
+         }
+         boolRes |= (m << si);
+      }
+
+      // boolRes is "pre-invalidated"
+      UInt intRes1 = boolRes & 0xFF;
+
+      // generate I-format output
+      PCMPxSTRx_WRK_gen_output_fmt_I_wide(
+         resV, resOSZACP,
+         intRes1, zmaskL, zmaskR, validL, pol, idx
+      );
+
+      return True;
+   }
+
+   return False;
+}
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_4B                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+/* Executes the real "pcmpistri $0x4B" and packs the flags and RCX. */
+UInt h_pcmpistri_4B ( V128* argL, V128* argR )
+{
+   V128 block[2];
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"  /* presumably steps over the red zone before pushfq */
+      "movdqu    0(%2),  %%xmm2"            "\n\t"  /* xmm2  = copy of *argL */
+      "movdqu    16(%2), %%xmm11"           "\n\t"  /* xmm11 = copy of *argR */
+      "pcmpistri $0x4B,  %%xmm2, %%xmm11"   "\n\t"
+      "pushfq"                              "\n\t"
+      "popq      %%rdx"                     "\n\t"  /* rdx = flags after the compare */
+      "movq      %%rcx,  %0"                "\n\t"
+      "movq      %%rdx,  %1"                "\n\t"
+      "addq      $1024,  %%rsp"             "\n\t"
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);  /* OSZACP bits above, low 16 bits of rcx below */
+}
+
+UInt s_pcmpistri_4B ( V128* argLU, V128* argRU )
+{
+   /* C model of "pcmpistri $0x4B": OSZACP shifted left by 16, OR'd
+      with the computed ECX value. */
+   V128 out;
+   UInt flagsOSZACP;
+   UInt zeroesL = zmask_from_V128(argLU);
+   UInt zeroesR = zmask_from_V128(argRU);
+   Bool handled = pcmpXstrX_WRK_wide( &out, &flagsOSZACP, argLU, argRU,
+                                      zeroesL, zeroesR,
+                                      0x4B, False/*!isSTRM*/ );
+   assert(handled);
+   return (flagsOSZACP << 16) | out.uInt[0];
+}
+
+void istri_4B ( void )
+{
+   char* wot = "4B";                          /* label printed on every result line */
+   UInt(*h)(V128*,V128*) = h_pcmpistri_4B;    /* runs the real instruction */
+   UInt(*s)(V128*,V128*) = s_pcmpistri_4B;    /* runs the C model */
+   /* Each 16-digit summary is expanded to a 128-bit operand by try_istri. */
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
+
+   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
+   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
+}
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_3B                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+/* Executes the real "pcmpistri $0x3B" and packs the flags and RCX. */
+UInt h_pcmpistri_3B ( V128* argL, V128* argR )
+{
+   V128 block[2];
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"  /* presumably steps over the red zone before pushfq */
+      "movdqu    0(%2),  %%xmm2"            "\n\t"  /* xmm2  = copy of *argL */
+      "movdqu    16(%2), %%xmm11"           "\n\t"  /* xmm11 = copy of *argR */
+      "pcmpistri $0x3B,  %%xmm2, %%xmm11"   "\n\t"
+      "pushfq"                              "\n\t"
+      "popq      %%rdx"                     "\n\t"  /* rdx = flags after the compare */
+      "movq      %%rcx,  %0"                "\n\t"
+      "movq      %%rdx,  %1"                "\n\t"
+      "addq      $1024,  %%rsp"             "\n\t"
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);  /* OSZACP bits above, low 16 bits of rcx below */
+}
+
+UInt s_pcmpistri_3B ( V128* argLU, V128* argRU )
+{
+   /* C model of "pcmpistri $0x3B": OSZACP shifted left by 16, OR'd
+      with the computed ECX value. */
+   V128 out;
+   UInt flagsOSZACP;
+   UInt zeroesL = zmask_from_V128(argLU);
+   UInt zeroesR = zmask_from_V128(argRU);
+   Bool handled = pcmpXstrX_WRK_wide( &out, &flagsOSZACP, argLU, argRU,
+                                      zeroesL, zeroesR,
+                                      0x3B, False/*!isSTRM*/ );
+   assert(handled);
+   return (flagsOSZACP << 16) | out.uInt[0];
+}
+
+void istri_3B ( void )
+{
+   char* wot = "3B";                          /* label printed on every result line */
+   UInt(*h)(V128*,V128*) = h_pcmpistri_3B;    /* runs the real instruction */
+   UInt(*s)(V128*,V128*) = s_pcmpistri_3B;    /* runs the C model */
+   /* Each 16-digit summary is expanded to a 128-bit operand by try_istri. */
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
+
+   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
+   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
+}
+
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_0D                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+/* Executes the real "pcmpistri $0x0D" and packs the flags and RCX. */
+__attribute__((noinline))
+UInt h_pcmpistri_0D ( V128* argL, V128* argR )
+{
+   V128 block[2];
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res = 0, flags = 0;
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"  /* as in the sibling h_* helpers: keep pushfq out of the red zone */
+      "movdqu    0(%2),  %%xmm2"            "\n\t"  /* unaligned loads: block has no explicit 16-byte alignment */
+      "movdqu    16(%2), %%xmm11"           "\n\t"
+      "pcmpistri $0x0D,  %%xmm2, %%xmm11"   "\n\t"
+      "pushfq"                              "\n\t"
+      "popq      %%rdx"                     "\n\t"  /* rdx = flags after the compare */
+      "movq      %%rcx,  %0"                "\n\t"
+      "movq      %%rdx,  %1"                "\n\t"
+      "addq      $1024,  %%rsp"             "\n\t"
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);  /* OSZACP bits above, low 16 bits of rcx below */
+}
+
+UInt s_pcmpistri_0D ( V128* argLU, V128* argRU )
+{
+   /* C model of "pcmpistri $0x0D": OSZACP shifted left by 16, OR'd
+      with the computed ECX value. */
+   V128 out;
+   UInt flagsOSZACP;
+   UInt zeroesL = zmask_from_V128(argLU);
+   UInt zeroesR = zmask_from_V128(argRU);
+   Bool handled = pcmpXstrX_WRK_wide( &out, &flagsOSZACP, argLU, argRU,
+                                      zeroesL, zeroesR,
+                                      0x0D, False/*!isSTRM*/ );
+   assert(handled);
+   return (flagsOSZACP << 16) | out.uInt[0];
+}
+
+void istri_0D ( void )
+{
+   char* wot = "0D";                          /* label printed on every result line */
+   UInt(*h)(V128*,V128*) = h_pcmpistri_0D;    /* runs the real instruction */
+   UInt(*s)(V128*,V128*) = s_pcmpistri_0D;    /* runs the C model */
+   /* Each 16-digit summary is expanded to a 128-bit operand by try_istri. */
+   try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
+
+   try_istri(wot,h,s, "11111111abcdef11", "00abcdef00abcdef");
+
+   try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
+   try_istri(wot,h,s, "1111111111abcdef", "0000000000abcdef");
+   try_istri(wot,h,s, "111111111111abcd", "0000000000abcdef");
+
+   try_istri(wot,h,s, "1111abcd11abcd11", "000000000000abcd");
+
+   try_istri(wot,h,s, "11abcd1111abcd11", "000000000000abcd");
+   try_istri(wot,h,s, "abcd111111abcd11", "000000000000abcd");
+   try_istri(wot,h,s, "cd11111111abcd11", "000000000000abcd");
+
+   try_istri(wot,h,s, "01abcd11abcd1111", "000000000000abcd");
+   try_istri(wot,h,s, "00abcd11abcd1111", "000000000000abcd");
+   try_istri(wot,h,s, "0000cd11abcd1111", "000000000000abcd");
+
+   try_istri(wot,h,s, "00abcd1100abcd11", "000000000000abcd");
+   try_istri(wot,h,s, "00abcd110000cd11", "000000000000abcd");
+
+   try_istri(wot,h,s, "1111111111111234", "0000000000000000");
+   try_istri(wot,h,s, "1111111111111234", "0000000000000011");
+   try_istri(wot,h,s, "1111111111111234", "0000000000001111");
+
+   try_istri(wot,h,s, "1111111111111234", "1111111111111234");
+   try_istri(wot,h,s, "0a11111111111111", "000000000000000a");
+   try_istri(wot,h,s, "0b11111111111111", "000000000000000a");
+}
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_09                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+/* Executes the real "pcmpistri $0x09" and packs the flags and RCX. */
+UInt h_pcmpistri_09 ( V128* argL, V128* argR )
+{
+   V128 block[2];
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"  /* presumably steps over the red zone before pushfq */
+      "movdqu    0(%2),  %%xmm2"            "\n\t"  /* xmm2  = copy of *argL */
+      "movdqu    16(%2), %%xmm11"           "\n\t"  /* xmm11 = copy of *argR */
+      "pcmpistri $0x09,  %%xmm2, %%xmm11"   "\n\t"
+      "pushfq"                              "\n\t"
+      "popq      %%rdx"                     "\n\t"  /* rdx = flags after the compare */
+      "movq      %%rcx,  %0"                "\n\t"
+      "movq      %%rdx,  %1"                "\n\t"
+      "addq      $1024,  %%rsp"             "\n\t"
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);  /* OSZACP bits above, low 16 bits of rcx below */
+}
+
+UInt s_pcmpistri_09 ( V128* argLU, V128* argRU )
+{
+   /* C model of "pcmpistri $0x09": OSZACP shifted left by 16, OR'd
+      with the computed ECX value. */
+   V128 out;
+   UInt flagsOSZACP;
+   UInt zeroesL = zmask_from_V128(argLU);
+   UInt zeroesR = zmask_from_V128(argRU);
+   Bool handled = pcmpXstrX_WRK_wide( &out, &flagsOSZACP, argLU, argRU,
+                                      zeroesL, zeroesR,
+                                      0x09, False/*!isSTRM*/ );
+   assert(handled);
+   return (flagsOSZACP << 16) | out.uInt[0];
+}
+
+void istri_09 ( void )
+{
+   char* wot = "09";                          /* label printed on every result line */
+   UInt(*h)(V128*,V128*) = h_pcmpistri_09;    /* runs the real instruction */
+   UInt(*s)(V128*,V128*) = s_pcmpistri_09;    /* runs the C model */
+   /* Each 16-digit summary is expanded to a 128-bit operand by try_istri. */
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
+
+   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
+   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
+}
+
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_1B                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+/* Executes the real "pcmpistri $0x1B" and packs the flags and RCX. */
+UInt h_pcmpistri_1B ( V128* argL, V128* argR )
+{
+   V128 block[2];
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"  /* presumably steps over the red zone before pushfq */
+      "movdqu    0(%2),  %%xmm2"            "\n\t"  /* xmm2  = copy of *argL */
+      "movdqu    16(%2), %%xmm11"           "\n\t"  /* xmm11 = copy of *argR */
+      "pcmpistri $0x1B,  %%xmm2, %%xmm11"   "\n\t"
+      "pushfq"                              "\n\t"
+      "popq      %%rdx"                     "\n\t"  /* rdx = flags after the compare */
+      "movq      %%rcx,  %0"                "\n\t"
+      "movq      %%rdx,  %1"                "\n\t"
+      "addq      $1024,  %%rsp"             "\n\t"
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);  /* OSZACP bits above, low 16 bits of rcx below */
+}
+
+UInt s_pcmpistri_1B ( V128* argLU, V128* argRU )
+{
+   /* C model of "pcmpistri $0x1B": OSZACP shifted left by 16, OR'd
+      with the computed ECX value. */
+   V128 out;
+   UInt flagsOSZACP;
+   UInt zeroesL = zmask_from_V128(argLU);
+   UInt zeroesR = zmask_from_V128(argRU);
+   Bool handled = pcmpXstrX_WRK_wide( &out, &flagsOSZACP, argLU, argRU,
+                                      zeroesL, zeroesR,
+                                      0x1B, False/*!isSTRM*/ );
+   assert(handled);
+   return (flagsOSZACP << 16) | out.uInt[0];
+}
+
+void istri_1B ( void )
+{
+   char* wot = "1B";                          /* label printed on every result line */
+   UInt(*h)(V128*,V128*) = h_pcmpistri_1B;    /* runs the real instruction */
+   UInt(*s)(V128*,V128*) = s_pcmpistri_1B;    /* runs the C model */
+   /* Each 16-digit summary is expanded to a 128-bit operand by try_istri. */
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
+
+   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
+   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
+}
+
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_03                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+/* Executes the real "pcmpistri $0x03" and packs the flags and RCX. */
+UInt h_pcmpistri_03 ( V128* argL, V128* argR )
+{
+   V128 block[2];
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"  /* presumably steps over the red zone before pushfq */
+      "movdqu    0(%2),  %%xmm2"            "\n\t"  /* xmm2  = copy of *argL */
+      "movdqu    16(%2), %%xmm11"           "\n\t"  /* xmm11 = copy of *argR */
+      "pcmpistri $0x03,  %%xmm2, %%xmm11"   "\n\t"
+      /* (for manual experiments, "pcmpistrm $0x03" followed by
+         "movd %xmm0, %ecx" can be swapped in for the line above) */
+      "pushfq"                              "\n\t"
+      "popq      %%rdx"                     "\n\t"  /* rdx = flags after the compare */
+      "movq      %%rcx,  %0"                "\n\t"
+      "movq      %%rdx,  %1"                "\n\t"
+      "addq      $1024,  %%rsp"             "\n\t"
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);  /* OSZACP bits above, low 16 bits of rcx below */
+}
+
+UInt s_pcmpistri_03 ( V128* argLU, V128* argRU )
+{
+   /* C model of "pcmpistri $0x03": OSZACP shifted left by 16, OR'd
+      with the computed ECX value. */
+   V128 out;
+   UInt flagsOSZACP;
+   UInt zeroesL = zmask_from_V128(argLU);
+   UInt zeroesR = zmask_from_V128(argRU);
+   Bool handled = pcmpXstrX_WRK_wide( &out, &flagsOSZACP, argLU, argRU,
+                                      zeroesL, zeroesR,
+                                      0x03, False/*!isSTRM*/ );
+   assert(handled);
+   return (flagsOSZACP << 16) | out.uInt[0];
+}
+
+void istri_03 ( void )
+{
+   char* wot = "03";                          /* label printed on every result line */
+   UInt(*h)(V128*,V128*) = h_pcmpistri_03;    /* runs the real instruction */
+   UInt(*s)(V128*,V128*) = s_pcmpistri_03;    /* runs the C model */
+   /* Each 16-digit summary is expanded to a 128-bit operand by try_istri. */
+   try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
+   try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
+   try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
+   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
+
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
+   try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
+   try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
+   try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
+   try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
+
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
+
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
+   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
+   try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
+   try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
+
+   try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
+
+   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
+   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
+}
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_13                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_13 ( V128* argL, V128* argR )  /* executes pcmpistri $0x13 via inline asm; returns (masked rflags << 16) | low 16 bits of rcx */
+{
+   V128 block[2];  /* both operands side by side so the asm needs only one base register */
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;  /* res receives rcx, flags receives rflags, both read after the instruction */
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"  // lower rsp by 1024 bytes; presumably so pushfq cannot overwrite data kept below rsp (red zone) -- TODO confirm
+      "movdqu    0(%2),  %%xmm2"            "\n\t"  // xmm2  <- block[0] (copy of *argL)
+      "movdqu    16(%2), %%xmm11"           "\n\t"  // xmm11 <- block[1] (copy of *argR)
+      "pcmpistri $0x13,  %%xmm2, %%xmm11"   "\n\t"  // instruction under test
+//"pcmpistrm $0x13, %%xmm2, %%xmm11"   "\n\t"
+//"movd %%xmm0, %%ecx" "\n\t"
+      "pushfq"                              "\n\t"  // push rflags ...
+      "popq      %%rdx"                     "\n\t"  // ... and pop them into rdx
+      "movq      %%rcx,  %0"                "\n\t"  // res   <- rcx
+      "movq      %%rdx,  %1"                "\n\t"  // flags <- rdx
+      "addq      $1024,  %%rsp"             "\n\t"  // undo the subq above
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);  /* 0x8D5 keeps rflags bits 0, 2, 4, 6, 7 and 11 */
+}
+
+UInt s_pcmpistri_13 ( V128* argLU, V128* argRU )  /* same imm8 as h_pcmpistri_13, but the result comes from pcmpXstrX_WRK_wide instead of the instruction */
+{
+   V128 resV;
+   UInt resOSZACP, resECX;
+   Bool ok
+      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
+			    zmask_from_V128(argLU),
+			    zmask_from_V128(argRU),
+			    0x13, False/*!isSTRM*/  /* imm8 matches the asm variant; False selects the non-STRM path */
+        );
+   assert(ok);  /* helper reported failure; presumably means this imm8 is not handled -- confirm against the helper */
+   resECX = resV.uInt[0];  /* first 32-bit lane of the result vector is used as the ECX value */
+   return (resOSZACP << 16) | resECX;  /* flags in the upper half, ECX value in the lower half */
+}
+
+void istri_13 ( void )  /* passes each operand pair below to try_istri together with the 0x13 h_/s_ pair */
+{
+   char* wot = "13";  /* imm8 label; the expected-output file shows it as the "istri 13" prefix */
+   UInt(*h)(V128*,V128*) = h_pcmpistri_13;  /* variant that executes the instruction */
+   UInt(*s)(V128*,V128*) = s_pcmpistri_13;  /* variant that calls pcmpXstrX_WRK_wide */
+
+   try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");  /* operand list is identical to the one in istri_03 */
+   try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
+   try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
+   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
+
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
+   try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
+   try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
+   try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
+   try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
+
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");  /* same operands as the first case of the previous group */
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
+
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
+   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
+   try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
+   try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
+
+   try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
+
+   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
+   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
+}
+
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_45                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_45 ( V128* argL, V128* argR )  /* executes pcmpistri $0x45 via inline asm; returns (masked rflags << 16) | low 16 bits of rcx */
+{
+   V128 block[2];  /* both operands side by side so the asm needs only one base register */
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;  /* res receives rcx, flags receives rflags, both read after the instruction */
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"  // lower rsp by 1024 bytes; presumably so pushfq cannot overwrite data kept below rsp (red zone) -- TODO confirm
+      "movdqu    0(%2),  %%xmm2"            "\n\t"  // xmm2  <- block[0] (copy of *argL)
+      "movdqu    16(%2), %%xmm11"           "\n\t"  // xmm11 <- block[1] (copy of *argR)
+      "pcmpistri $0x45,  %%xmm2, %%xmm11"   "\n\t"  // instruction under test
+//"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"  // NOTE(review): this disabled line uses imm8 0x04, not the 0x45 used above -- confirm intent before enabling
+//"movd %%xmm0, %%ecx" "\n\t"
+      "pushfq"                              "\n\t"  // push rflags ...
+      "popq      %%rdx"                     "\n\t"  // ... and pop them into rdx
+      "movq      %%rcx,  %0"                "\n\t"  // res   <- rcx
+      "movq      %%rdx,  %1"                "\n\t"  // flags <- rdx
+      "addq      $1024,  %%rsp"             "\n\t"  // undo the subq above
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);  /* 0x8D5 keeps rflags bits 0, 2, 4, 6, 7 and 11 */
+}
+
+UInt s_pcmpistri_45 ( V128* argLU, V128* argRU )  /* same imm8 as h_pcmpistri_45, but the result comes from pcmpXstrX_WRK_wide instead of the instruction */
+{
+   V128 resV;
+   UInt resOSZACP, resECX;
+   Bool ok
+      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
+			    zmask_from_V128(argLU),
+			    zmask_from_V128(argRU),
+			    0x45, False/*!isSTRM*/  /* imm8 matches the asm variant; False selects the non-STRM path */
+        );
+   assert(ok);  /* helper reported failure; presumably means this imm8 is not handled -- confirm against the helper */
+   resECX = resV.uInt[0];  /* first 32-bit lane of the result vector is used as the ECX value */
+   return (resOSZACP << 16) | resECX;  /* flags in the upper half, ECX value in the lower half */
+}
+
+void istri_45 ( void )  /* passes each operand pair below to try_istri together with the 0x45 h_/s_ pair */
+{
+   char* wot = "45";  /* imm8 label; the expected-output file shows it as the "istri 45" prefix */
+   UInt(*h)(V128*,V128*) = h_pcmpistri_45;  /* variant that executes the instruction */
+   UInt(*s)(V128*,V128*) = s_pcmpistri_45;  /* variant that calls pcmpXstrX_WRK_wide */
+
+   try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000bbcc");
+   try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000ccbb");
+   try_istri(wot,h,s, "baaabbbbccccdddd", "000000000000ccbb");
+   try_istri(wot,h,s, "baaabbbbccccdddc", "000000000000ccbb");
+
+   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
+   try_istri(wot,h,s, "bbbbbbbb00bbbbbb", "000000000000ccbb");
+   try_istri(wot,h,s, "bbbbbbbbbbbb00bb", "000000000000ccbb");
+   try_istri(wot,h,s, "bbbbbbbbbbbbbb00", "000000000000ccbb");
+   try_istri(wot,h,s, "0000000000000000", "000000000000ccbb");
+
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+
+   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");  /* repeats the first case of the second group */
+   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000bb");
+   try_istri(wot,h,s, "bb44bb44bb44bb44", "000000006622ccbb");
+
+   try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000022ccbb");
+   try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000000ccbb");
+   try_istri(wot,h,s, "bb44bb44bb44bb44", "00000000000000bb");
+
+   try_istri(wot,h,s, "0011223344556677", "0000997755442211");
+   try_istri(wot,h,s, "1122334455667711", "0000997755442211");
+
+   try_istri(wot,h,s, "0011223344556677", "0000aa8866553322");
+   try_istri(wot,h,s, "1122334455667711", "0000aa8866553322");
+}
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_01                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_01 ( V128* argL, V128* argR )  /* executes pcmpistri $0x01 via inline asm; returns (masked rflags << 16) | low 16 bits of rcx */
+{
+   V128 block[2];  /* both operands side by side so the asm needs only one base register */
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;  /* res receives rcx, flags receives rflags, both read after the instruction */
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"  // lower rsp by 1024 bytes; presumably so pushfq cannot overwrite data kept below rsp (red zone) -- TODO confirm
+      "movdqu    0(%2),  %%xmm2"            "\n\t"  // xmm2  <- block[0] (copy of *argL)
+      "movdqu    16(%2), %%xmm11"           "\n\t"  // xmm11 <- block[1] (copy of *argR)
+      "pcmpistri $0x01,  %%xmm2, %%xmm11"   "\n\t"  // instruction under test
+//"pcmpistrm $0x01, %%xmm2, %%xmm11"   "\n\t"
+//"movd %%xmm0, %%ecx" "\n\t"
+      "pushfq"                              "\n\t"  // push rflags ...
+      "popq      %%rdx"                     "\n\t"  // ... and pop them into rdx
+      "movq      %%rcx,  %0"                "\n\t"  // res   <- rcx
+      "movq      %%rdx,  %1"                "\n\t"  // flags <- rdx
+      "addq      $1024,  %%rsp"             "\n\t"  // undo the subq above
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);  /* 0x8D5 keeps rflags bits 0, 2, 4, 6, 7 and 11 */
+}
+
+UInt s_pcmpistri_01 ( V128* argLU, V128* argRU )  /* same imm8 as h_pcmpistri_01, but the result comes from pcmpXstrX_WRK_wide instead of the instruction */
+{
+   V128 resV;
+   UInt resOSZACP, resECX;
+   Bool ok
+      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
+			    zmask_from_V128(argLU),
+			    zmask_from_V128(argRU),
+			    0x01, False/*!isSTRM*/  /* imm8 matches the asm variant; False selects the non-STRM path */
+        );
+   assert(ok);  /* helper reported failure; presumably means this imm8 is not handled -- confirm against the helper */
+   resECX = resV.uInt[0];  /* first 32-bit lane of the result vector is used as the ECX value */
+   return (resOSZACP << 16) | resECX;  /* flags in the upper half, ECX value in the lower half */
+}
+
+void istri_01 ( void )  /* passes each operand pair below to try_istri together with the 0x01 h_/s_ pair */
+{
+   char* wot = "01";  /* imm8 label; the expected-output file shows it as the "istri 01" prefix */
+   UInt(*h)(V128*,V128*) = h_pcmpistri_01;  /* variant that executes the instruction */
+   UInt(*s)(V128*,V128*) = s_pcmpistri_01;  /* variant that calls pcmpXstrX_WRK_wide */
+
+   try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");  /* operand list is identical to the one in istri_03 */
+   try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
+   try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
+   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
+
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
+   try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
+   try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
+   try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
+   try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
+
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");  /* same operands as the first case of the previous group */
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
+   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
+
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
+   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
+   try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
+   try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
+
+   try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
+
+   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
+   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
+}
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_39                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_39 ( V128* argL, V128* argR )  /* executes pcmpistri $0x39 via inline asm; returns (masked rflags << 16) | low 16 bits of rcx */
+{
+   V128 block[2];  /* both operands side by side so the asm needs only one base register */
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;  /* res receives rcx, flags receives rflags, both read after the instruction */
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"  // lower rsp by 1024 bytes; presumably so pushfq cannot overwrite data kept below rsp (red zone) -- TODO confirm
+      "movdqu    0(%2),  %%xmm2"            "\n\t"  // xmm2  <- block[0] (copy of *argL)
+      "movdqu    16(%2), %%xmm11"           "\n\t"  // xmm11 <- block[1] (copy of *argR)
+      "pcmpistri $0x39,  %%xmm2, %%xmm11"   "\n\t"  // instruction under test
+      "pushfq"                              "\n\t"  // push rflags ...
+      "popq      %%rdx"                     "\n\t"  // ... and pop them into rdx
+      "movq      %%rcx,  %0"                "\n\t"  // res   <- rcx
+      "movq      %%rdx,  %1"                "\n\t"  // flags <- rdx
+      "addq      $1024,  %%rsp"             "\n\t"  // undo the subq above
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);  /* 0x8D5 keeps rflags bits 0, 2, 4, 6, 7 and 11 */
+}
+
+UInt s_pcmpistri_39 ( V128* argLU, V128* argRU )  /* same imm8 as h_pcmpistri_39, but the result comes from pcmpXstrX_WRK_wide instead of the instruction */
+{
+   V128 resV;
+   UInt resOSZACP, resECX;
+   Bool ok
+      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
+			    zmask_from_V128(argLU),
+			    zmask_from_V128(argRU),
+			    0x39, False/*!isSTRM*/  /* imm8 matches the asm variant; False selects the non-STRM path */
+        );
+   assert(ok);  /* helper reported failure; presumably means this imm8 is not handled -- confirm against the helper */
+   resECX = resV.uInt[0];  /* first 32-bit lane of the result vector is used as the ECX value */
+   return (resOSZACP << 16) | resECX;  /* flags in the upper half, ECX value in the lower half */
+}
+
+void istri_39 ( void )  /* passes each operand pair below to try_istri together with the 0x39 h_/s_ pair */
+{
+   char* wot = "39";  /* imm8 label; the expected-output file shows it as the "istri 39" prefix */
+   UInt(*h)(V128*,V128*) = h_pcmpistri_39;  /* variant that executes the instruction */
+   UInt(*s)(V128*,V128*) = s_pcmpistri_39;  /* variant that calls pcmpXstrX_WRK_wide */
+
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");  /* one differing digit in the left operand */
+   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");  /* one differing digit in the right operand */
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");  /* repeats the last case of the previous group */
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");  /* from here on: operands containing 00 digit pairs */
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
+
+   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
+   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
+   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
+}
+
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                         main                         //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+int main ( void )  /* runs every imm8 test group once, in a fixed order */
+{
+   istri_4B();  /* call order must stay in sync with pcmpstr64w.stdout.exp, which lists 4B, 3B, 09, 1B, 03, 0D, 13, 45, 01, 39 */
+   istri_3B();
+   istri_09();
+   istri_1B();
+   istri_03();
+   istri_0D();
+   istri_13();
+   istri_45();
+   istri_01();
+   istri_39();
+   return 0;
+}
diff --git a/none/tests/amd64/pcmpstr64w.stderr.exp b/none/tests/amd64/pcmpstr64w.stderr.exp
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/none/tests/amd64/pcmpstr64w.stderr.exp
diff --git a/none/tests/amd64/pcmpstr64w.stdout.exp b/none/tests/amd64/pcmpstr64w.stdout.exp
new file mode 100644
index 0000000..358b82e
--- /dev/null
+++ b/none/tests/amd64/pcmpstr64w.stdout.exp
@@ -0,0 +1,256 @@
+istri 4B  0000000000000000 0000000000000000 -> 08c10007 08c10007 
+istri 4B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010007 08010007 
+istri 4B  aaaa2aaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010007 08010007 
+istri 4B  aaaaaaaaa2aaaaaa aaaaaaaaaaaaaaaa -> 08010007 08010007 
+istri 4B  aaaaaaaaaaaaa2aa aaaaaaaaaaaaaaaa -> 08010007 08010007 
+istri 4B  aaaaaaaaaaaaaaaa aaaa2aaaaaaaaaaa -> 08010007 08010007 
+istri 4B  aaaaaaaaaaaaaaaa aaaaaaaaa2aaaaaa -> 08010007 08010007 
+istri 4B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaa2a -> 00010007 00010007 
+istri 4B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010007 08010007 
+istri 4B  baaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010006 08010006 
+istri 4B  b9aaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010006 08010006 
+istri 4B  b9baaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010005 08010005 
+istri 4B  b9baaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010005 08010005 
+istri 4B  b9baaaaaaaaaaaaa aaaaaaaaaaaa7aaa -> 08010005 08010005 
+istri 4B  b9baaaaaaaaaaaaa aaaaaaaa2aaa4aaa -> 08010005 08010005 
+istri 4B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010007 08010007 
+istri 4B  aaaaaaaaaaaa00aa aaaaaaaaaaaaaaaa -> 08410000 08410000 
+istri 4B  aaaaaaaaaaaaaaaa aaaaaaaaaaaa00aa -> 08810000 08810000 
+istri 4B  aaaaaaaaaaaa00aa aaaaaaaaaaaa00aa -> 08c10007 08c10007 
+istri 4B  aaaaaaaa00aaaaaa aaaaaaaaaaaaaaaa -> 08410002 08410002 
+istri 4B  aaaaaaaaaaaaaaaa aaaaaaaaaaaa00aa -> 08810000 08810000 
+istri 4B  aaaaaaaa00aaaaaa aaaaaaaaaaaa00aa -> 08c10007 08c10007 
+istri 4B  aaaaaaaaaaaa00aa aaaaaaaaaaaaaaaa -> 08410000 08410000 
+istri 4B  aaaaaaaaaaaaaaaa aaaaaaaa00aaaaaa -> 08810002 08810002 
+istri 4B  aaaaaaaaaaaa00aa aaaaaaaa00aaaaaa -> 08c10007 08c10007 
+istri 4B  0000000000000000 aaaaaaaa00aaaaaa -> 00c10007 00c10007 
+istri 4B  8000000000000000 aaaaaaaa00aaaaaa -> 00c10007 00c10007 
+istri 4B  0000000000000001 aaaaaaaa00aaaaaa -> 00c10007 00c10007 
+istri 4B  0000000000000000 aaaaaaaaaaaaaaaa -> 00400008 00400008 
+istri 4B  aaaaaaaaaaaaaaaa 0000000000000000 -> 00800008 00800008 
+istri 3B  0000000000000000 0000000000000000 -> 08c10000 08c10000 
+istri 3B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00000008 00000008 
+istri 3B  aaaa2aaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010005 00010005 
+istri 3B  aaaaaaaaa2aaaaaa aaaaaaaaaaaaaaaa -> 00010003 00010003 
+istri 3B  aaaaaaaaaaaaa2aa aaaaaaaaaaaaaaaa -> 00010001 00010001 
+istri 3B  aaaaaaaaaaaaaaaa aaaa2aaaaaaaaaaa -> 00010005 00010005 
+istri 3B  aaaaaaaaaaaaaaaa aaaaaaaaa2aaaaaa -> 00010003 00010003 
+istri 3B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaa2a -> 08010000 08010000 
+istri 3B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00000008 00000008 
+istri 3B  baaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010007 00010007 
+istri 3B  b9aaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010007 00010007 
+istri 3B  b9baaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010006 00010006 
+istri 3B  b9baaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010006 00010006 
+istri 3B  b9baaaaaaaaaaaaa aaaaaaaaaaaa7aaa -> 00010001 00010001 
+istri 3B  b9baaaaaaaaaaaaa aaaaaaaa2aaa4aaa -> 00010001 00010001 
+istri 3B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00000008 00000008 
+istri 3B  aaaaaaaaaaaa00aa aaaaaaaaaaaaaaaa -> 00400008 00400008 
+istri 3B  aaaaaaaaaaaaaaaa aaaaaaaaaaaa00aa -> 00810001 00810001 
+istri 3B  aaaaaaaaaaaa00aa aaaaaaaaaaaa00aa -> 00c10001 00c10001 
+istri 3B  aaaaaaaa00aaaaaa aaaaaaaaaaaaaaaa -> 00400008 00400008 
+istri 3B  aaaaaaaaaaaaaaaa aaaaaaaaaaaa00aa -> 00810001 00810001 
+istri 3B  aaaaaaaa00aaaaaa aaaaaaaaaaaa00aa -> 00c10001 00c10001 
+istri 3B  aaaaaaaaaaaa00aa aaaaaaaaaaaaaaaa -> 00400008 00400008 
+istri 3B  aaaaaaaaaaaaaaaa aaaaaaaa00aaaaaa -> 00810003 00810003 
+istri 3B  aaaaaaaaaaaa00aa aaaaaaaa00aaaaaa -> 00c10003 00c10003 
+istri 3B  0000000000000000 aaaaaaaa00aaaaaa -> 00c10003 00c10003 
+istri 3B  8000000000000000 aaaaaaaa00aaaaaa -> 00c10003 00c10003 
+istri 3B  0000000000000001 aaaaaaaa00aaaaaa -> 08c10000 08c10000 
+istri 3B  0000000000000000 aaaaaaaaaaaaaaaa -> 00400008 00400008 
+istri 3B  aaaaaaaaaaaaaaaa 0000000000000000 -> 08810000 08810000 
+istri 09  0000000000000000 0000000000000000 -> 08c10000 08c10000 
+istri 09  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 09  aaaa2aaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 09  aaaaaaaaa2aaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 09  aaaaaaaaaaaaa2aa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 09  aaaaaaaaaaaaaaaa aaaa2aaaaaaaaaaa -> 08010000 08010000 
+istri 09  aaaaaaaaaaaaaaaa aaaaaaaaa2aaaaaa -> 08010000 08010000 
+istri 09  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaa2a -> 00010001 00010001 
+istri 09  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 09  baaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 09  b9aaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 09  b9baaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 09  b9baaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 09  b9baaaaaaaaaaaaa aaaaaaaaaaaa7aaa -> 08010000 08010000 
+istri 09  b9baaaaaaaaaaaaa aaaaaaaa2aaa4aaa -> 08010000 08010000 
+istri 09  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 09  aaaaaaaaaaaa00aa aaaaaaaaaaaaaaaa -> 08410000 08410000 
+istri 09  aaaaaaaaaaaaaaaa aaaaaaaaaaaa00aa -> 08810000 08810000 
+istri 09  aaaaaaaaaaaa00aa aaaaaaaaaaaa00aa -> 08c10000 08c10000 
+istri 09  aaaaaaaa00aaaaaa aaaaaaaaaaaaaaaa -> 08410000 08410000 
+istri 09  aaaaaaaaaaaaaaaa aaaaaaaaaaaa00aa -> 08810000 08810000 
+istri 09  aaaaaaaa00aaaaaa aaaaaaaaaaaa00aa -> 08c10000 08c10000 
+istri 09  aaaaaaaaaaaa00aa aaaaaaaaaaaaaaaa -> 08410000 08410000 
+istri 09  aaaaaaaaaaaaaaaa aaaaaaaa00aaaaaa -> 08810000 08810000 
+istri 09  aaaaaaaaaaaa00aa aaaaaaaa00aaaaaa -> 08c10000 08c10000 
+istri 09  0000000000000000 aaaaaaaa00aaaaaa -> 00c10003 00c10003 
+istri 09  8000000000000000 aaaaaaaa00aaaaaa -> 00c10003 00c10003 
+istri 09  0000000000000001 aaaaaaaa00aaaaaa -> 00c10003 00c10003 
+istri 09  0000000000000000 aaaaaaaaaaaaaaaa -> 00400008 00400008 
+istri 09  aaaaaaaaaaaaaaaa 0000000000000000 -> 00800008 00800008 
+istri 1B  0000000000000000 0000000000000000 -> 00c00008 00c00008 
+istri 1B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00000008 00000008 
+istri 1B  aaaa2aaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010005 00010005 
+istri 1B  aaaaaaaaa2aaaaaa aaaaaaaaaaaaaaaa -> 00010003 00010003 
+istri 1B  aaaaaaaaaaaaa2aa aaaaaaaaaaaaaaaa -> 00010001 00010001 
+istri 1B  aaaaaaaaaaaaaaaa aaaa2aaaaaaaaaaa -> 00010005 00010005 
+istri 1B  aaaaaaaaaaaaaaaa aaaaaaaaa2aaaaaa -> 00010003 00010003 
+istri 1B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaa2a -> 08010000 08010000 
+istri 1B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00000008 00000008 
+istri 1B  baaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010007 00010007 
+istri 1B  b9aaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010007 00010007 
+istri 1B  b9baaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010006 00010006 
+istri 1B  b9baaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010006 00010006 
+istri 1B  b9baaaaaaaaaaaaa aaaaaaaaaaaa7aaa -> 00010001 00010001 
+istri 1B  b9baaaaaaaaaaaaa aaaaaaaa2aaa4aaa -> 00010001 00010001 
+istri 1B  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00000008 00000008 
+istri 1B  aaaaaaaaaaaa00aa aaaaaaaaaaaaaaaa -> 00410001 00410001 
+istri 1B  aaaaaaaaaaaaaaaa aaaaaaaaaaaa00aa -> 00810001 00810001 
+istri 1B  aaaaaaaaaaaa00aa aaaaaaaaaaaa00aa -> 00c00008 00c00008 
+istri 1B  aaaaaaaa00aaaaaa aaaaaaaaaaaaaaaa -> 00410003 00410003 
+istri 1B  aaaaaaaaaaaaaaaa aaaaaaaaaaaa00aa -> 00810001 00810001 
+istri 1B  aaaaaaaa00aaaaaa aaaaaaaaaaaa00aa -> 00c10001 00c10001 
+istri 1B  aaaaaaaaaaaa00aa aaaaaaaaaaaaaaaa -> 00410001 00410001 
+istri 1B  aaaaaaaaaaaaaaaa aaaaaaaa00aaaaaa -> 00810003 00810003 
+istri 1B  aaaaaaaaaaaa00aa aaaaaaaa00aaaaaa -> 00c10001 00c10001 
+istri 1B  0000000000000000 aaaaaaaa00aaaaaa -> 08c10000 08c10000 
+istri 1B  8000000000000000 aaaaaaaa00aaaaaa -> 08c10000 08c10000 
+istri 1B  0000000000000001 aaaaaaaa00aaaaaa -> 08c10000 08c10000 
+istri 1B  0000000000000000 aaaaaaaaaaaaaaaa -> 08410000 08410000 
+istri 1B  aaaaaaaaaaaaaaaa 0000000000000000 -> 08810000 08810000 
+istri 03  aacdacbdaacdaacd 00000000000000aa -> 00810001 00810001 
+istri 03  aabbaabbaabbaabb 00000000000000bb -> 08810000 08810000 
+istri 03  aabbccddaabbccdd 000000000000aabb -> 00810002 00810002 
+istri 03  abcdabc0abcdabcd 000000000000abcd -> 08810000 08810000 
+istri 03  aabbccddaabbccdd 00000000aabbccdd -> 08810000 08810000 
+istri 03  00bbccddaabbccdd 00000000aabbccdd -> 08c10000 08c10000 
+istri 03  aabbccddaa00ccdd 00000000aabbccdd -> 08c10000 08c10000 
+istri 03  aabbccddaabb00dd 00000000aabbccdd -> 08c10000 08c10000 
+istri 03  aabbccddaabbcc00 00000000aabbccdd -> 00c00008 00c00008 
+istri 03  aabbccddaabbccdd 00000000aabbccdd -> 08810000 08810000 
+istri 03  aabbccddaabbccdd 00000000aa00ccdd -> 08810000 08810000 
+istri 03  aabbccddaabbccdd 00000000aabb00dd -> 08810000 08810000 
+istri 03  aabbccddaabbccdd 00000000aabbcc00 -> 00800008 00800008 
+istri 03  0000000000000000 0000000000000000 -> 00c00008 00c00008 
+istri 03  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 03  0000abcdabcdabcd 000000000000abcd -> 08c10000 08c10000 
+istri 03  0000abcdabcdabcd 000000000000dcba -> 00c00008 00c00008 
+istri 03  0000aabbaabbaabb 000000000000bbbb -> 08c10000 08c10000 
+istri 03  0000ccddaabbccdd 00000000bbaabbaa -> 00c10002 00c10002 
+istri 03  0000ccddaabbccdd 000000bbaabbaa00 -> 00c00008 00c00008 
+istri 03  0ddc0ffeebadf00d 00000000cafebabe -> 00810004 00810004 
+istri 03  0ddc0ffeebadfeed 00000000cafebabe -> 00810001 00810001 
+istri 0D  11111111abcdef11 0000000000abcdef -> 00810001 00810001 
+istri 0D  11111111abcdef11 00abcdef00abcdef -> 00810001 00810001 
+istri 0D  11111111abcdef11 0000000000abcdef -> 00810001 00810001 
+istri 0D  1111111111abcdef 0000000000abcdef -> 08810000 08810000 
+istri 0D  111111111111abcd 0000000000abcdef -> 00800008 00800008 
+istri 0D  1111abcd11abcd11 000000000000abcd -> 00810001 00810001 
+istri 0D  11abcd1111abcd11 000000000000abcd -> 00810001 00810001 
+istri 0D  abcd111111abcd11 000000000000abcd -> 00810001 00810001 
+istri 0D  cd11111111abcd11 000000000000abcd -> 00810001 00810001 
+istri 0D  01abcd11abcd1111 000000000000abcd -> 00810002 00810002 
+istri 0D  00abcd11abcd1111 000000000000abcd -> 00c10002 00c10002 
+istri 0D  0000cd11abcd1111 000000000000abcd -> 00c10002 00c10002 
+istri 0D  00abcd1100abcd11 000000000000abcd -> 00c10001 00c10001 
+istri 0D  00abcd110000cd11 000000000000abcd -> 00c00008 00c00008 
+istri 0D  1111111111111234 0000000000000000 -> 08810000 08810000 
+istri 0D  1111111111111234 0000000000000011 -> 00810002 00810002 
+istri 0D  1111111111111234 0000000000001111 -> 00810002 00810002 
+istri 0D  1111111111111234 1111111111111234 -> 08010000 08010000 
+istri 0D  0a11111111111111 000000000000000a -> 00810007 00810007 
+istri 0D  0b11111111111111 000000000000000a -> 00800008 00800008 
+istri 13  aacdacbdaacdaacd 00000000000000aa -> 08810000 08810000 
+istri 13  aabbaabbaabbaabb 00000000000000bb -> 00810001 00810001 
+istri 13  aabbccddaabbccdd 000000000000aabb -> 08810000 08810000 
+istri 13  abcdabc0abcdabcd 000000000000abcd -> 00810004 00810004 
+istri 13  aabbccddaabbccdd 00000000aabbccdd -> 00800008 00800008 
+istri 13  00bbccddaabbccdd 00000000aabbccdd -> 00c10007 00c10007 
+istri 13  aabbccddaa00ccdd 00000000aabbccdd -> 00c10002 00c10002 
+istri 13  aabbccddaabb00dd 00000000aabbccdd -> 00c10001 00c10001 
+istri 13  aabbccddaabbcc00 00000000aabbccdd -> 08c10000 08c10000 
+istri 13  aabbccddaabbccdd 00000000aabbccdd -> 00800008 00800008 
+istri 13  aabbccddaabbccdd 00000000aa00ccdd -> 00810002 00810002 
+istri 13  aabbccddaabbccdd 00000000aabb00dd -> 00810001 00810001 
+istri 13  aabbccddaabbccdd 00000000aabbcc00 -> 08810000 08810000 
+istri 13  0000000000000000 0000000000000000 -> 08c10000 08c10000 
+istri 13  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00000008 00000008 
+istri 13  0000abcdabcdabcd 000000000000abcd -> 00c10006 00c10006 
+istri 13  0000abcdabcdabcd 000000000000dcba -> 08c10000 08c10000 
+istri 13  0000aabbaabbaabb 000000000000bbbb -> 00c10001 00c10001 
+istri 13  0000ccddaabbccdd 00000000bbaabbaa -> 08c10000 08c10000 
+istri 13  0000ccddaabbccdd 000000bbaabbaa00 -> 08c10000 08c10000 
+istri 13  0ddc0ffeebadf00d 00000000cafebabe -> 08810000 08810000 
+istri 13  0ddc0ffeebadfeed 00000000cafebabe -> 08810000 08810000 
+istri 45  aaaabbbbccccdddd 000000000000bbcc -> 00800008 00800008 
+istri 45  aaaabbbbccccdddd 000000000000ccbb -> 00810005 00810005 
+istri 45  baaabbbbccccdddd 000000000000ccbb -> 00810005 00810005 
+istri 45  baaabbbbccccdddc 000000000000ccbb -> 00810005 00810005 
+istri 45  bbbbbbbbbbbbbbbb 000000000000ccbb -> 08810007 08810007 
+istri 45  bbbbbbbb00bbbbbb 000000000000ccbb -> 08c10002 08c10002 
+istri 45  bbbbbbbbbbbb00bb 000000000000ccbb -> 08c10000 08c10000 
+istri 45  bbbbbbbbbbbbbb00 000000000000ccbb -> 00c00008 00c00008 
+istri 45  0000000000000000 000000000000ccbb -> 00c00008 00c00008 
+istri 45  0000000000000000 0000000000000000 -> 00c00008 00c00008 
+istri 45  bbbbbbbbbbbbbbbb 000000000000ccbb -> 08810007 08810007 
+istri 45  bbbbbbbbbbbbbbbb 00000000000000bb -> 00800008 00800008 
+istri 45  bb44bb44bb44bb44 000000006622ccbb -> 08810007 08810007 
+istri 45  bb44bb44bb44bb44 000000000022ccbb -> 00810007 00810007 
+istri 45  bb44bb44bb44bb44 000000000000ccbb -> 00810007 00810007 
+istri 45  bb44bb44bb44bb44 00000000000000bb -> 00800008 00800008 
+istri 45  0011223344556677 0000997755442211 -> 08c10006 08c10006 
+istri 45  1122334455667711 0000997755442211 -> 08810007 08810007 
+istri 45  0011223344556677 0000aa8866553322 -> 00c10005 00c10005 
+istri 45  1122334455667711 0000aa8866553322 -> 00810006 00810006 
+istri 01  aacdacbdaacdaacd 00000000000000aa -> 00810001 00810001 
+istri 01  aabbaabbaabbaabb 00000000000000bb -> 08810000 08810000 
+istri 01  aabbccddaabbccdd 000000000000aabb -> 00810002 00810002 
+istri 01  abcdabc0abcdabcd 000000000000abcd -> 08810000 08810000 
+istri 01  aabbccddaabbccdd 00000000aabbccdd -> 08810000 08810000 
+istri 01  00bbccddaabbccdd 00000000aabbccdd -> 08c10000 08c10000 
+istri 01  aabbccddaa00ccdd 00000000aabbccdd -> 08c10000 08c10000 
+istri 01  aabbccddaabb00dd 00000000aabbccdd -> 08c10000 08c10000 
+istri 01  aabbccddaabbcc00 00000000aabbccdd -> 00c00008 00c00008 
+istri 01  aabbccddaabbccdd 00000000aabbccdd -> 08810000 08810000 
+istri 01  aabbccddaabbccdd 00000000aa00ccdd -> 08810000 08810000 
+istri 01  aabbccddaabbccdd 00000000aabb00dd -> 08810000 08810000 
+istri 01  aabbccddaabbccdd 00000000aabbcc00 -> 00800008 00800008 
+istri 01  0000000000000000 0000000000000000 -> 00c00008 00c00008 
+istri 01  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 08010000 08010000 
+istri 01  0000abcdabcdabcd 000000000000abcd -> 08c10000 08c10000 
+istri 01  0000abcdabcdabcd 000000000000dcba -> 00c00008 00c00008 
+istri 01  0000aabbaabbaabb 000000000000bbbb -> 08c10000 08c10000 
+istri 01  0000ccddaabbccdd 00000000bbaabbaa -> 00c10002 00c10002 
+istri 01  0000ccddaabbccdd 000000bbaabbaa00 -> 00c00008 00c00008 
+istri 01  0ddc0ffeebadf00d 00000000cafebabe -> 00810004 00810004 
+istri 01  0ddc0ffeebadfeed 00000000cafebabe -> 00810001 00810001 
+istri 39  0000000000000000 0000000000000000 -> 08c10000 08c10000 
+istri 39  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00000008 00000008 
+istri 39  aaaa2aaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010005 00010005 
+istri 39  aaaaaaaaa2aaaaaa aaaaaaaaaaaaaaaa -> 00010003 00010003 
+istri 39  aaaaaaaaaaaaa2aa aaaaaaaaaaaaaaaa -> 00010001 00010001 
+istri 39  aaaaaaaaaaaaaaaa aaaa2aaaaaaaaaaa -> 00010005 00010005 
+istri 39  aaaaaaaaaaaaaaaa aaaaaaaaa2aaaaaa -> 00010003 00010003 
+istri 39  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaa2a -> 08010000 08010000 
+istri 39  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00000008 00000008 
+istri 39  baaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010007 00010007 
+istri 39  b9aaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010007 00010007 
+istri 39  b9baaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010006 00010006 
+istri 39  b9baaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00010006 00010006 
+istri 39  b9baaaaaaaaaaaaa aaaaaaaaaaaa7aaa -> 00010001 00010001 
+istri 39  b9baaaaaaaaaaaaa aaaaaaaa2aaa4aaa -> 00010001 00010001 
+istri 39  aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaa -> 00000008 00000008 
+istri 39  aaaaaaaaaaaa00aa aaaaaaaaaaaaaaaa -> 00400008 00400008 
+istri 39  aaaaaaaaaaaaaaaa aaaaaaaaaaaa00aa -> 00810001 00810001 
+istri 39  aaaaaaaaaaaa00aa aaaaaaaaaaaa00aa -> 00c10001 00c10001 
+istri 39  aaaaaaaa00aaaaaa aaaaaaaaaaaaaaaa -> 00400008 00400008 
+istri 39  aaaaaaaaaaaaaaaa aaaaaaaaaaaa00aa -> 00810001 00810001 
+istri 39  aaaaaaaa00aaaaaa aaaaaaaaaaaa00aa -> 00c10001 00c10001 
+istri 39  aaaaaaaaaaaa00aa aaaaaaaaaaaaaaaa -> 00400008 00400008 
+istri 39  aaaaaaaaaaaaaaaa aaaaaaaa00aaaaaa -> 00810003 00810003 
+istri 39  aaaaaaaaaaaa00aa aaaaaaaa00aaaaaa -> 00c10003 00c10003 
+istri 39  0000000000000000 aaaaaaaa00aaaaaa -> 00c10003 00c10003 
+istri 39  8000000000000000 aaaaaaaa00aaaaaa -> 00c10003 00c10003 
+istri 39  0000000000000001 aaaaaaaa00aaaaaa -> 08c10000 08c10000 
+istri 39  0000000000000000 aaaaaaaaaaaaaaaa -> 00400008 00400008 
+istri 39  aaaaaaaaaaaaaaaa 0000000000000000 -> 08810000 08810000 
diff --git a/none/tests/amd64/pcmpstr64w.vgtest b/none/tests/amd64/pcmpstr64w.vgtest
new file mode 100644
index 0000000..d088a43
--- /dev/null
+++ b/none/tests/amd64/pcmpstr64w.vgtest
@@ -0,0 +1,3 @@
+prog: pcmpstr64w
+prereq: ../../../tests/x86_amd64_features amd64-sse42
+vgopts: -q
diff --git a/none/tests/amd64/pcmpxstrx64w.c b/none/tests/amd64/pcmpxstrx64w.c
new file mode 100644
index 0000000..f44b9e2
--- /dev/null
+++ b/none/tests/amd64/pcmpxstrx64w.c
@@ -0,0 +1,335 @@
+
+/* Tests the e-vs-i and i-vs-m aspects of pcmp{e,i}str{i,m}.  Does not
+   check the core arithmetic in any detail.  This file checks the 16-bit
+   character versions (the "w" suffix stands for wide). */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+typedef  unsigned char  V128[16];   /* one 128-bit operand, held as 16 bytes */
+typedef  unsigned int   UInt;
+typedef  signed int     Int;
+typedef  unsigned char  UChar;
+typedef  unsigned long long int ULong;
+typedef  UChar          Bool;
+#define False ((Bool)0)
+#define True  ((Bool)1)
+
+/* Print the 16 bytes of *vec as 32 hex digits, highest index first. */
+void show_V128 ( V128* vec )
+{
+   Int k = 16;
+   while (k-- > 0) { printf("%02x", (UInt)(*vec)[k]); }
+}
+
+/* Fill *dst from a 16-character hex summary: each hex digit d becomes the
+   byte 0xdd, with the last character of summary landing in byte 0. */
+void expand ( V128* dst, char* summary )
+{
+   Int pos;
+   assert( strlen(summary) == 16 );
+   for (pos = 15; pos >= 0; pos--) {
+      UChar ch = summary[pos];
+      UChar nib;
+      if (ch >= '0' && ch <= '9')      nib = ch - '0';
+      else if (ch >= 'a' && ch <= 'f') nib = ch - 'a' + 10;
+      else if (ch >= 'A' && ch <= 'F') nib = ch - 'A' + 10;
+      else { nib = 0; assert(0); }
+      assert(nib < 16);
+      /* Replicate the nibble into both halves of the byte. */
+      (*dst)[15 - pos] = (UChar)((nib << 4) | nib);
+   }
+}
+
+/* Run the eight pcmp{i,e}str{i,m} forms (immediates 0x4B and 0x0B) on one
+   pair of operands expanded from summL/summR into xmm2/xmm13.  The 80-byte
+   block is refilled with 0x55 before each form, so whatever is not
+   overwritten prints as 0x55 bytes.  rcx is loaded and stored through
+   %cx: movw is a 16-bit move, and strict assemblers reject a 64-bit
+   register with the 'w' suffix.  After each form xmm0, rcx and the flags
+   masked with 0x8D5 are printed; the text must keep matching .stdout.exp. */
+void one_test ( char* summL, ULong rdxIN, char* summR, ULong raxIN )
+{
+   V128 argL, argR;
+   expand( &argL, summL );
+   expand( &argR, summR );
+   printf("\n");
+   printf("rdx %016llx  argL ", rdxIN);
+   show_V128(&argL);
+   printf("  rax %016llx  argR ", raxIN);
+   show_V128(&argR);
+   printf("\n");
+
+   ULong block[ 2/*in:argL*/          // 0  0
+                + 2/*in:argR*/        // 2  16
+                + 1/*in:rdx*/         // 4  32
+                + 1/*in:rax*/         // 5  40
+                + 2/*inout:xmm0*/     // 6  48
+                + 1/*inout:rcx*/      // 8  64
+                + 1/*out:rflags*/ ];  // 9  72
+   assert(sizeof(block) == 80);
+
+   UChar* blockC = (UChar*)&block[0];
+
+   /* NOTE: in every form below the copy to offset 24 overwrites the upper
+      half of argR with rdxIN, and offsets 32/40 hold raxIN/rdxIN, so the
+      movq loads put raxIN in rdx and rdxIN in rax (the rdx/rax labels in
+      the layout above are therefore swapped).  The recorded expected
+      output was generated with this setup; do not "fix" it in isolation. */
+   /* ---------------- ISTRI_4B ---------------- */
+   memset(blockC, 0x55, 80);
+   memcpy(blockC + 0,  &argL,  16);
+   memcpy(blockC + 16, &argR,  16);
+   memcpy(blockC + 24, &rdxIN, 8);
+   memcpy(blockC + 32, &raxIN, 8);
+   memcpy(blockC + 40, &rdxIN, 8);
+   __asm__ __volatile__(
+      "movupd    0(%0), %%xmm2"           "\n\t"
+      "movupd    16(%0), %%xmm13"         "\n\t"
+      "movq      32(%0), %%rdx"           "\n\t"
+      "movq      40(%0), %%rax"           "\n\t"
+      "movupd    48(%0), %%xmm0"          "\n\t"
+      "movw      64(%0), %%cx"            "\n\t"
+      "pcmpistri $0x4B, %%xmm2, %%xmm13"  "\n\t"
+      "movupd    %%xmm0, 48(%0)"          "\n\t"
+      "movw      %%cx, 64(%0)"            "\n\t"
+      "pushfq"                            "\n\t"
+      "popq      %%r15"                   "\n\t"
+      "movq      %%r15, 72(%0)"           "\n\t"
+      : /*out*/
+      : /*in*/"r"(blockC)
+      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
+   );
+   printf("  istri $0x4B:      xmm0 ");
+   show_V128( (V128*)(blockC+48) );
+   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);
+
+   /* ---------------- ISTRI_0B ---------------- */
+   memset(blockC, 0x55, 80);
+   memcpy(blockC + 0,  &argL,  16);
+   memcpy(blockC + 16, &argR,  16);
+   memcpy(blockC + 24, &rdxIN, 8);
+   memcpy(blockC + 32, &raxIN, 8);
+   memcpy(blockC + 40, &rdxIN, 8);
+   __asm__ __volatile__(
+      "movupd    0(%0), %%xmm2"           "\n\t"
+      "movupd    16(%0), %%xmm13"         "\n\t"
+      "movq      32(%0), %%rdx"           "\n\t"
+      "movq      40(%0), %%rax"           "\n\t"
+      "movupd    48(%0), %%xmm0"          "\n\t"
+      "movw      64(%0), %%cx"            "\n\t"
+      "pcmpistri $0x0B, %%xmm2, %%xmm13"  "\n\t"
+      "movupd    %%xmm0, 48(%0)"          "\n\t"
+      "movw      %%cx, 64(%0)"            "\n\t"
+      "pushfq"                            "\n\t"
+      "popq      %%r15"                   "\n\t"
+      "movq      %%r15, 72(%0)"           "\n\t"
+      : /*out*/
+      : /*in*/"r"(blockC)
+      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
+   );
+   printf("  istri $0x0B:      xmm0 ");
+   show_V128( (V128*)(blockC+48) );
+   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);
+
+   /* ---------------- ISTRM_4B ---------------- */
+   memset(blockC, 0x55, 80);
+   memcpy(blockC + 0,  &argL,  16);
+   memcpy(blockC + 16, &argR,  16);
+   memcpy(blockC + 24, &rdxIN, 8);
+   memcpy(blockC + 32, &raxIN, 8);
+   memcpy(blockC + 40, &rdxIN, 8);
+   __asm__ __volatile__(
+      "movupd    0(%0), %%xmm2"           "\n\t"
+      "movupd    16(%0), %%xmm13"         "\n\t"
+      "movq      32(%0), %%rdx"           "\n\t"
+      "movq      40(%0), %%rax"           "\n\t"
+      "movupd    48(%0), %%xmm0"          "\n\t"
+      "movw      64(%0), %%cx"            "\n\t"
+      "pcmpistrm $0x4B, %%xmm2, %%xmm13"  "\n\t"
+      "movupd    %%xmm0, 48(%0)"          "\n\t"
+      "movw      %%cx, 64(%0)"            "\n\t"
+      "pushfq"                            "\n\t"
+      "popq      %%r15"                   "\n\t"
+      "movq      %%r15, 72(%0)"           "\n\t"
+      : /*out*/
+      : /*in*/"r"(blockC)
+      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
+   );
+   printf("  istrm $0x4B:      xmm0 ");
+   show_V128( (V128*)(blockC+48) );
+   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);
+
+   /* ---------------- ISTRM_0B ---------------- */
+   memset(blockC, 0x55, 80);
+   memcpy(blockC + 0,  &argL,  16);
+   memcpy(blockC + 16, &argR,  16);
+   memcpy(blockC + 24, &rdxIN, 8);
+   memcpy(blockC + 32, &raxIN, 8);
+   memcpy(blockC + 40, &rdxIN, 8);
+   __asm__ __volatile__(
+      "movupd    0(%0), %%xmm2"           "\n\t"
+      "movupd    16(%0), %%xmm13"         "\n\t"
+      "movq      32(%0), %%rdx"           "\n\t"
+      "movq      40(%0), %%rax"           "\n\t"
+      "movupd    48(%0), %%xmm0"          "\n\t"
+      "movw      64(%0), %%cx"            "\n\t"
+      "pcmpistrm $0x0B, %%xmm2, %%xmm13"  "\n\t"
+      "movupd    %%xmm0, 48(%0)"          "\n\t"
+      "movw      %%cx, 64(%0)"            "\n\t"
+      "pushfq"                            "\n\t"
+      "popq      %%r15"                   "\n\t"
+      "movq      %%r15, 72(%0)"           "\n\t"
+      : /*out*/
+      : /*in*/"r"(blockC)
+      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
+   );
+   printf("  istrm $0x0B:      xmm0 ");
+   show_V128( (V128*)(blockC+48) );
+   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);
+
+   /* ---------------- ESTRI_4B ---------------- */
+   memset(blockC, 0x55, 80);
+   memcpy(blockC + 0,  &argL,  16);
+   memcpy(blockC + 16, &argR,  16);
+   memcpy(blockC + 24, &rdxIN, 8);
+   memcpy(blockC + 32, &raxIN, 8);
+   memcpy(blockC + 40, &rdxIN, 8);
+   __asm__ __volatile__(
+      "movupd    0(%0), %%xmm2"           "\n\t"
+      "movupd    16(%0), %%xmm13"         "\n\t"
+      "movq      32(%0), %%rdx"           "\n\t"
+      "movq      40(%0), %%rax"           "\n\t"
+      "movupd    48(%0), %%xmm0"          "\n\t"
+      "movw      64(%0), %%cx"            "\n\t"
+      "pcmpestri $0x4B, %%xmm2, %%xmm13"  "\n\t"
+      "movupd    %%xmm0, 48(%0)"          "\n\t"
+      "movw      %%cx, 64(%0)"            "\n\t"
+      "pushfq"                            "\n\t"
+      "popq      %%r15"                   "\n\t"
+      "movq      %%r15, 72(%0)"           "\n\t"
+      : /*out*/
+      : /*in*/"r"(blockC)
+      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
+   );
+   printf("  estri $0x4B:      xmm0 ");
+   show_V128( (V128*)(blockC+48) );
+   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);
+
+   /* ---------------- ESTRI_0B ---------------- */
+   memset(blockC, 0x55, 80);
+   memcpy(blockC + 0,  &argL,  16);
+   memcpy(blockC + 16, &argR,  16);
+   memcpy(blockC + 24, &rdxIN, 8);
+   memcpy(blockC + 32, &raxIN, 8);
+   memcpy(blockC + 40, &rdxIN, 8);
+   __asm__ __volatile__(
+      "movupd    0(%0), %%xmm2"           "\n\t"
+      "movupd    16(%0), %%xmm13"         "\n\t"
+      "movq      32(%0), %%rdx"           "\n\t"
+      "movq      40(%0), %%rax"           "\n\t"
+      "movupd    48(%0), %%xmm0"          "\n\t"
+      "movw      64(%0), %%cx"            "\n\t"
+      "pcmpestri $0x0B, %%xmm2, %%xmm13"  "\n\t"
+      "movupd    %%xmm0, 48(%0)"          "\n\t"
+      "movw      %%cx, 64(%0)"            "\n\t"
+      "pushfq"                            "\n\t"
+      "popq      %%r15"                   "\n\t"
+      "movq      %%r15, 72(%0)"           "\n\t"
+      : /*out*/
+      : /*in*/"r"(blockC)
+      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
+   );
+   printf("  estri $0x0B:      xmm0 ");
+   show_V128( (V128*)(blockC+48) );
+   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);
+
+   /* ---------------- ESTRM_4B ---------------- */
+   memset(blockC, 0x55, 80);
+   memcpy(blockC + 0,  &argL,  16);
+   memcpy(blockC + 16, &argR,  16);
+   memcpy(blockC + 24, &rdxIN, 8);
+   memcpy(blockC + 32, &raxIN, 8);
+   memcpy(blockC + 40, &rdxIN, 8);
+   __asm__ __volatile__(
+      "movupd    0(%0), %%xmm2"           "\n\t"
+      "movupd    16(%0), %%xmm13"         "\n\t"
+      "movq      32(%0), %%rdx"           "\n\t"
+      "movq      40(%0), %%rax"           "\n\t"
+      "movupd    48(%0), %%xmm0"          "\n\t"
+      "movw      64(%0), %%cx"            "\n\t"
+      "pcmpestrm $0x4B, %%xmm2, %%xmm13"  "\n\t"
+      "movupd    %%xmm0, 48(%0)"          "\n\t"
+      "movw      %%cx, 64(%0)"            "\n\t"
+      "pushfq"                            "\n\t"
+      "popq      %%r15"                   "\n\t"
+      "movq      %%r15, 72(%0)"           "\n\t"
+      : /*out*/
+      : /*in*/"r"(blockC)
+      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
+   );
+   printf("  estrm $0x4B:      xmm0 ");
+   show_V128( (V128*)(blockC+48) );
+   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);
+
+   /* ---------------- ESTRM_0B ---------------- */
+   memset(blockC, 0x55, 80);
+   memcpy(blockC + 0,  &argL,  16);
+   memcpy(blockC + 16, &argR,  16);
+   memcpy(blockC + 24, &rdxIN, 8);
+   memcpy(blockC + 32, &raxIN, 8);
+   memcpy(blockC + 40, &rdxIN, 8);
+   __asm__ __volatile__(
+      "movupd    0(%0), %%xmm2"           "\n\t"
+      "movupd    16(%0), %%xmm13"         "\n\t"
+      "movq      32(%0), %%rdx"           "\n\t"
+      "movq      40(%0), %%rax"           "\n\t"
+      "movupd    48(%0), %%xmm0"          "\n\t"
+      "movw      64(%0), %%cx"            "\n\t"
+      "pcmpestrm $0x0B, %%xmm2, %%xmm13"  "\n\t"
+      "movupd    %%xmm0, 48(%0)"          "\n\t"
+      "movw      %%cx, 64(%0)"            "\n\t"
+      "pushfq"                            "\n\t"
+      "popq      %%r15"                   "\n\t"
+      "movq      %%r15, 72(%0)"           "\n\t"
+      : /*out*/
+      : /*in*/"r"(blockC)
+      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
+   );
+   printf("  estrm $0x0B:      xmm0 ");
+   show_V128( (V128*)(blockC+48) );
+   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);
+}
+
+/* Drive one_test over a fixed table of (argL, rdx, argR, rax) inputs. */
+int main ( void )
+{
+   static const struct { char* l; Int rdx; char* r; Int rax; } cases[] = {
+      { "aaaaaaaaaaaaaaaa",   0, "aaaaaaaa00aaaaaa",   0 },
+      { "0000000000000000",   0, "aaaaaaaa00aaaaaa",   0 },
+      { "aaaaaaaaaaaaaaaa",   0, "aaaaaaaaaaaaaaaa",   0 },
+      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",   0 },
+      { "aaaaaaaaaaaaaaaa",   0, "aaaaaaaaaaaaaaaa",   6 },
+      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",   6 },
+      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",  15 },
+      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",  16 },
+      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",  17 },
+      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",  -6 },
+      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa", -15 },
+      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa", -16 },
+      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa", -17 },
+      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",   6 },
+      { "aaaaaaaaaaaaaaaa",  15, "aaaaaaaaaaaaaaaa",   6 },
+      { "aaaaaaaaaaaaaaaa",  16, "aaaaaaaaaaaaaaaa",   6 },
+      { "aaaaaaaaaaaaaaaa",  17, "aaaaaaaaaaaaaaaa",   6 },
+      { "aaaaaaaaaaaaaaaa",  -5, "aaaaaaaaaaaaaaaa",   6 },
+      { "aaaaaaaaaaaaaaaa", -15, "aaaaaaaaaaaaaaaa",   6 },
+      { "aaaaaaaaaaaaaaaa", -16, "aaaaaaaaaaaaaaaa",   6 },
+      { "aaaaaaaaaaaaaaaa", -17, "aaaaaaaaaaaaaaaa",   6 }
+   };
+   UInt k;
+   for (k = 0; k < sizeof(cases) / sizeof(cases[0]); k++)
+      one_test(cases[k].l, cases[k].rdx, cases[k].r, cases[k].rax);
+   return 0;
+}
diff --git a/none/tests/amd64/pcmpxstrx64w.stderr.exp b/none/tests/amd64/pcmpxstrx64w.stderr.exp
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/none/tests/amd64/pcmpxstrx64w.stderr.exp
diff --git a/none/tests/amd64/pcmpxstrx64w.stdout.exp b/none/tests/amd64/pcmpxstrx64w.stdout.exp
new file mode 100644
index 0000000..d19ebdd
--- /dev/null
+++ b/none/tests/amd64/pcmpxstrx64w.stdout.exp
@@ -0,0 +1,210 @@
+
+rdx 0000000000000000  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000000  argR aaaaaaaaaaaaaaaa0000aaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550002  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 00000000000000000000ffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 00000000000000000000000000000007  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550007  flags 000008c1
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 000008c1
+  estrm $0x4B:      xmm0 ffffffffffffffffffffffffffffffff  rcx 5555555555555555  flags 000008c1
+  estrm $0x0B:      xmm0 000000000000000000000000000000ff  rcx 5555555555555555  flags 000008c1
+
+rdx 0000000000000000  argL 00000000000000000000000000000000  rax 0000000000000000  argR aaaaaaaaaaaaaaaa0000aaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550007  flags 000000c1
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 000000c1
+  istrm $0x4B:      xmm0 ffffffffffffffffffff000000000000  rcx 5555555555555555  flags 000000c1
+  istrm $0x0B:      xmm0 000000000000000000000000000000f8  rcx 5555555555555555  flags 000000c1
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550007  flags 000008c1
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 000008c1
+  estrm $0x4B:      xmm0 ffffffffffffffffffffffffffffffff  rcx 5555555555555555  flags 000008c1
+  estrm $0x0B:      xmm0 000000000000000000000000000000ff  rcx 5555555555555555  flags 000008c1
+
+rdx 0000000000000000  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000000  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550007  flags 000008c1
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 000008c1
+  estrm $0x4B:      xmm0 ffffffffffffffffffffffffffffffff  rcx 5555555555555555  flags 000008c1
+  estrm $0x0B:      xmm0 000000000000000000000000000000ff  rcx 5555555555555555  flags 000008c1
+
+rdx 0000000000000005  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000000  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550007  flags 000000c1
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550005  flags 000000c1
+  estrm $0x4B:      xmm0 ffffffffffff00000000000000000000  rcx 5555555555555555  flags 000000c1
+  estrm $0x0B:      xmm0 000000000000000000000000000000e0  rcx 5555555555555555  flags 000000c1
+
+rdx 0000000000000000  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000006  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550007  flags 000000c1
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550006  flags 000000c1
+  estrm $0x4B:      xmm0 ffffffff000000000000000000000000  rcx 5555555555555555  flags 000000c1
+  estrm $0x0B:      xmm0 000000000000000000000000000000c0  rcx 5555555555555555  flags 000000c1
+
+rdx 0000000000000005  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000006  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550007  flags 000008c1
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 000008c1
+  estrm $0x4B:      xmm0 ffffffff00000000ffffffffffffffff  rcx 5555555555555555  flags 000008c1
+  estrm $0x0B:      xmm0 000000000000000000000000000000cf  rcx 5555555555555555  flags 000008c1
+
+rdx 0000000000000005  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 000000000000000f  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+
+rdx 0000000000000005  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000010  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+
+rdx 0000000000000005  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000011  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+
+rdx 0000000000000005  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax fffffffffffffffa  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550007  flags 000008c1
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 000008c1
+  estrm $0x4B:      xmm0 ffffffff00000000ffffffffffffffff  rcx 5555555555555555  flags 000008c1
+  estrm $0x0B:      xmm0 000000000000000000000000000000cf  rcx 5555555555555555  flags 000008c1
+
+rdx 0000000000000005  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax fffffffffffffff1  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+
+rdx 0000000000000005  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax fffffffffffffff0  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+
+rdx 0000000000000005  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax ffffffffffffffef  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+
+rdx 0000000000000005  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000006  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550007  flags 000008c1
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 000008c1
+  estrm $0x4B:      xmm0 ffffffff00000000ffffffffffffffff  rcx 5555555555555555  flags 000008c1
+  estrm $0x0B:      xmm0 000000000000000000000000000000cf  rcx 5555555555555555  flags 000008c1
+
+rdx 000000000000000f  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000006  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000841
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000841
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000841
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000841
+
+rdx 0000000000000010  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000006  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000841
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000841
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000841
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000841
+
+rdx 0000000000000011  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000006  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000881
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000881
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000881
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000881
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000841
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000841
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000841
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000841
+
+rdx fffffffffffffffb  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000006  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000801
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000801
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000801
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000801
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550007  flags 000008c1
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 000008c1
+  estrm $0x4B:      xmm0 ffffffff00000000ffffffffffffffff  rcx 5555555555555555  flags 000008c1
+  estrm $0x0B:      xmm0 000000000000000000000000000000cf  rcx 5555555555555555  flags 000008c1
+
+rdx fffffffffffffff1  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000006  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000801
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000801
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000801
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000801
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000841
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000841
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000841
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000841
+
+rdx fffffffffffffff0  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000006  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000801
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000801
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000801
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000801
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000841
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000841
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000841
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000841
+
+rdx ffffffffffffffef  argL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  rax 0000000000000006  argR aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  istri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000801
+  istri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000801
+  istrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000801
+  istrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000801
+  estri $0x4B:      xmm0 55555555555555555555555555555555  rcx 5555555555550003  flags 00000841
+  estri $0x0B:      xmm0 55555555555555555555555555555555  rcx 5555555555550000  flags 00000841
+  estrm $0x4B:      xmm0 0000000000000000ffffffffffffffff  rcx 5555555555555555  flags 00000841
+  estrm $0x0B:      xmm0 0000000000000000000000000000000f  rcx 5555555555555555  flags 00000841
diff --git a/none/tests/amd64/pcmpxstrx64w.vgtest b/none/tests/amd64/pcmpxstrx64w.vgtest
new file mode 100644
index 0000000..4b49c51
--- /dev/null
+++ b/none/tests/amd64/pcmpxstrx64w.vgtest
@@ -0,0 +1,3 @@
+prog: pcmpxstrx64w
+prereq: ../../../tests/x86_amd64_features amd64-sse42
+vgopts: -q