Improve the accuracy of definedness tracking through the x86/amd64
PMOVMSKB and BSF instructions; imprecise tracking of these causes
Memcheck false positives (VEX side).
Fixes #308627.  Combined efforts of Patrick J. LoPresti
<lopresti@gmail.com> and me.
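
For context, a sketch of the user-code idiom that triggers the false
positives (illustrative only, not part of the patch): optimised
strlen-style loops read 16 bytes at a time, so bytes beyond the
terminating NUL may be uninitialised.  PCMPEQB then yields a partially
defined vector and PMOVMSKB a partially defined mask, yet the
subsequent compare and BSF still produce a fully defined result.
Replacing the opaque helper calls with Iop_GetMSBs8x8 and using the
Iop_ExpCmpNE* variants gives Memcheck the structure it needs to track
this precisely.

   /* Illustrative sketch only; assumes 's' is 16-aligned and
      NUL-terminated within addressable memory.  A real implementation
      also handles the unaligned head. */
   #include <emmintrin.h>
   #include <stddef.h>

   static size_t strlen_sse2_sketch ( const char* s )
   {
      const __m128i zero = _mm_setzero_si128();
      size_t i = 0;
      for (;;) {
         __m128i chunk = _mm_load_si128((const __m128i*)(s + i));
         int     mask  = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, zero));
         if (mask != 0)                     /* CmpNE on the pmovmskb result */
            return i + __builtin_ctz(mask); /* BSF on the pmovmskb result */
         i += 16;
      }
   }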



git-svn-id: svn://svn.valgrind.org/vex/trunk@2559 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_amd64_defs.h b/priv/guest_amd64_defs.h
index bbcc95f..f687d90 100644
--- a/priv/guest_amd64_defs.h
+++ b/priv/guest_amd64_defs.h
@@ -141,8 +141,6 @@
 
 extern ULong amd64g_calculate_mmx_pmaddwd  ( ULong, ULong );
 extern ULong amd64g_calculate_mmx_psadbw   ( ULong, ULong );
-extern ULong amd64g_calculate_mmx_pmovmskb ( ULong );
-extern ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo );
 
 extern ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi );
 
diff --git a/priv/guest_amd64_helpers.c b/priv/guest_amd64_helpers.c
index 5f2c6c5..8cc11ad 100644
--- a/priv/guest_amd64_helpers.c
+++ b/priv/guest_amd64_helpers.c
@@ -2998,21 +2998,6 @@
 }
 
 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
-ULong amd64g_calculate_mmx_pmovmskb ( ULong xx )
-{
-   ULong r = 0;
-   if (xx & (1ULL << (64-1))) r |= (1<<7);
-   if (xx & (1ULL << (56-1))) r |= (1<<6);
-   if (xx & (1ULL << (48-1))) r |= (1<<5);
-   if (xx & (1ULL << (40-1))) r |= (1<<4);
-   if (xx & (1ULL << (32-1))) r |= (1<<3);
-   if (xx & (1ULL << (24-1))) r |= (1<<2);
-   if (xx & (1ULL << (16-1))) r |= (1<<1);
-   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
-   return r;
-}
-
-/* CALLED FROM GENERATED CODE: CLEAN HELPER */
 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
 {
    UInt t = 0;
@@ -3029,14 +3014,6 @@
 }
 
 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
-ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo )
-{
-   ULong rHi8 = amd64g_calculate_mmx_pmovmskb ( w64hi );
-   ULong rLo8 = amd64g_calculate_mmx_pmovmskb ( w64lo );
-   return ((rHi8 & 0xFF) << 8) | (rLo8 & 0xFF);
-}
-
-/* CALLED FROM GENERATED CODE: CLEAN HELPER */
 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
 {
    UShort t, min;
diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c
index 7474802..84378f8 100644
--- a/priv/guest_amd64_toIR.c
+++ b/priv/guest_amd64_toIR.c
@@ -7789,11 +7789,15 @@
    /* First, widen src to 64 bits if it is not already. */
    assign( src64, widenUto64(mkexpr(src)) );
 
-   /* Generate an 8-bit expression which is zero iff the 
-      original is zero, and nonzero otherwise */
+   /* Generate an 8-bit expression which is zero iff the original is
+      zero, and nonzero otherwise.  Ask for a CmpNE version which, if
+      instrumented by Memcheck, is instrumented expensively, since
+      this may be used on the output of a preceding movmskb insn,
+      which has been known to be partially defined, and in need of
+      careful handling. */
    assign( src8,
            unop(Iop_1Uto8, 
-                binop(Iop_CmpNE64,
+                binop(Iop_ExpCmpNE64,
                       mkexpr(src64), mkU64(0))) );
 
    /* Flags: Z is 1 iff source value is zero.  All others 
@@ -10277,14 +10281,15 @@
    UInt   rG = gregOfRexRM(pfx,modrm);
    IRTemp t0 = newTemp(Ity_I64);
    IRTemp t1 = newTemp(Ity_I64);
-   IRTemp t5 = newTemp(Ity_I64);
+   IRTemp t5 = newTemp(Ity_I32);
    assign(t0, getXMMRegLane64(rE, 0));
    assign(t1, getXMMRegLane64(rE, 1));
-   assign(t5, mkIRExprCCall( Ity_I64, 0/*regparms*/, 
-                             "amd64g_calculate_sse_pmovmskb",
-                             &amd64g_calculate_sse_pmovmskb,
-                             mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
-   putIReg32(rG, unop(Iop_64to32,mkexpr(t5)));
+   assign(t5,
+          unop(Iop_16Uto32,
+               binop(Iop_8HLto16,
+                     unop(Iop_GetMSBs8x8, mkexpr(t1)),
+                     unop(Iop_GetMSBs8x8, mkexpr(t0)))));
+   putIReg32(rG, mkexpr(t5));
    DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
        nameIReg32(rG));
    delta += 1;
@@ -13443,7 +13448,7 @@
       }
       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
       /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
-         mmx(G), turn them into a byte, and put zero-extend of it in
+         mmx(E), turn them into a byte, and put zero-extend of it in
          ireg(G). */
       if (haveNo66noF2noF3(pfx)
           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
@@ -13451,14 +13456,10 @@
          if (epartIsReg(modrm)) {
             do_MMX_preamble();
             t0 = newTemp(Ity_I64);
-            t1 = newTemp(Ity_I64);
+            t1 = newTemp(Ity_I32);
             assign(t0, getMMXReg(eregLO3ofRM(modrm)));
-            assign(t1, mkIRExprCCall(
-                          Ity_I64, 0/*regparms*/, 
-                          "amd64g_calculate_mmx_pmovmskb",
-                          &amd64g_calculate_mmx_pmovmskb,
-                          mkIRExprVec_1(mkexpr(t0))));
-            putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t1)));
+            assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
+            putIReg32(gregOfRexRM(pfx,modrm), mkexpr(t1));
             DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                     nameIReg32(gregOfRexRM(pfx,modrm)));
             delta += 1;
diff --git a/priv/guest_x86_defs.h b/priv/guest_x86_defs.h
index af83cb7..a47040a 100644
--- a/priv/guest_x86_defs.h
+++ b/priv/guest_x86_defs.h
@@ -134,8 +134,6 @@
 
 extern ULong x86g_calculate_mmx_pmaddwd  ( ULong, ULong );
 extern ULong x86g_calculate_mmx_psadbw   ( ULong, ULong );
-extern UInt  x86g_calculate_mmx_pmovmskb ( ULong );
-extern UInt  x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo );
 
 
 /* --- DIRTY HELPERS --- */
diff --git a/priv/guest_x86_helpers.c b/priv/guest_x86_helpers.c
index 4676276..35938c9 100644
--- a/priv/guest_x86_helpers.c
+++ b/priv/guest_x86_helpers.c
@@ -2514,21 +2514,6 @@
 }
 
 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
-UInt x86g_calculate_mmx_pmovmskb ( ULong xx )
-{
-   UInt r = 0;
-   if (xx & (1ULL << (64-1))) r |= (1<<7);
-   if (xx & (1ULL << (56-1))) r |= (1<<6);
-   if (xx & (1ULL << (48-1))) r |= (1<<5);
-   if (xx & (1ULL << (40-1))) r |= (1<<4);
-   if (xx & (1ULL << (32-1))) r |= (1<<3);
-   if (xx & (1ULL << (24-1))) r |= (1<<2);
-   if (xx & (1ULL << (16-1))) r |= (1<<1);
-   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
-   return r;
-}
-
-/* CALLED FROM GENERATED CODE: CLEAN HELPER */
 ULong x86g_calculate_mmx_psadbw ( ULong xx, ULong yy )
 {
    UInt t = 0;
@@ -2544,14 +2529,6 @@
    return (ULong)t;
 }
 
-/* CALLED FROM GENERATED CODE: CLEAN HELPER */
-UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo )
-{
-   UInt rHi8 = x86g_calculate_mmx_pmovmskb ( w64hi );
-   UInt rLo8 = x86g_calculate_mmx_pmovmskb ( w64lo );
-   return ((rHi8 & 0xFF) << 8) | (rLo8 & 0xFF);
-}
-
 
 /*---------------------------------------------------------------*/
 /*--- Helpers for dealing with segment overrides.             ---*/
diff --git a/priv/guest_x86_toIR.c b/priv/guest_x86_toIR.c
index e98762d..1a17d89 100644
--- a/priv/guest_x86_toIR.c
+++ b/priv/guest_x86_toIR.c
@@ -716,6 +716,7 @@
            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
            || op8 == Iop_CasCmpNE8
+           || op8 == Iop_ExpCmpNE8
            || op8 == Iop_Not8);
    adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    return adj + op8;
@@ -6385,10 +6386,14 @@
        ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ), 
        nameIReg(sz, gregOfRM(modrm)));
 
-   /* Generate an 8-bit expression which is zero iff the 
-      original is zero, and nonzero otherwise */
+   /* Generate an 8-bit expression which is zero iff the original is
+      zero, and nonzero otherwise.  Ask for a CmpNE version which, if
+      instrumented by Memcheck, is instrumented expensively, since
+      this may be used on the output of a preceding movmskb insn,
+      which has been known to be partially defined, and in need of
+      careful handling. */
    assign( src8,
-           unop(Iop_1Uto8, binop(mkSizedOp(ty,Iop_CmpNE8),
+           unop(Iop_1Uto8, binop(mkSizedOp(ty,Iop_ExpCmpNE8),
                            mkexpr(src), mkU(ty,0))) );
 
    /* Flags: Z is 1 iff source value is zero.  All others 
@@ -9051,7 +9056,7 @@
 
    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
    /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
-      mmx(G), turn them into a byte, and put zero-extend of it in
+      mmx(E), turn them into a byte, and put zero-extend of it in
       ireg(G). */
    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
       modrm = insn[2];
@@ -9060,11 +9065,7 @@
          t0 = newTemp(Ity_I64);
          t1 = newTemp(Ity_I32);
          assign(t0, getMMXReg(eregOfRM(modrm)));
-         assign(t1, mkIRExprCCall(
-                       Ity_I32, 0/*regparms*/, 
-                       "x86g_calculate_mmx_pmovmskb",
-                       &x86g_calculate_mmx_pmovmskb,
-                       mkIRExprVec_1(mkexpr(t0))));
+         assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
          putIReg(4, gregOfRM(modrm), mkexpr(t1));
          DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
                                  nameIReg(4,gregOfRM(modrm)));
@@ -10903,11 +10904,9 @@
       goto decode_success;
    }
 
-   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
-      xmm(G), turn them into a byte, and put zero-extend of it in
-      ireg(G).  Doing this directly is just too cumbersome; give up
-      therefore and call a helper. */
-   /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
+   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes
+      in xmm(E), turn them into a byte, and put zero-extend of it in
+      ireg(G). */
    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
       modrm = insn[2];
       if (epartIsReg(modrm)) {
@@ -10916,11 +10915,11 @@
          assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
          assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
          t5 = newTemp(Ity_I32);
-         assign(t5, mkIRExprCCall(
-                       Ity_I32, 0/*regparms*/, 
-                       "x86g_calculate_sse_pmovmskb",
-                       &x86g_calculate_sse_pmovmskb,
-                       mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
+         assign(t5,
+                unop(Iop_16Uto32,
+                     binop(Iop_8HLto16,
+                           unop(Iop_GetMSBs8x8, mkexpr(t1)),
+                           unop(Iop_GetMSBs8x8, mkexpr(t0)))));
          putIReg(4, gregOfRM(modrm), mkexpr(t5));
          DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                  nameIReg(4,gregOfRM(modrm)));
diff --git a/priv/host_amd64_isel.c b/priv/host_amd64_isel.c
index 1296390..98e90f7 100644
--- a/priv/host_amd64_isel.c
+++ b/priv/host_amd64_isel.c
@@ -791,7 +791,7 @@
 
    This should handle expressions of 64, 32, 16 and 8-bit type.  All
    results are returned in a 64-bit register.  For 32-, 16- and 8-bit
-   expressions, the upper 32/16/24 bits are arbitrary, so you should
+   expressions, the upper 32/48/56 bits are arbitrary, so you should
    mask or sign extend partial values if necessary.
 */
 
@@ -1586,6 +1586,25 @@
             /* These are no-ops. */
             return iselIntExpr_R(env, e->Iex.Unop.arg);
 
+         case Iop_GetMSBs8x8: {
+            /* Note: the following assumes the helper is of
+               signature
+                  UInt fn ( ULong ), and is not a regparm fn.
+            */
+            HReg dst = newVRegI(env);
+            HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
+            fn = (HWord)h_generic_calc_GetMSBs8x8;
+            addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
+            addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1 ));
+            /* MovxLQ is not exactly the right thing here.  We just
+               need to get the bottom 8 bits of RAX into dst, and zero
+               out everything else.  Assuming that the helper returns
+               a UInt with the top 24 bits zeroed out, it'll do,
+               though. */
+            addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
+            return dst;
+         }
+
          default: 
             break;
       }
@@ -2223,13 +2242,15 @@
            || e->Iex.Binop.op == Iop_CmpLE64S
            || e->Iex.Binop.op == Iop_CmpLE64U
            || e->Iex.Binop.op == Iop_CasCmpEQ64
-           || e->Iex.Binop.op == Iop_CasCmpNE64)) {
+           || e->Iex.Binop.op == Iop_CasCmpNE64
+           || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
       switch (e->Iex.Binop.op) {
          case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
-         case Iop_CmpNE64: case Iop_CasCmpNE64: return Acc_NZ;
+         case Iop_CmpNE64:
+         case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
 	 case Iop_CmpLT64S: return Acc_L;
 	 case Iop_CmpLT64U: return Acc_B;
 	 case Iop_CmpLE64S: return Acc_LE;
diff --git a/priv/host_generic_simd64.c b/priv/host_generic_simd64.c
index b70ce88..fdc9eed 100644
--- a/priv/host_generic_simd64.c
+++ b/priv/host_generic_simd64.c
@@ -1169,6 +1169,20 @@
           );
 }
 
+UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
+{
+   UInt r = 0;
+   if (xx & (1ULL << (64-1))) r |= (1<<7);
+   if (xx & (1ULL << (56-1))) r |= (1<<6);
+   if (xx & (1ULL << (48-1))) r |= (1<<5);
+   if (xx & (1ULL << (40-1))) r |= (1<<4);
+   if (xx & (1ULL << (32-1))) r |= (1<<3);
+   if (xx & (1ULL << (24-1))) r |= (1<<2);
+   if (xx & (1ULL << (16-1))) r |= (1<<1);
+   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
+   return r;
+}
+
 /* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
 
 /* Tuple/select functions for 16x2 vectors. */
diff --git a/priv/host_generic_simd64.h b/priv/host_generic_simd64.h
index 0858583..deef944 100644
--- a/priv/host_generic_simd64.h
+++ b/priv/host_generic_simd64.h
@@ -123,6 +123,8 @@
 extern ULong h_generic_calc_Min16Sx4 ( ULong, ULong );
 extern ULong h_generic_calc_Min8Ux8  ( ULong, ULong );
 
+extern UInt  h_generic_calc_GetMSBs8x8 ( ULong );
+
 /* 32-bit SIMD HELPERS */
 
 extern UInt h_generic_calc_Add16x2   ( UInt, UInt );
diff --git a/priv/host_x86_isel.c b/priv/host_x86_isel.c
index d342d92..5513d71 100644
--- a/priv/host_x86_isel.c
+++ b/priv/host_x86_isel.c
@@ -1293,6 +1293,23 @@
             /* These are no-ops. */
             return iselIntExpr_R(env, e->Iex.Unop.arg);
 
+         case Iop_GetMSBs8x8: {
+            /* Note: the following assumes the helper is of
+               signature
+                  UInt fn ( ULong ), and is not a regparm fn.
+            */
+            HReg  xLo, xHi;
+            HReg  dst = newVRegI(env);
+            HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
+            iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
+            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
+            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
+            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
+            add_to_esp(env, 2*4);
+            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
+            return dst;
+         }
+
          default: 
             break;
       }
@@ -1840,7 +1857,8 @@
        && (e->Iex.Binop.op == Iop_CmpEQ16
            || e->Iex.Binop.op == Iop_CmpNE16
            || e->Iex.Binop.op == Iop_CasCmpEQ16
-           || e->Iex.Binop.op == Iop_CasCmpNE16)) {
+           || e->Iex.Binop.op == Iop_CasCmpNE16
+           || e->Iex.Binop.op == Iop_ExpCmpNE16)) {
       HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
       X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
       HReg    r    = newVRegI(env);
@@ -1848,9 +1866,12 @@
       addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
       addInstr(env, X86Instr_Test32(0xFFFF,X86RM_Reg(r)));
       switch (e->Iex.Binop.op) {
-         case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Xcc_Z;
-         case Iop_CmpNE16: case Iop_CasCmpNE16: return Xcc_NZ;
-         default: vpanic("iselCondCode(x86): CmpXX16");
+         case Iop_CmpEQ16: case Iop_CasCmpEQ16:
+            return Xcc_Z;
+         case Iop_CmpNE16: case Iop_CasCmpNE16: case Iop_ExpCmpNE16:
+            return Xcc_NZ;
+         default:
+            vpanic("iselCondCode(x86): CmpXX16");
       }
    }
 
@@ -1882,13 +1903,15 @@
            || e->Iex.Binop.op == Iop_CmpLE32S
            || e->Iex.Binop.op == Iop_CmpLE32U
            || e->Iex.Binop.op == Iop_CasCmpEQ32
-           || e->Iex.Binop.op == Iop_CasCmpNE32)) {
+           || e->Iex.Binop.op == Iop_CasCmpNE32
+           || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
       HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
       X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
       addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
       switch (e->Iex.Binop.op) {
          case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Xcc_Z;
-         case Iop_CmpNE32: case Iop_CasCmpNE32: return Xcc_NZ;
+         case Iop_CmpNE32:
+         case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Xcc_NZ;
          case Iop_CmpLT32S: return Xcc_L;
          case Iop_CmpLT32U: return Xcc_B;
          case Iop_CmpLE32S: return Xcc_LE;
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index 99b2266..b356f60 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -147,6 +147,8 @@
          str = "CasCmpEQ"; base = Iop_CasCmpEQ8; break;
       case Iop_CasCmpNE8 ... Iop_CasCmpNE64:
          str = "CasCmpNE"; base = Iop_CasCmpNE8; break;
+      case Iop_ExpCmpNE8 ... Iop_ExpCmpNE64:
+         str = "ExpCmpNE"; base = Iop_ExpCmpNE8; break;
       case Iop_Not8 ... Iop_Not64:
          str = "Not"; base = Iop_Not8; break;
       /* other cases must explicitly "return;" */
@@ -581,6 +583,7 @@
       case Iop_Reverse64_16x4: vex_printf("Reverse64_16x4"); return;
       case Iop_Reverse64_32x2: vex_printf("Reverse64_32x2"); return;
       case Iop_Abs32Fx2: vex_printf("Abs32Fx2"); return;
+      case Iop_GetMSBs8x8: vex_printf("GetMSBs8x8"); return;
 
       case Iop_CmpNEZ32x2: vex_printf("CmpNEZ32x2"); return;
       case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return;
@@ -2271,18 +2274,18 @@
          UNARY(Ity_I64, Ity_I64);
 
       case Iop_CmpEQ8: case Iop_CmpNE8:
-      case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
+      case Iop_CasCmpEQ8: case Iop_CasCmpNE8: case Iop_ExpCmpNE8:
          COMPARISON(Ity_I8);
       case Iop_CmpEQ16: case Iop_CmpNE16:
-      case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
+      case Iop_CasCmpEQ16: case Iop_CasCmpNE16: case Iop_ExpCmpNE16:
          COMPARISON(Ity_I16);
       case Iop_CmpEQ32: case Iop_CmpNE32:
-      case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
+      case Iop_CasCmpEQ32: case Iop_CasCmpNE32: case Iop_ExpCmpNE32:
       case Iop_CmpLT32S: case Iop_CmpLE32S:
       case Iop_CmpLT32U: case Iop_CmpLE32U:
          COMPARISON(Ity_I32);
       case Iop_CmpEQ64: case Iop_CmpNE64:
-      case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
+      case Iop_CasCmpEQ64: case Iop_CasCmpNE64: case Iop_ExpCmpNE64:
       case Iop_CmpLT64S: case Iop_CmpLE64S:
       case Iop_CmpLT64U: case Iop_CmpLE64U:
          COMPARISON(Ity_I64);
@@ -2296,6 +2299,7 @@
       case Iop_Left16: UNARY(Ity_I16,Ity_I16);
       case Iop_CmpwNEZ32: case Iop_Left32: UNARY(Ity_I32,Ity_I32);
       case Iop_CmpwNEZ64: case Iop_Left64: UNARY(Ity_I64,Ity_I64);
+      case Iop_GetMSBs8x8: UNARY(Ity_I64, Ity_I8);
 
       case Iop_MullU8: case Iop_MullS8:
          BINARY(Ity_I8,Ity_I8, Ity_I16);
diff --git a/priv/ir_opt.c b/priv/ir_opt.c
index b7e3d9a..1537df6 100644
--- a/priv/ir_opt.c
+++ b/priv/ir_opt.c
@@ -1836,16 +1836,22 @@
 
             /* -- CmpNE -- */
             case Iop_CmpNE8:
+            case Iop_CasCmpNE8:
+            case Iop_ExpCmpNE8:
                e2 = IRExpr_Const(IRConst_U1(toBool(
                        ((0xFF & e->Iex.Binop.arg1->Iex.Const.con->Ico.U8)
                         != (0xFF & e->Iex.Binop.arg2->Iex.Const.con->Ico.U8)))));
                break;
             case Iop_CmpNE32:
+            case Iop_CasCmpNE32:
+            case Iop_ExpCmpNE32:
                e2 = IRExpr_Const(IRConst_U1(toBool(
                        (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
                         != e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))));
                break;
             case Iop_CmpNE64:
+            case Iop_CasCmpNE64:
+            case Iop_ExpCmpNE64:
                e2 = IRExpr_Const(IRConst_U1(toBool(
                        (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
                         != e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))));
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index f399bb2..99eaaaf 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -437,6 +437,10 @@
       Iop_CasCmpEQ8, Iop_CasCmpEQ16, Iop_CasCmpEQ32, Iop_CasCmpEQ64,
       Iop_CasCmpNE8, Iop_CasCmpNE16, Iop_CasCmpNE32, Iop_CasCmpNE64,
 
+      /* Exactly like CmpNE8/16/32/64, but carrying the additional
+         hint that these need expensive definedness tracking. */
+      Iop_ExpCmpNE8, Iop_ExpCmpNE16, Iop_ExpCmpNE32, Iop_ExpCmpNE64,
+
       /* -- Ordering not important after here. -- */
 
       /* Widening multiplies */
@@ -991,6 +995,10 @@
          is undefined. */
       Iop_Perm8x8,
 
+      /* MISC CONVERSION -- get the MSB of each byte lane, a la
+         x86/amd64 pmovmskb */
+      Iop_GetMSBs8x8, /* I64 -> I8 */
+
       /* Vector Reciprocal Estimate and Vector Reciprocal Square Root Estimate
         See floating-point equivalents for details. */
       Iop_Recip32x2, Iop_Rsqrte32x2,