Use IR conditional stores (IRStoreG) to implement the AVX2 conditional
vector stores VPMASKMOV{D,Q} xmm/ymm to memory, and add a new AMD64 host
instruction, Ain_CStore, to back them.
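
For reference, a small standalone test (not part of this patch) that
exercises the new store path through the AVX2 intrinsics.  Lane i is
written only when the sign bit of mask lane i is set; _mm256_maskstore_epi32
typically compiles to the VPMASKMOVD store form handled here.  Build with
-mavx2 and run under Valgrind to drive the new decode and isel code:

   /* maskstore_test.c -- illustrative only */
   #include <immintrin.h>
   #include <stdio.h>

   int main ( void )
   {
      int dst[8] = {0};
      /* lane 0 holds 1, lane 7 holds 8 (set_epi32 takes the high lane first) */
      __m256i data = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
      /* store only even-numbered lanes: their mask lanes are negative */
      __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
      _mm256_maskstore_epi32(dst, mask, data);
      for (int i = 0; i < 8; i++)
         printf("%d ", dst[i]);      /* expected: 1 0 3 0 5 0 7 0 */
      printf("\n");
      return 0;
   }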


git-svn-id: svn://svn.valgrind.org/vex/trunk@3077 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c
index d0c2233..35aaa73 100644
--- a/priv/guest_amd64_toIR.c
+++ b/priv/guest_amd64_toIR.c
@@ -27235,10 +27235,11 @@
 }
 
 
-/* Masked load.  */
-static ULong dis_VMASKMOV_load ( Bool *uses_vvvv, const VexAbiInfo* vbi,
-                                 Prefix pfx, Long delta,
-                                 const HChar* opname, Bool isYMM, IRType ty )
+/* Masked load or masked store. */
+static ULong dis_VMASKMOV ( Bool *uses_vvvv, const VexAbiInfo* vbi,
+                            Prefix pfx, Long delta,
+                            const HChar* opname, Bool isYMM, IRType ty,
+                            Bool isLoad )
 {
    HChar   dis_buf[50];
    Int     alen, i;
@@ -27246,49 +27247,54 @@
    UChar   modrm = getUChar(delta);
    UInt    rG    = gregOfRexRM(pfx,modrm);
    UInt    rV    = getVexNvvvv(pfx);
-   IRTemp  res[8], cond;
+
    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
-   if (isYMM) {
-      DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
-   } else {
-      DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
-   }
    delta += alen;
 
-   for (i = 0; i < sizeof(res)/sizeof(res[0]); i++)
-      res[i] = IRTemp_INVALID;
+   /**/ if (isLoad && isYMM) {
+      DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
+   }
+   else if (isLoad && !isYMM) {
+      DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
+   }
 
-   for (i = 0; i < 2 * (isYMM ? 2 : 1) * (ty == Ity_I32 ? 2 : 1); i++) {
-      res[i] = newTemp(ty);
-      cond = newTemp(Ity_I1);
-      assign( cond, 
-              binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
-                    ty == Ity_I32 ? getYMMRegLane32( rV, i )
-                                  : getYMMRegLane64( rV, i ),
+   else if (!isLoad && isYMM) {
+      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), nameYMMReg(rG), dis_buf );
+   }
+   else {
+      vassert(!isLoad && !isYMM);
+      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), nameXMMReg(rG), dis_buf );
+   }
+
+   vassert(ty == Ity_I32 || ty == Ity_I64);
+   Bool laneIs32 = ty == Ity_I32;
+
+   Int nLanes = (isYMM ? 2 : 1) * (laneIs32 ? 4 : 2);
+
+   for (i = 0; i < nLanes; i++) {
+      IRTemp cond = newTemp(Ity_I1);
+      assign( cond,
+              binop(laneIs32 ? Iop_CmpLT32S : Iop_CmpLT64S,
+                    (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rV, i ),
                     mkU(ty, 0) ));
-      stmt(
-         IRStmt_LoadG(
-            Iend_LE,
-            ty == Ity_I32 ? ILGop_Ident32 : ILGop_Ident64,
-            res[i], 
-            binop(Iop_Add64, mkexpr(addr), mkU64(i * (ty == Ity_I32 ? 4 : 8))),
-            ty == Ity_I32 ? mkU32(0) : mkU64(0),
-            mkexpr(cond)
-      ));
+      IRTemp  data = newTemp(ty);
+      IRExpr* ea   = binop(Iop_Add64, mkexpr(addr),
+                                      mkU64(i * (laneIs32 ? 4 : 8)));
+      if (isLoad) {
+         stmt(
+            IRStmt_LoadG(
+               Iend_LE, laneIs32 ? ILGop_Ident32 : ILGop_Ident64,
+               data, ea, laneIs32 ? mkU32(0) : mkU64(0), mkexpr(cond)
+         ));
+         (laneIs32 ? putYMMRegLane32 : putYMMRegLane64)( rG, i, mkexpr(data) );
+      } else {
+         assign(data, (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rG, i ));
+         stmt( IRStmt_StoreG(Iend_LE, ea, mkexpr(data), mkexpr(cond)) );
+      }
    }
-   switch (ty) {
-      case Ity_I32:
-         for (i = 0; i < 8; i++)
-            putYMMRegLane32( rG, i, (i < 4 || isYMM)
-                                    ? mkexpr(res[i]) : mkU32(0) );
-         break;
-      case Ity_I64:
-         for (i = 0; i < 4; i++)
-            putYMMRegLane64( rG, i, (i < 2 || isYMM)
-                                    ? mkexpr(res[i]) : mkU64(0) );
-         break;
-      default: vassert(0);
-   }
+
+   if (isLoad && !isYMM)
+      putYMMRegLane128( rG, 1, mkV128(0) );
 
    *uses_vvvv = True;
    return delta;
@@ -28202,15 +28208,15 @@
       /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2C /r */
       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
           && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
-                                    /*!isYMM*/False, Ity_I32 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
+                               /*!isYMM*/False, Ity_I32, /*isLoad*/True );
          goto decode_success;
       }
       /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2C /r */
       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
           && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
-                                    /*isYMM*/True, Ity_I32 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
+                               /*isYMM*/True, Ity_I32, /*isLoad*/True );
          goto decode_success;
       }
       break;
@@ -28219,15 +28225,15 @@
       /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2D /r */
       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
           && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
-                                    /*!isYMM*/False, Ity_I64 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
+                               /*!isYMM*/False, Ity_I64, /*isLoad*/True );
          goto decode_success;
       }
       /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2D /r */
       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
           && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
-                                    /*isYMM*/True, Ity_I64 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
+                               /*isYMM*/True, Ity_I64, /*isLoad*/True );
          goto decode_success;
       }
       break;
@@ -28788,29 +28794,60 @@
       /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */
       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
           && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
-                                    /*!isYMM*/False, Ity_I32 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+                               /*!isYMM*/False, Ity_I32, /*isLoad*/True );
          goto decode_success;
       }
       /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */
       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
           && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
-                                    /*isYMM*/True, Ity_I32 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+                               /*isYMM*/True, Ity_I32, /*isLoad*/True );
          goto decode_success;
       }
       /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */
       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
           && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
-                                    /*!isYMM*/False, Ity_I64 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+                               /*!isYMM*/False, Ity_I64, /*isLoad*/True );
          goto decode_success;
       }
       /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */
       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
           && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
-                                    /*isYMM*/True, Ity_I64 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+                               /*isYMM*/True, Ity_I64, /*isLoad*/True );
+         goto decode_success;
+      }
+      break;
+
+   case 0x8E:
+      /* VPMASKMOVD xmm2, xmm1, m128 = VEX.NDS.128.66.0F38.W0 8E /r */
+      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
+          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+                               /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
+         goto decode_success;
+      }
+      /* VPMASKMOVD ymm2, ymm1, m256 = VEX.NDS.256.66.0F38.W0 8E /r */
+      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
+          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+                               /*isYMM*/True, Ity_I32, /*!isLoad*/False );
+         goto decode_success;
+      }
+      /* VPMASKMOVQ xmm2, xmm1, m128 = VEX.NDS.128.66.0F38.W1 8E /r */
+      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
+          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+                               /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
+         goto decode_success;
+      }
+      /* VPMASKMOVQ ymm2, ymm1, m256 = VEX.NDS.256.66.0F38.W1 8E /r */
+      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
+          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+                               /*isYMM*/True, Ity_I64, /*!isLoad*/False );
          goto decode_success;
       }
       break;
diff --git a/priv/host_amd64_defs.c b/priv/host_amd64_defs.c
index fdbf05e..4cfd9a4 100644
--- a/priv/host_amd64_defs.c
+++ b/priv/host_amd64_defs.c
@@ -753,7 +753,18 @@
    i->Ain.CLoad.szB  = szB;
    i->Ain.CLoad.addr = addr;
    i->Ain.CLoad.dst  = dst;
-   vassert(cond != Acc_ALWAYS);
+   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
+   return i;
+}
+AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
+                                HReg src, AMD64AMode* addr ) {
+   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
+   i->tag             = Ain_CStore;
+   i->Ain.CStore.cond = cond;
+   i->Ain.CStore.szB  = szB;
+   i->Ain.CStore.src  = src;
+   i->Ain.CStore.addr = addr;
+   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
    return i;
 }
 AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
@@ -1135,13 +1146,24 @@
       case Ain_CLoad:
          vex_printf("if (%%rflags.%s) { ",
                     showAMD64CondCode(i->Ain.CLoad.cond));
-         vex_printf("mov%c (", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
+         vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
          ppAMD64AMode(i->Ain.CLoad.addr);
-         vex_printf("), ");
+         vex_printf(", ");
          (i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
             (i->Ain.CLoad.dst);
          vex_printf(" }");
          return;
+      case Ain_CStore:
+         vex_printf("if (%%rflags.%s) { ",
+                    showAMD64CondCode(i->Ain.CStore.cond));
+         vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
+         (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
+            (i->Ain.CStore.src);
+         vex_printf(", ");
+         ppAMD64AMode(i->Ain.CStore.addr);
+         vex_printf(" }");
+         return;
+
       case Ain_MovxLQ:
          vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
          ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
@@ -1488,6 +1510,10 @@
          addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
          addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
          return;
+      case Ain_CStore:
+         addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
+         addHRegUse(u, HRmRead, i->Ain.CStore.src);
+         return;
       case Ain_MovxLQ:
          addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
          addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
@@ -1724,6 +1750,10 @@
          mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
          mapReg(m, &i->Ain.CLoad.dst);
          return;
+      case Ain_CStore:
+         mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
+         mapReg(m, &i->Ain.CStore.src);
+         return;
       case Ain_MovxLQ:
          mapReg(m, &i->Ain.MovxLQ.src);
          mapReg(m, &i->Ain.MovxLQ.dst);
@@ -3035,6 +3065,35 @@
       goto done;
    }
 
+   case Ain_CStore: {
+      /* AFAICS this is identical to Ain_CLoad except that the opcode
+         is 0x89 instead of 0x8B. */
+      vassert(i->Ain.CStore.cond != Acc_ALWAYS);
+
+      /* Only 32- or 64-bit variants are allowed. */
+      vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
+
+      /* Use ptmp for backpatching conditional jumps. */
+      ptmp = NULL;
+
+      /* jmp fwds if !condition */
+      *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
+      ptmp = p; /* fill in this bit later */
+      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+
+      /* Now the store. */
+      rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
+      *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
+      *p++ = 0x89;
+      p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
+
+      /* Fix up the conditional branch */
+      Int delta = p - ptmp;
+      vassert(delta > 0 && delta < 40);
+      *ptmp = toUChar(delta-1);
+      goto done;
+   }
+
    case Ain_MovxLQ:
       /* No, _don't_ ask me why the sense of the args has to be
          different in the S vs Z case.  I don't know. */
diff --git a/priv/host_amd64_defs.h b/priv/host_amd64_defs.h
index 6ebe9b6..b3959e4 100644
--- a/priv/host_amd64_defs.h
+++ b/priv/host_amd64_defs.h
@@ -369,6 +369,7 @@
       Ain_XAssisted,   /* assisted transfer to GA */
       Ain_CMov64,      /* conditional move, 64-bit reg-reg only */
       Ain_CLoad,       /* cond. load to int reg, 32 bit ZX or 64 bit only */
+      Ain_CStore,      /* cond. store from int reg, 32 or 64 bit only */
       Ain_MovxLQ,      /* reg-reg move, zx-ing/sx-ing top half */
       Ain_LoadEX,      /* mov{s,z}{b,w,l}q from mem to reg */
       Ain_Store,       /* store 32/16/8 bit value in memory */
@@ -514,6 +515,14 @@
             AMD64AMode*   addr;
             HReg          dst;
          } CLoad;
+         /* cond. store from int reg, 32 or 64 bit only.
+            cond may not be Acc_ALWAYS. */
+         struct {
+            AMD64CondCode cond;
+            UChar         szB; /* 4 or 8 only */
+            HReg          src;
+            AMD64AMode*   addr;
+         } CStore;
          /* reg-reg move, sx-ing/zx-ing top half */
          struct {
             Bool syned;
@@ -721,6 +730,8 @@
 extern AMD64Instr* AMD64Instr_CMov64     ( AMD64CondCode, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_CLoad      ( AMD64CondCode cond, UChar szB,
                                            AMD64AMode* addr, HReg dst );
+extern AMD64Instr* AMD64Instr_CStore     ( AMD64CondCode cond, UChar szB,
+                                           HReg src, AMD64AMode* addr );
 extern AMD64Instr* AMD64Instr_MovxLQ     ( Bool syned, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_LoadEX     ( UChar szSmall, Bool syned,
                                            AMD64AMode* src, HReg dst );
diff --git a/priv/host_amd64_isel.c b/priv/host_amd64_isel.c
index 999ce95..a10e1fc 100644
--- a/priv/host_amd64_isel.c
+++ b/priv/host_amd64_isel.c
@@ -4314,6 +4314,28 @@
       return;
    }
 
+   /* --------- STOREG (guarded store) --------- */
+   case Ist_StoreG: {
+      IRStoreG* sg = stmt->Ist.StoreG.details;
+      if (sg->end != Iend_LE)
+         goto stmt_fail;
+
+      UChar szB = 0; /* invalid */
+      switch (typeOfIRExpr(env->type_env, sg->data)) {
+         case Ity_I32: szB = 4; break;
+         case Ity_I64: szB = 8; break;
+         default: break;
+      }
+      if (szB == 0)
+         goto stmt_fail;
+
+      AMD64AMode*   amAddr = iselIntExpr_AMode(env, sg->addr);
+      HReg          rSrc   = iselIntExpr_R(env, sg->data);
+      AMD64CondCode cc     = iselCondCode(env, sg->guard);
+      addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
+      return;
+   }
+
    /* --------- STORE --------- */
    case Ist_Store: {
       IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);