Use IR conditional stores (IRStoreG) to implement AVX2 conditional
vector stores, VPMASKMOV{D,Q} xmm/ymm to memory.
git-svn-id: svn://svn.valgrind.org/vex/trunk@3077 8f6e269a-dfd6-0310-a8e1-e2731360e62c
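As a reference point for the changes below, the architectural behaviour being
modelled is simple: each lane of the source register is written to memory iff
the most significant bit of the corresponding mask lane is set.  A minimal C
sketch of the 128-bit VPMASKMOVD store form (illustrative only -- these names
are not VEX code, and the sketch ignores the per-lane fault suppression the
real instruction provides):

   #include <stdint.h>

   void vpmaskmovd_store_ref ( uint32_t* mem,
                               const int32_t  mask[4],  /* mask register */
                               const uint32_t src[4] )  /* data register */
   {
      for (int i = 0; i < 4; i++)
         if (mask[i] < 0)     /* signed "< 0" == test of the lane's MSB */
            mem[i] = src[i];  /* unselected lanes leave memory untouched */
   }

The signed compare against zero is exactly the per-lane test the front end
below builds with Iop_CmpLT32S/Iop_CmpLT64S.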
diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c
index d0c2233..35aaa73 100644
--- a/priv/guest_amd64_toIR.c
+++ b/priv/guest_amd64_toIR.c
@@ -27235,10 +27235,11 @@
}
-/* Masked load. */
-static ULong dis_VMASKMOV_load ( Bool *uses_vvvv, const VexAbiInfo* vbi,
- Prefix pfx, Long delta,
- const HChar* opname, Bool isYMM, IRType ty )
+/* Masked load or masked store. */
+static ULong dis_VMASKMOV ( Bool *uses_vvvv, const VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ const HChar* opname, Bool isYMM, IRType ty,
+ Bool isLoad )
{
HChar dis_buf[50];
Int alen, i;
@@ -27246,49 +27247,54 @@
UChar modrm = getUChar(delta);
UInt rG = gregOfRexRM(pfx,modrm);
UInt rV = getVexNvvvv(pfx);
- IRTemp res[8], cond;
+
addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
- if (isYMM) {
- DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
- } else {
- DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
- }
delta += alen;
- for (i = 0; i < sizeof(res)/sizeof(res[0]); i++)
- res[i] = IRTemp_INVALID;
- for (i = 0; i < 2 * (isYMM ? 2 : 1) * (ty == Ity_I32 ? 2 : 1); i++) {
- res[i] = newTemp(ty);
- cond = newTemp(Ity_I1);
- assign( cond,
- binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
- ty == Ity_I32 ? getYMMRegLane32( rV, i )
- : getYMMRegLane64( rV, i ),
+ /**/ if (isLoad && isYMM) {
+ DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
+ }
+ else if (isLoad && !isYMM) {
+ DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
+ }
+ else if (!isLoad && isYMM) {
+ DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), nameYMMReg(rG), dis_buf );
+ }
+ else {
+ vassert(!isLoad && !isYMM);
+ DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), nameXMMReg(rG), dis_buf );
+ }
+
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ Bool laneIs32 = ty == Ity_I32;
+
+ Int nLanes = (isYMM ? 2 : 1) * (laneIs32 ? 4 : 2);
+
+ for (i = 0; i < nLanes; i++) {
+ IRTemp cond = newTemp(Ity_I1);
+ assign( cond,
+ binop(laneIs32 ? Iop_CmpLT32S : Iop_CmpLT64S,
+ (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rV, i ),
mkU(ty, 0) ));
- stmt(
- IRStmt_LoadG(
- Iend_LE,
- ty == Ity_I32 ? ILGop_Ident32 : ILGop_Ident64,
- res[i],
- binop(Iop_Add64, mkexpr(addr), mkU64(i * (ty == Ity_I32 ? 4 : 8))),
- ty == Ity_I32 ? mkU32(0) : mkU64(0),
- mkexpr(cond)
- ));
+ IRTemp data = newTemp(ty);
+ IRExpr* ea = binop(Iop_Add64, mkexpr(addr),
+ mkU64(i * (laneIs32 ? 4 : 8)));
+ if (isLoad) {
+ stmt(
+ IRStmt_LoadG(
+ Iend_LE, laneIs32 ? ILGop_Ident32 : ILGop_Ident64,
+ data, ea, laneIs32 ? mkU32(0) : mkU64(0), mkexpr(cond)
+ ));
+ (laneIs32 ? putYMMRegLane32 : putYMMRegLane64)( rG, i, mkexpr(data) );
+ } else {
+ assign(data, (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rG, i ));
+ stmt( IRStmt_StoreG(Iend_LE, ea, mkexpr(data), mkexpr(cond)) );
+ }
}
- switch (ty) {
- case Ity_I32:
- for (i = 0; i < 8; i++)
- putYMMRegLane32( rG, i, (i < 4 || isYMM)
- ? mkexpr(res[i]) : mkU32(0) );
- break;
- case Ity_I64:
- for (i = 0; i < 4; i++)
- putYMMRegLane64( rG, i, (i < 2 || isYMM)
- ? mkexpr(res[i]) : mkU64(0) );
- break;
- default: vassert(0);
- }
+
+ if (isLoad && !isYMM)
+ putYMMRegLane128( rG, 1, mkV128(0) );
*uses_vvvv = True;
return delta;
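For reference, the lane count computed above enumerates as follows for the
four instruction variants (a sanity table derived from the expression, not
part of the patch):

   nLanes = (isYMM ? 2 : 1) * (laneIs32 ? 4 : 2)

   ty       isYMM   nLanes
   Ity_I32  False   4        (4 x 32 = 128 bits)
   Ity_I32  True    8        (8 x 32 = 256 bits)
   Ity_I64  False   2        (2 x 64 = 128 bits)
   Ity_I64  True    4        (4 x 64 = 256 bits)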
@@ -28202,15 +28208,15 @@
/* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2C /r */
if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
&& !epartIsReg(getUChar(delta))) {
- delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
- /*!isYMM*/False, Ity_I32 );
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
+ /*!isYMM*/False, Ity_I32, /*isLoad*/True );
goto decode_success;
}
/* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2C /r */
if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
&& !epartIsReg(getUChar(delta))) {
- delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
- /*isYMM*/True, Ity_I32 );
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
+ /*isYMM*/True, Ity_I32, /*isLoad*/True );
goto decode_success;
}
break;
@@ -28219,15 +28225,15 @@
/* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2D /r */
if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
&& !epartIsReg(getUChar(delta))) {
- delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
- /*!isYMM*/False, Ity_I64 );
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
+ /*!isYMM*/False, Ity_I64, /*isLoad*/True );
goto decode_success;
}
/* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2D /r */
if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
&& !epartIsReg(getUChar(delta))) {
- delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
- /*isYMM*/True, Ity_I64 );
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
+ /*isYMM*/True, Ity_I64, /*isLoad*/True );
goto decode_success;
}
break;
@@ -28788,29 +28794,60 @@
/* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */
if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
&& 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
- delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
- /*!isYMM*/False, Ity_I32 );
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+ /*!isYMM*/False, Ity_I32, /*isLoad*/True );
goto decode_success;
}
/* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */
if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
&& 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
- delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
- /*isYMM*/True, Ity_I32 );
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+ /*isYMM*/True, Ity_I32, /*isLoad*/True );
goto decode_success;
}
/* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */
if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
&& 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
- delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
- /*!isYMM*/False, Ity_I64 );
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+ /*!isYMM*/False, Ity_I64, /*isLoad*/True );
goto decode_success;
}
/* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */
if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
&& 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
- delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
- /*isYMM*/True, Ity_I64 );
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+ /*isYMM*/True, Ity_I64, /*isLoad*/True );
+ goto decode_success;
+ }
+ break;
+
+ case 0x8E:
+ /* VPMASKMOVD xmm2, xmm1, m128 = VEX.NDS.128.66.0F38.W0 8E /r */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
+ && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+ /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
+ goto decode_success;
+ }
+ /* VPMASKMOVD ymm2, ymm1, m256 = VEX.NDS.256.66.0F38.W0 8E /r */
+ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
+ && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+ /*isYMM*/True, Ity_I32, /*!isLoad*/False );
+ goto decode_success;
+ }
+ /* VPMASKMOVQ xmm2, xmm1, m128 = VEX.NDS.128.66.0F38.W1 8E /r */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
+ && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+ /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
+ goto decode_success;
+ }
+ /* VPMASKMOVQ ymm2, ymm1, m256 = VEX.NDS.256.66.0F38.W1 8E /r */
+ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
+ && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
+ delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+ /*isYMM*/True, Ity_I64, /*!isLoad*/False );
goto decode_success;
}
break;
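For one 32-bit lane of a masked store, the decode cases above therefore end
up emitting IR along these lines (a hand-written sketch; temporary numbering
and the exact pretty-printing are illustrative, not actual VEX output):

   t6 = CmpLT32S(GET:I32(<mask lane i>),0x0:I32)
   t7 = GET:I32(<data lane i>)
   if (t6) STle(Add64(t5,0x4:I64)) = t7

The guarded-load case differs only in needing an alternative value (zero)
for lanes whose guard is false, which is why IRStmt_LoadG takes the extra
mkU32(0)/mkU64(0) argument and IRStmt_StoreG does not.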
diff --git a/priv/host_amd64_defs.c b/priv/host_amd64_defs.c
index fdbf05e..4cfd9a4 100644
--- a/priv/host_amd64_defs.c
+++ b/priv/host_amd64_defs.c
@@ -753,7 +753,18 @@
i->Ain.CLoad.szB = szB;
i->Ain.CLoad.addr = addr;
i->Ain.CLoad.dst = dst;
- vassert(cond != Acc_ALWAYS);
+ vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
+ return i;
+}
+AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
+ HReg src, AMD64AMode* addr ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_CStore;
+ i->Ain.CStore.cond = cond;
+ i->Ain.CStore.szB = szB;
+ i->Ain.CStore.src = src;
+ i->Ain.CStore.addr = addr;
+ vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
return i;
}
AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
@@ -1135,13 +1146,24 @@
case Ain_CLoad:
vex_printf("if (%%rflags.%s) { ",
showAMD64CondCode(i->Ain.CLoad.cond));
- vex_printf("mov%c (", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
+ vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
ppAMD64AMode(i->Ain.CLoad.addr);
- vex_printf("), ");
+ vex_printf(", ");
(i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
(i->Ain.CLoad.dst);
vex_printf(" }");
return;
+ case Ain_CStore:
+ vex_printf("if (%%rflags.%s) { ",
+ showAMD64CondCode(i->Ain.CStore.cond));
+ vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
+ (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
+ (i->Ain.CStore.src);
+ vex_printf(", ");
+ ppAMD64AMode(i->Ain.CStore.addr);
+ vex_printf(" }");
+ return;
+
case Ain_MovxLQ:
vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
@@ -1488,6 +1510,10 @@
addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
return;
+ case Ain_CStore:
+ addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
+ addHRegUse(u, HRmRead, i->Ain.CStore.src);
+ return;
case Ain_MovxLQ:
addHRegUse(u, HRmRead, i->Ain.MovxLQ.src);
addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
@@ -1724,6 +1750,10 @@
mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
mapReg(m, &i->Ain.CLoad.dst);
return;
+ case Ain_CStore:
+ mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
+ mapReg(m, &i->Ain.CStore.src);
+ return;
case Ain_MovxLQ:
mapReg(m, &i->Ain.MovxLQ.src);
mapReg(m, &i->Ain.MovxLQ.dst);
@@ -3035,6 +3065,35 @@
goto done;
}
+ case Ain_CStore: {
+ /* AFAICS this is identical to Ain_CLoad except that the opcode
+ is 0x89 instead of 0x8B. */
+ vassert(i->Ain.CStore.cond != Acc_ALWAYS);
+
+ /* Only 32- or 64-bit variants are allowed. */
+ vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
+
+ /* Use ptmp for backpatching conditional jumps. */
+ ptmp = NULL;
+
+ /* jmp fwds if !condition */
+ *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
+ ptmp = p; /* fill in this bit later */
+ *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+
+ /* Now the store. */
+ rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
+ *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
+ *p++ = 0x89;
+ p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
+
+ /* Fix up the conditional branch */
+ Int delta = p - ptmp;
+ vassert(delta > 0 && delta < 40);
+ *ptmp = toUChar(delta-1);
+ goto done;
+ }
+
case Ain_MovxLQ:
/* No, _don't_ ask me why the sense of the args has to be
different in the S vs Z case. I don't know. */
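In instruction bytes, the Ain_CStore case above produces a conditional
branch over an ordinary store.  For a guard of Acc_Z and the 8-byte variant,
the sequence comes out roughly as (AT&T syntax; register and displacement
chosen for illustration):

   jnz   1f                 # 0x70 + (Acc_Z ^ 1) = 0x75; rel8 backpatched
   movq  %r10, 0x8(%rbp)    # REX.W + 0x89 /r
1:

The `cond ^ 1' trick works because the AMD64CondCode values mirror the
hardware condition-code encoding, which pairs each condition with its
complement at adjacent even/odd values.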
diff --git a/priv/host_amd64_defs.h b/priv/host_amd64_defs.h
index 6ebe9b6..b3959e4 100644
--- a/priv/host_amd64_defs.h
+++ b/priv/host_amd64_defs.h
@@ -369,6 +369,7 @@
Ain_XAssisted, /* assisted transfer to GA */
Ain_CMov64, /* conditional move, 64-bit reg-reg only */
Ain_CLoad, /* cond. load to int reg, 32 bit ZX or 64 bit only */
+ Ain_CStore, /* cond. store from int reg, 32 or 64 bit only */
Ain_MovxLQ, /* reg-reg move, zx-ing/sx-ing top half */
Ain_LoadEX, /* mov{s,z}{b,w,l}q from mem to reg */
Ain_Store, /* store 32/16/8 bit value in memory */
@@ -514,6 +515,14 @@
AMD64AMode* addr;
HReg dst;
} CLoad;
+ /* cond. store from int reg, 32 or 64 bit only.
+ cond may not be Acc_ALWAYS. */
+ struct {
+ AMD64CondCode cond;
+ UChar szB; /* 4 or 8 only */
+ HReg src;
+ AMD64AMode* addr;
+ } CStore;
/* reg-reg move, sx-ing/zx-ing top half */
struct {
Bool syned;
@@ -721,6 +730,8 @@
extern AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode, HReg src, HReg dst );
extern AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
AMD64AMode* addr, HReg dst );
+extern AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
+ HReg src, AMD64AMode* addr );
extern AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst );
extern AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
AMD64AMode* src, HReg dst );
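Purely as an illustration of the new constructor (condition, register and
address chosen arbitrarily), the sequence shown in the encoder sketch above
would be represented as:

   /* "if (%rflags.z) { movq %r10, 0x8(%rbp) }" -- illustrative only */
   AMD64AMode* am  = AMD64AMode_IR( 8, hregAMD64_RBP() );
   AMD64Instr* ins = AMD64Instr_CStore( Acc_Z, 8, hregAMD64_R10(), am );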
diff --git a/priv/host_amd64_isel.c b/priv/host_amd64_isel.c
index 999ce95..a10e1fc 100644
--- a/priv/host_amd64_isel.c
+++ b/priv/host_amd64_isel.c
@@ -4314,6 +4314,28 @@
return;
}
+ /* --------- STOREG (guarded store) --------- */
+ case Ist_StoreG: {
+ IRStoreG* sg = stmt->Ist.StoreG.details;
+ if (sg->end != Iend_LE)
+ goto stmt_fail;
+
+ UChar szB = 0; /* invalid */
+ switch (typeOfIRExpr(env->type_env, sg->data)) {
+ case Ity_I32: szB = 4; break;
+ case Ity_I64: szB = 8; break;
+ default: break;
+ }
+ if (szB == 0)
+ goto stmt_fail;
+
+ AMD64AMode* amAddr = iselIntExpr_AMode(env, sg->addr);
+ HReg rSrc = iselIntExpr_R(env, sg->data);
+ AMD64CondCode cc = iselCondCode(env, sg->guard);
+ addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
+ return;
+ }
+
/* --------- STORE --------- */
case Ist_Store: {
IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
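Taken together, a guarded store now flows front end -> flat IR -> back end:
the selector evaluates the address and the data first, then materialises the
guard in %rflags via iselCondCode immediately before emitting the CStore, so
nothing can clobber the condition codes in between.  Schematically (both
renderings are illustrative sketches, not actual tool output):

   IR:     if (t6) STle(t5) = t7
   amd64:  if (%rflags.z) { movq %r10, 0x8(%rbp) }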