Handle the new IROps introduced to support ARM64 SIMD.
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@14362 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 073cff0..fef5b49 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -48,6 +48,13 @@
Check the interpretation for vector narrowing and widening ops,
particularly the saturating ones. I suspect they are either overly
pessimistic and/or wrong.
+
+ Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
+ saturating shifts): the interpretation is overly pessimistic.
+ See comments on the relevant cases below for details.
+
+ Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
+ both rounding and non-rounding variants): ditto
*/
/* This file implements the Memcheck instrumentation, and in
@@ -790,6 +797,31 @@
return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
}
+ if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
+ /* Use InterleaveHI64x2 to copy the top half of the vector into
+ the bottom half. Then we can UifU it with the original, throw
+ away the upper half of the result, and PCast-I64-to-I64
+ the lower half. */
+ // Generates vbits[127:64] : vbits[127:64]
+ IRAtom* hi64hi64
+ = assignNew('V', mce, Ity_V128,
+ binop(Iop_InterleaveHI64x2, vbits, vbits));
+ // Generates
+ // UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
+ // == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
+ IRAtom* lohi64
+ = mkUifUV128(mce, hi64hi64, vbits);
+ // Generates UifU(vbits[127:64],vbits[63:0])
+ IRAtom* lo64
+ = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
+ // Generates
+      //    PCast-to-I64( UifU(vbits[127:64], vbits[63:0]) )
+ // == PCast-to-I64( vbits[127:0] )
+ IRAtom* res
+ = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
+ return res;
+ }
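+   // Net effect: a PCast of all 128 vbits down to I64, avoiding the
+   // slower fallback below, which first collapses the vbits to a single bit.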
+
/* Else do it the slow way .. */
/* First of all, collapse vbits down to a single bit. */
tmp1 = NULL;
@@ -857,6 +889,42 @@
}
}
+/* This is a minor variant of mkPCastTo. It takes an arg of some type and returns
+ a value of the same type. The result consists entirely of Defined
+ (zero) bits except its least significant bit, which is a PCast of
+ the entire argument down to a single bit. */
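+/* (Equivalently: the lsb of the result is undefined iff any bit of the
+   argument is undefined; every other result bit is defined.) */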
+static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
+{
+ if (ty == Ity_V128) {
+ /* --- Case for V128 --- */
+ IRAtom* varg128 = varg;
+ // generates: PCast-to-I64(varg128)
+ IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
+ // Now introduce zeros (defined bits) in the top 63 places
+ // generates: Def--(63)--Def PCast-to-I1(varg128)
+ IRAtom* d63pc
+ = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
+ // generates: Def--(64)--Def
+ IRAtom* d64
+ = definedOfType(Ity_I64);
+ // generates: Def--(127)--Def PCast-to-I1(varg128)
+ IRAtom* res
+ = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
+ return res;
+ }
+ if (ty == Ity_I64) {
+ /* --- Case for I64 --- */
+ // PCast to 64
+ IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
+ // Zero (Def) out the top 63 bits
+ IRAtom* res
+ = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
+ return res;
+ }
+ /*NOTREACHED*/
+ tl_assert(0);
+}
+
/* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
/*
Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
@@ -3123,11 +3191,20 @@
return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
/* V x V shifts/rotates are done using the standard lazy scheme. */
+ /* For the non-rounding variants of bi-di vector x vector
+ shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
+ But note that this is overly pessimistic, because in fact only
+ the bottom 8 bits of each lane of the second argument are taken
+ into account when shifting. So really we ought to ignore
+ undefinedness in bits 8 and above of each lane in the
+ second argument. */
case Iop_Shl8x16:
case Iop_Shr8x16:
case Iop_Sar8x16:
case Iop_Sal8x16:
case Iop_Rol8x16:
+ case Iop_Sh8Sx16:
+ case Iop_Sh8Ux16:
return mkUifUV128(mce,
assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
mkPCast8x16(mce,vatom2)
@@ -3138,6 +3215,8 @@
case Iop_Sar16x8:
case Iop_Sal16x8:
case Iop_Rol16x8:
+ case Iop_Sh16Sx8:
+ case Iop_Sh16Ux8:
return mkUifUV128(mce,
assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
mkPCast16x8(mce,vatom2)
@@ -3148,6 +3227,8 @@
case Iop_Sar32x4:
case Iop_Sal32x4:
case Iop_Rol32x4:
+ case Iop_Sh32Sx4:
+ case Iop_Sh32Ux4:
return mkUifUV128(mce,
assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
mkPCast32x4(mce,vatom2)
@@ -3158,11 +3239,31 @@
case Iop_Sar64x2:
case Iop_Sal64x2:
case Iop_Rol64x2:
+ case Iop_Sh64Sx2:
+ case Iop_Sh64Ux2:
return mkUifUV128(mce,
assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
mkPCast64x2(mce,vatom2)
);
+ /* For the rounding variants of bi-di vector x vector shifts, the
+ rounding adjustment can cause undefinedness to propagate through
+ the entire lane, in the worst case. Too complex to handle
+ properly .. just UifU the arguments and then PCast them.
+ Suboptimal but safe. */
+ case Iop_Rsh8Sx16:
+ case Iop_Rsh8Ux16:
+ return binary8Ix16(mce, vatom1, vatom2);
+ case Iop_Rsh16Sx8:
+ case Iop_Rsh16Ux8:
+ return binary16Ix8(mce, vatom1, vatom2);
+ case Iop_Rsh32Sx4:
+ case Iop_Rsh32Ux4:
+ return binary32Ix4(mce, vatom1, vatom2);
+ case Iop_Rsh64Sx2:
+ case Iop_Rsh64Ux2:
+ return binary64Ix2(mce, vatom1, vatom2);
+
case Iop_F32ToFixed32Ux4_RZ:
case Iop_F32ToFixed32Sx4_RZ:
case Iop_Fixed32UToF32x4_RN:
@@ -3191,6 +3292,8 @@
case Iop_Avg8Sx16:
case Iop_QAdd8Ux16:
case Iop_QAdd8Sx16:
+ case Iop_QAddExtUSsatSS8x16:
+ case Iop_QAddExtSUsatUU8x16:
case Iop_QSal8x16:
case Iop_QShl8x16:
case Iop_Add8x16:
@@ -3216,6 +3319,8 @@
case Iop_Avg16Sx8:
case Iop_QAdd16Ux8:
case Iop_QAdd16Sx8:
+ case Iop_QAddExtUSsatSS16x8:
+ case Iop_QAddExtSUsatUU16x8:
case Iop_QSal16x8:
case Iop_QShl16x8:
case Iop_Add16x8:
@@ -3232,6 +3337,8 @@
case Iop_QAdd32Ux4:
case Iop_QSub32Sx4:
case Iop_QSub32Ux4:
+ case Iop_QAddExtUSsatSS32x4:
+ case Iop_QAddExtSUsatUU32x4:
case Iop_QSal32x4:
case Iop_QShl32x4:
case Iop_Avg32Ux4:
@@ -3262,6 +3369,8 @@
case Iop_QAdd64Sx2:
case Iop_QSub64Ux2:
case Iop_QSub64Sx2:
+ case Iop_QAddExtUSsatSS64x2:
+ case Iop_QAddExtSUsatUU64x2:
case Iop_PolynomialMulAdd64x2:
case Iop_CipherV128:
case Iop_CipherLV128:
@@ -3359,6 +3468,80 @@
complainIfUndefined(mce, atom2, NULL);
return mkPCast32x4(mce, vatom1);
+ /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
+ To make this simpler, do the following:
+ * complain if the shift amount (the I8) is undefined
+ * pcast each lane at the wide width
+ * truncate each lane to half width
+ * pcast the resulting 64-bit value to a single bit and use
+ that as the least significant bit of the upper half of the
+ result. */
+ case Iop_QandQShrNnarrow64Uto32Ux2:
+ case Iop_QandQSarNnarrow64Sto32Sx2:
+ case Iop_QandQSarNnarrow64Sto32Ux2:
+ case Iop_QandQRShrNnarrow64Uto32Ux2:
+ case Iop_QandQRSarNnarrow64Sto32Sx2:
+ case Iop_QandQRSarNnarrow64Sto32Ux2:
+ case Iop_QandQShrNnarrow32Uto16Ux4:
+ case Iop_QandQSarNnarrow32Sto16Sx4:
+ case Iop_QandQSarNnarrow32Sto16Ux4:
+ case Iop_QandQRShrNnarrow32Uto16Ux4:
+ case Iop_QandQRSarNnarrow32Sto16Sx4:
+ case Iop_QandQRSarNnarrow32Sto16Ux4:
+ case Iop_QandQShrNnarrow16Uto8Ux8:
+ case Iop_QandQSarNnarrow16Sto8Sx8:
+ case Iop_QandQSarNnarrow16Sto8Ux8:
+ case Iop_QandQRShrNnarrow16Uto8Ux8:
+ case Iop_QandQRSarNnarrow16Sto8Sx8:
+ case Iop_QandQRSarNnarrow16Sto8Ux8:
+ {
+ IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
+ IROp opNarrow = Iop_INVALID;
+ switch (op) {
+ case Iop_QandQShrNnarrow64Uto32Ux2:
+ case Iop_QandQSarNnarrow64Sto32Sx2:
+ case Iop_QandQSarNnarrow64Sto32Ux2:
+ case Iop_QandQRShrNnarrow64Uto32Ux2:
+ case Iop_QandQRSarNnarrow64Sto32Sx2:
+ case Iop_QandQRSarNnarrow64Sto32Ux2:
+ fnPessim = mkPCast64x2;
+ opNarrow = Iop_NarrowUn64to32x2;
+ break;
+ case Iop_QandQShrNnarrow32Uto16Ux4:
+ case Iop_QandQSarNnarrow32Sto16Sx4:
+ case Iop_QandQSarNnarrow32Sto16Ux4:
+ case Iop_QandQRShrNnarrow32Uto16Ux4:
+ case Iop_QandQRSarNnarrow32Sto16Sx4:
+ case Iop_QandQRSarNnarrow32Sto16Ux4:
+ fnPessim = mkPCast32x4;
+ opNarrow = Iop_NarrowUn32to16x4;
+ break;
+ case Iop_QandQShrNnarrow16Uto8Ux8:
+ case Iop_QandQSarNnarrow16Sto8Sx8:
+ case Iop_QandQSarNnarrow16Sto8Ux8:
+ case Iop_QandQRShrNnarrow16Uto8Ux8:
+ case Iop_QandQRSarNnarrow16Sto8Sx8:
+ case Iop_QandQRSarNnarrow16Sto8Ux8:
+ fnPessim = mkPCast16x8;
+ opNarrow = Iop_NarrowUn16to8x8;
+ break;
+ default:
+ tl_assert(0);
+ }
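+         // atom2 is the I8 shift amount; complain if it is undefined.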
+ complainIfUndefined(mce, atom2, NULL);
+ // Pessimised shift result
+ IRAtom* shV
+ = fnPessim(mce, vatom1);
+ // Narrowed, pessimised shift result
+ IRAtom* shVnarrowed
+ = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
+ // Generates: Def--(63)--Def PCast-to-I1(narrowed)
+ IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
+ // and assemble the result
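+         // (upper 64 bits = qV: defined zeroes with the PCast bit in the lsb;
+         //  lower 64 bits = shVnarrowed: the narrowed, pessimised shift vbits)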
+ return assignNew('V', mce, Ity_V128,
+ binop(Iop_64HLtoV128, qV, shVnarrowed));
+ }
+
case Iop_Mull32Sx2:
case Iop_Mull32Ux2:
case Iop_QDMull32Sx2:
@@ -3977,6 +4160,66 @@
mkPCast32x8(mce, vatom2)
);
+ /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
+ Handle the shifted results in the same way that other
+ binary Q ops are handled, eg QSub: UifU the two args,
+ then pessimise -- which is binaryNIxM. But for the upper
+         V128, we need to generate just 1 bit: the PCast of the
+         pessimised shift result down to a single bit, with 127
+         defined zeroes above it.
+
+         Note that this is overly pessimistic in that in fact only the
+         bottom 8 bits of each lane of the second arg determine the shift
+         amount.  Really we ought to ignore any undefinedness in the
+         remaining bits of each lane of the second arg. */
+ case Iop_QandSQsh64x2: case Iop_QandUQsh64x2:
+ case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
+ case Iop_QandSQsh32x4: case Iop_QandUQsh32x4:
+ case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
+ case Iop_QandSQsh16x8: case Iop_QandUQsh16x8:
+ case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
+ case Iop_QandSQsh8x16: case Iop_QandUQsh8x16:
+ case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
+ {
+ // The function to generate the pessimised shift result
+ IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
+ switch (op) {
+ case Iop_QandSQsh64x2:
+ case Iop_QandUQsh64x2:
+ case Iop_QandSQRsh64x2:
+ case Iop_QandUQRsh64x2:
+ binaryNIxM = binary64Ix2;
+ break;
+ case Iop_QandSQsh32x4:
+ case Iop_QandUQsh32x4:
+ case Iop_QandSQRsh32x4:
+ case Iop_QandUQRsh32x4:
+ binaryNIxM = binary32Ix4;
+ break;
+ case Iop_QandSQsh16x8:
+ case Iop_QandUQsh16x8:
+ case Iop_QandSQRsh16x8:
+ case Iop_QandUQRsh16x8:
+ binaryNIxM = binary16Ix8;
+ break;
+ case Iop_QandSQsh8x16:
+ case Iop_QandUQsh8x16:
+ case Iop_QandSQRsh8x16:
+ case Iop_QandUQRsh8x16:
+ binaryNIxM = binary8Ix16;
+ break;
+ default:
+ tl_assert(0);
+ }
+ tl_assert(binaryNIxM);
+ // Pessimised shift result, shV[127:0]
+ IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
+ // Generates: Def--(127)--Def PCast-to-I1(shV)
+ IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
+ // and assemble the result
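+         // (upper V128 = qV: defined zeroes with the PCast bit in the lsb;
+         //  lower V128 = shV: the per-lane pessimised shift vbits)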
+ return assignNew('V', mce, Ity_V256,
+ binop(Iop_V128HLtoV256, qV, shV));
+ }
+
default:
ppIROp(op);
VG_(tool_panic)("memcheck:expr2vbits_Binop");
@@ -4047,6 +4290,7 @@
case Iop_Dup8x16:
case Iop_Dup16x8:
case Iop_Dup32x4:
+ case Iop_Reverse1sIn8_x16:
case Iop_Reverse8sIn16_x8:
case Iop_Reverse8sIn32_x4:
case Iop_Reverse16sIn32_x4:
@@ -4232,6 +4476,7 @@
case Iop_FtoI32Ux4_RZ:
case Iop_FtoI32Sx4_RZ:
case Iop_Abs32x4:
+ case Iop_RSqrtEst32Ux4:
return mkPCast32x4(mce, vatom);
case Iop_CmpwNEZ32: