arm64: implement: sadalp uadalp saddlp uaddlp saddlv uaddlv saddw{2}
uaddw{2} ssubw{2} usubw{2} shadd uhadd shsub uhsub sqadd uqadd sqsub
uqsub smaxp umaxp sminp uminp
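
The halving ops (SHADD/UHADD/SHSUB/UHSUB) are modelled without any new
IR primops: both operands are widened to twice the lane width, the add
or subtract is done at the wide width, the result is shifted right by
one (arithmetically for the signed forms, logically for the unsigned
ones), and the lanes are then narrowed back down.  The saturating ops
(SQADD/UQADD/SQSUB/UQSUB) compute both the saturating and the plain
wrapping result and OR the XOR of the two into QCFLAG, so the sticky
saturation flag becomes, and stays, nonzero whenever any lane
saturates.

As a rough scalar sketch of the intended per-lane semantics, assuming
8-bit unsigned lanes (reference only, not VEX code):

    static unsigned char ref_uhadd8 ( unsigned char a, unsigned char b ) {
       /* widen, add, halve, narrow -- the 16-bit intermediate cannot wrap */
       return (unsigned char)(((unsigned int)a + (unsigned int)b) >> 1);
    }
    static unsigned char ref_uqadd8 ( unsigned char a, unsigned char b ) {
       unsigned int sum = (unsigned int)a + (unsigned int)b;
       return (unsigned char)(sum > 0xFF ? 0xFF : sum);  /* saturate to 0xFF */
    }
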
git-svn-id: svn://svn.valgrind.org/vex/trunk@2895 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index 166a727..c084abe 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -539,6 +539,113 @@
}
}
+static IROp mkVecADD ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecQADDU ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecQADDS ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecSUB ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecQSUBU ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecQSUBS ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecSARN ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecSHRN ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecSHLN ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecCATEVENLANES ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
+ Iop_CatEvenLanes32x4, Iop_InterleaveLO64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecCATODDLANES ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_CatOddLanes8x16, Iop_CatOddLanes16x8,
+ Iop_CatOddLanes32x4, Iop_InterleaveHI64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecMAXU ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecMAXS ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecMINU ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecMINS ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
static IRExpr* mkU ( IRType ty, ULong imm ) {
switch (ty) {
case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
@@ -1730,6 +1837,14 @@
vassert(0);
}
+/* The same, but from an expression instead. */
+static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
+{
+ IRTemp fullWidthT = newTemp(Ity_V128);
+ assign(fullWidthT, fullWidth);
+ return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
+}
+
/*------------------------------------------------------------*/
/*--- FP comparison helpers ---*/
@@ -5539,6 +5654,18 @@
assign(res, unop(Iop_ZeroHI96ofV128, mkexpr(max3210)));
return res;
}
+ case Iop_Add64x2: {
+ IRTemp x10 = src;
+ IRTemp x00 = newTemp(Ity_V128);
+ IRTemp x11 = newTemp(Ity_V128);
+ assign(x11, binop(Iop_InterleaveHI64x2, mkexpr(x10), mkexpr(x10)));
+ assign(x00, binop(Iop_InterleaveLO64x2, mkexpr(x10), mkexpr(x10)));
+ IRTemp max10 = newTemp(Ity_V128);
+ assign(max10, binop(op, mkexpr(x11), mkexpr(x00)));
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, unop(Iop_ZeroHI64ofV128, mkexpr(max10)));
+ return res;
+ }
default:
vassert(0);
}
@@ -5688,8 +5815,8 @@
either the lower or upper set of lanes to twice-as-wide,
resulting in a new V128 value. */
static
-IRTemp math_WIDEN_LANES ( Bool zWiden, Bool fromUpperHalf,
- UInt sizeNarrow, IRExpr* srcE )
+IRTemp math_WIDEN_LO_OR_HI_LANES ( Bool zWiden, Bool fromUpperHalf,
+ UInt sizeNarrow, IRExpr* srcE )
{
IRTemp src = newTemp(Ity_V128);
IRTemp res = newTemp(Ity_V128);
@@ -5729,6 +5856,49 @@
}
+/* Generate IR that takes a V128 and sign- or zero-widens
+ either the even or odd lanes to twice-as-wide,
+ resulting in a new V128 value. */
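+/* For example, with sizeNarrow == X00 the vector is viewed as 16-bit
+   lanes, each holding a pair of 8-bit narrow lanes: the odd (upper)
+   byte of each pair is extracted as lane >> 8, and the even (lower)
+   byte as (lane << 8) >> 8, where the right shift is logical when
+   zero-widening and arithmetic when sign-widening. */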
+static
+IRTemp math_WIDEN_EVEN_OR_ODD_LANES ( Bool zWiden, Bool fromOdd,
+ UInt sizeNarrow, IRExpr* srcE )
+{
+ IRTemp src = newTemp(Ity_V128);
+ IRTemp res = newTemp(Ity_V128);
+ IROp opSAR = mkVecSARN(sizeNarrow+1);
+ IROp opSHR = mkVecSHRN(sizeNarrow+1);
+ IROp opSHL = mkVecSHLN(sizeNarrow+1);
+ IROp opSxR = zWiden ? opSHR : opSAR;
+ UInt amt = 0;
+ switch (sizeNarrow) {
+ case X10: amt = 32; break;
+ case X01: amt = 16; break;
+ case X00: amt = 8; break;
+ default: vassert(0);
+ }
+ assign(src, srcE);
+ if (fromOdd) {
+ assign(res, binop(opSxR, mkexpr(src), mkU8(amt)));
+ } else {
+ assign(res, binop(opSxR, binop(opSHL, mkexpr(src), mkU8(amt)),
+ mkU8(amt)));
+ }
+ return res;
+}
+
+
+/* Generate IR that takes two V128s and narrows each lane to its lower
+   half, producing a single V128 value (|argHi|'s lanes in the upper
+   half of the result, |argLo|'s in the lower half). */
+static
+IRTemp math_NARROW_LANES ( IRTemp argHi, IRTemp argLo, UInt sizeNarrow )
+{
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, binop(mkVecCATEVENLANES(sizeNarrow),
+ mkexpr(argHi), mkexpr(argLo)));
+ return res;
+}
+
+
/* Let |new64| be a V128 in which only the lower 64 bits are interesting,
and the upper can contain any value -- it is ignored. If |is2| is False,
generate IR to put |new64| in the lower half of vector reg |dd| and zero
@@ -5759,6 +5929,22 @@
}
+/* QCFLAG tracks the SIMD sticky saturation status.  Update it as
+   follows: if |nres| and |qres| hold the same value, leave QCFLAG
+   unchanged.  Otherwise make it nonzero, by ORing in the bits in
+   which they differ, so that it stays nonzero from then on. */
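+/* For example, for an 8-bit lane holding 0xF0 in one operand and 0x20
+   in the other, UQADD gives a wrapping sum of 0x10 but a saturating
+   sum of 0xFF; the two results differ in that lane, so nonzero bits
+   are ORed into QCFLAG and it remains nonzero thereafter. */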
+static
+void updateQCFLAGwithDifference ( IRTemp nres, IRTemp qres )
+{
+ IRTemp diff = newTemp(Ity_V128);
+ IRTemp oldQCFLAG = newTemp(Ity_V128);
+ IRTemp newQCFLAG = newTemp(Ity_V128);
+ assign(diff, binop(Iop_XorV128, mkexpr(nres), mkexpr(qres)));
+ assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
+ assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
+ stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
+}
+
+
static
Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn)
{
@@ -5909,6 +6095,41 @@
UInt nn = INSN(9,5);
UInt dd = INSN(4,0);
+ if (opcode == BITS5(0,0,0,1,1)) {
+ /* -------- 0,xx,00011 SADDLV -------- */
+ /* -------- 1,xx,00011 UADDLV -------- */
+ /* size is the narrow size */
+ if (size == X11 || (size == X10 && bitQ == 0)) return False;
+ Bool isU = bitU == 1;
+ IRTemp src = newTemp(Ity_V128);
+ assign(src, getQReg128(nn));
+      /* The basic plan: widen the lower half and, if Q = 1, the upper
+         half too, adding the two widened halves together.  In either
+         case, fold the result with add at twice the lane width. */
+ IRExpr* widened
+ = mkexpr(math_WIDEN_LO_OR_HI_LANES(
+ isU, False/*!fromUpperHalf*/, size, mkexpr(src)));
+ if (bitQ == 1) {
+ widened
+ = binop(mkVecADD(size+1),
+ widened,
+ mkexpr(math_WIDEN_LO_OR_HI_LANES(
+ isU, True/*fromUpperHalf*/, size, mkexpr(src)))
+ );
+ }
+ /* Now fold. */
+ IRTemp tWi = newTemp(Ity_V128);
+ assign(tWi, widened);
+ IRTemp res = math_FOLDV(tWi, mkVecADD(size+1));
+ putQReg128(dd, mkexpr(res));
+ const HChar* arr = nameArr_Q_SZ(bitQ, size);
+      const HChar ch = "bhsd"[size+1]; /* dd is a scalar of the widened width */
+ DIP("%s %s.%c, %s.%s\n", isU ? "uaddlv" : "saddlv",
+ nameQReg128(dd), ch, nameQReg128(nn), arr);
+ return True;
+ }
+
UInt ix = 0;
/**/ if (opcode == BITS5(0,1,0,1,0)) { ix = bitU == 0 ? 1 : 2; }
else if (opcode == BITS5(1,1,0,1,0)) { ix = bitU == 0 ? 3 : 4; }
@@ -7014,8 +7235,8 @@
vassert(size <= 2);
Bool isU = bitU == 1;
Bool isADD = opcode == BITS4(0,0,0,0);
- IRTemp argL = math_WIDEN_LANES(isU, is2, size, getQReg128(nn));
- IRTemp argR = math_WIDEN_LANES(isU, is2, size, getQReg128(mm));
+ IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
+ IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
IRTemp res = newTemp(Ity_V128);
assign(res, binop(isADD ? opADD[size] : opSUB[size],
mkexpr(argL), mkexpr(argR)));
@@ -7030,6 +7251,31 @@
return True;
}
+ if (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,0,1,1)) {
+ /* -------- 0,0001 SADDW{2} -------- */
+ /* -------- 1,0001 UADDW{2} -------- */
+ /* -------- 0,0011 SSUBW{2} -------- */
+ /* -------- 1,0011 USUBW{2} -------- */
+ /* Widens, and size refers to the narrowed lanes. */
+ if (size == X11) return False;
+ vassert(size <= 2);
+ Bool isU = bitU == 1;
+ Bool isADD = opcode == BITS4(0,0,0,1);
+ IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
+ getQReg128(nn), mkexpr(argR)));
+ putQReg128(dd, mkexpr(res));
+ const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+ const HChar* arrWide = nameArr_Q_SZ(1, size+1);
+ const HChar* nm = isADD ? (isU ? "uaddw" : "saddw")
+ : (isU ? "usubw" : "ssubw");
+ DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
+ nameQReg128(dd), arrWide,
+ nameQReg128(nn), arrWide, nameQReg128(mm), arrNarrow);
+ return True;
+ }
+
if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
/* -------- 0,0100 ADDHN{2} -------- */
/* -------- 1,0100 RADDHN{2} -------- */
@@ -7094,8 +7340,8 @@
vassert(size <= 2);
Bool isU = bitU == 1;
Bool isACC = opcode == BITS4(0,1,0,1);
- IRTemp argL = math_WIDEN_LANES(isU, is2, size, getQReg128(nn));
- IRTemp argR = math_WIDEN_LANES(isU, is2, size, getQReg128(mm));
+ IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
+ IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
IRTemp abd = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR));
IRTemp res = newTemp(Ity_V128);
assign(res, isACC ? binop(opADD[size], mkexpr(abd), getQReg128(dd))
@@ -7197,6 +7443,85 @@
UInt dd = INSN(4,0);
vassert(size < 4);
+ if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,1,0,0)) {
+ /* -------- 0,xx,00000 SHADD std6_std6_std6 -------- */
+ /* -------- 1,xx,00000 UHADD std6_std6_std6 -------- */
+ /* -------- 0,xx,00100 SHSUB std6_std6_std6 -------- */
+ /* -------- 1,xx,00100 UHSUB std6_std6_std6 -------- */
+ if (size == X11) return False;
+ Bool isADD = opcode == BITS5(0,0,0,0,0);
+ Bool isU = bitU == 1;
+ /* Widen both args out, do the math, narrow to final result. */
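+      /* For example, UHADD on 8-bit lanes 0xFF and 0x01: widened,
+         0x00FF + 0x0001 = 0x0100; a logical shift right by 1 gives
+         0x0080; narrowing gives 0x80.  The double-width intermediate
+         means the halving add/sub can never wrap. */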
+ IRTemp argL = newTemp(Ity_V128);
+ IRTemp argLhi = IRTemp_INVALID;
+ IRTemp argLlo = IRTemp_INVALID;
+ IRTemp argR = newTemp(Ity_V128);
+ IRTemp argRhi = IRTemp_INVALID;
+ IRTemp argRlo = IRTemp_INVALID;
+ IRTemp resHi = newTemp(Ity_V128);
+ IRTemp resLo = newTemp(Ity_V128);
+ IRTemp res = IRTemp_INVALID;
+ assign(argL, getQReg128(nn));
+ argLlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argL));
+ argLhi = math_WIDEN_LO_OR_HI_LANES(isU, True, size, mkexpr(argL));
+ assign(argR, getQReg128(mm));
+ argRlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argR));
+ argRhi = math_WIDEN_LO_OR_HI_LANES(isU, True, size, mkexpr(argR));
+ IROp opADDSUB = isADD ? mkVecADD(size+1) : mkVecSUB(size+1);
+ IROp opSxR = isU ? mkVecSHRN(size+1) : mkVecSARN(size+1);
+ assign(resHi, binop(opSxR,
+ binop(opADDSUB, mkexpr(argLhi), mkexpr(argRhi)),
+ mkU8(1)));
+ assign(resLo, binop(opSxR,
+ binop(opADDSUB, mkexpr(argLlo), mkexpr(argRlo)),
+ mkU8(1)));
+ res = math_NARROW_LANES ( resHi, resLo, size );
+ putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
+ const HChar* nm = isADD ? (isU ? "uhadd" : "shadd")
+ : (isU ? "uhsub" : "shsub");
+ const HChar* arr = nameArr_Q_SZ(bitQ, size);
+ DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
+ nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+ return True;
+ }
+
+ if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
+ /* -------- 0,xx,00001 SQADD std7_std7_std7 -------- */
+ /* -------- 1,xx,00001 UQADD std7_std7_std7 -------- */
+ /* -------- 0,xx,00101 SQSUB std7_std7_std7 -------- */
+ /* -------- 1,xx,00101 UQSUB std7_std7_std7 -------- */
+ if (bitQ == 0 && size == X11) return False; // implied 1d case
+ Bool isADD = opcode == BITS5(0,0,0,0,1);
+ Bool isU = bitU == 1;
+ IROp qop = Iop_INVALID;
+ IROp nop = Iop_INVALID;
+ if (isADD) {
+ qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
+ nop = mkVecADD(size);
+ } else {
+ qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
+ nop = mkVecSUB(size);
+ }
+ IRTemp argL = newTemp(Ity_V128);
+ IRTemp argR = newTemp(Ity_V128);
+ IRTemp qres = newTemp(Ity_V128);
+ IRTemp nres = newTemp(Ity_V128);
+ assign(argL, getQReg128(nn));
+ assign(argR, getQReg128(mm));
+ assign(qres, math_MAYBE_ZERO_HI64_fromE(
+ bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
+ assign(nres, math_MAYBE_ZERO_HI64_fromE(
+ bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
+ putQReg128(dd, mkexpr(qres));
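+      /* |qres| is the architectural result; |nres|, the wrapping
+         result, exists only so that the following call can detect,
+         per lane, whether saturation occurred. */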
+ updateQCFLAGwithDifference(nres, qres);
+ const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
+ : (isU ? "uqsub" : "sqsub");
+ const HChar* arr = nameArr_Q_SZ(bitQ, size);
+ DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
+ nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+ return True;
+ }
+
if (bitU == 0 && opcode == BITS5(0,0,0,1,1)) {
/* -------- 0,00,00011 AND 16b_16b_16b, 8b_8b_8b -------- */
/* -------- 0,01,00011 BIC 16b_16b_16b, 8b_8b_8b -------- */
@@ -7433,39 +7758,6 @@
return False;
}
- if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
- if (bitQ == 0 && size == X11) return False; // implied 1d case
- /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
- const IROp opsADD[4]
- = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
- const IROp opsCEV[4]
- = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8, Iop_CatEvenLanes32x4,
- Iop_InterleaveLO64x2 };
- const IROp opsCOD[4]
- = { Iop_CatOddLanes8x16, Iop_CatOddLanes16x8, Iop_CatOddLanes32x4,
- Iop_InterleaveHI64x2 };
- IRTemp vN = newTemp(Ity_V128);
- IRTemp vM = newTemp(Ity_V128);
- assign(vN, getQReg128(nn));
- assign(vM, getQReg128(mm));
- IRTemp res128 = newTemp(Ity_V128);
- assign(res128, binop(opsADD[size],
- binop(opsCEV[size], mkexpr(vM), mkexpr(vN)),
- binop(opsCOD[size], mkexpr(vM), mkexpr(vN))));
- /* In the half-width case, use CatEL32x4 to extract the half-width
- result from the full-width result. */
- IRExpr* res
- = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
- binop(Iop_CatEvenLanes32x4, mkexpr(res128),
- mkexpr(res128)))
- : mkexpr(res128);
- putQReg128(dd, res);
- const HChar* arr = nameArr_Q_SZ(bitQ, size);
- DIP("addp %s.%s, %s.%s, %s.%s\n",
- nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
- return True;
- }
-
if (opcode == BITS5(1,0,0,1,1)) {
/* -------- 0,xx,10011 MUL std7_std7_std7 -------- */
/* -------- 1,xx,10011 PMUL 16b_16b_16b, 8b_8b_8b -------- */
@@ -7488,6 +7780,67 @@
return False;
}
+ if (opcode == BITS5(1,0,1,0,0) || opcode == BITS5(1,0,1,0,1)) {
+ /* -------- 0,xx,10100 SMAXP std6_std6_std6 -------- */
+ /* -------- 1,xx,10100 UMAXP std6_std6_std6 -------- */
+ /* -------- 0,xx,10101 SMINP std6_std6_std6 -------- */
+ /* -------- 1,xx,10101 UMINP std6_std6_std6 -------- */
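+      /* The pairwise trick: concatenating the even lanes of Vm:Vn and
+         the odd lanes of Vm:Vn lines each element up against its pair
+         partner, so a single full-width max/min computes all the
+         pairwise results at once. */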
+ if (size == X11) return False;
+ Bool isU = bitU == 1;
+ Bool isMAX = opcode == BITS5(1,0,1,0,0);
+ IRTemp vN = newTemp(Ity_V128);
+ IRTemp vM = newTemp(Ity_V128);
+ IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
+ : (isU ? mkVecMINU(size) : mkVecMINS(size));
+ assign(vN, getQReg128(nn));
+ assign(vM, getQReg128(mm));
+ IRTemp res128 = newTemp(Ity_V128);
+ assign(res128,
+ binop(op,
+ binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
+ binop(mkVecCATODDLANES(size), mkexpr(vM), mkexpr(vN))));
+ /* In the half-width case, use CatEL32x4 to extract the half-width
+ result from the full-width result. */
+ IRExpr* res
+ = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
+ binop(Iop_CatEvenLanes32x4, mkexpr(res128),
+ mkexpr(res128)))
+ : mkexpr(res128);
+ putQReg128(dd, res);
+ const HChar* arr = nameArr_Q_SZ(bitQ, size);
+ const HChar* nm = isMAX ? (isU ? "umaxp" : "smaxp")
+ : (isU ? "uminp" : "sminp");
+ DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
+ nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+ return True;
+ }
+
+ if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
+ /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
+ if (bitQ == 0 && size == X11) return False; // implied 1d case
+ IRTemp vN = newTemp(Ity_V128);
+ IRTemp vM = newTemp(Ity_V128);
+ assign(vN, getQReg128(nn));
+ assign(vM, getQReg128(mm));
+ IRTemp res128 = newTemp(Ity_V128);
+ assign(res128,
+ binop(mkVecADD(size),
+ binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
+ binop(mkVecCATODDLANES(size), mkexpr(vM), mkexpr(vN))));
+ /* In the half-width case, use CatEL32x4 to extract the half-width
+ result from the full-width result. */
+ IRExpr* res
+ = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
+ binop(Iop_CatEvenLanes32x4, mkexpr(res128),
+ mkexpr(res128)))
+ : mkexpr(res128);
+ putQReg128(dd, res);
+ const HChar* arr = nameArr_Q_SZ(bitQ, size);
+ DIP("addp %s.%s, %s.%s, %s.%s\n",
+ nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+ return True;
+ }
+
if (bitU == 0 && opcode == BITS5(1,1,0,0,1)) {
/* -------- 0,0x,11001 FMLA 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
/* -------- 0,1x,11001 FMLS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
@@ -7705,6 +8058,36 @@
return True;
}
+ if (opcode == BITS5(0,0,0,1,0) || opcode == BITS5(0,0,1,1,0)) {
+ /* -------- 0,xx,00010: SADDLP std6_std6 -------- */
+ /* -------- 1,xx,00010: UADDLP std6_std6 -------- */
+ /* -------- 0,xx,00110: SADALP std6_std6 -------- */
+ /* -------- 1,xx,00110: UADALP std6_std6 -------- */
+ /* Widens, and size refers to the narrow size. */
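+      /* The pairwise long add is formed as the sum of the even and
+         the odd narrow lanes, each widened (signed or unsigned) to
+         twice the lane width. */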
+ if (size == X11) return False; // no 1d or 2d cases
+ Bool isU = bitU == 1;
+ Bool isACC = opcode == BITS5(0,0,1,1,0);
+ IRTemp src = newTemp(Ity_V128);
+ IRTemp sum = newTemp(Ity_V128);
+ IRTemp res = newTemp(Ity_V128);
+ assign(src, getQReg128(nn));
+ assign(sum,
+ binop(mkVecADD(size+1),
+ mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
+ isU, True/*fromOdd*/, size, mkexpr(src))),
+ mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
+ isU, False/*!fromOdd*/, size, mkexpr(src)))));
+ assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd))
+ : mkexpr(sum));
+ putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
+ const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+ const HChar* arrWide = nameArr_Q_SZ(bitQ, size+1);
+ DIP("%s %s.%s, %s.%s\n", isACC ? (isU ? "uadalp" : "sadalp")
+ : (isU ? "uaddlp" : "saddlp"),
+ nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
+ return True;
+ }
+
if (opcode == BITS5(0,0,1,0,0)) {
/* -------- 0,xx,00100: CLS std6_std6 -------- */
/* -------- 1,xx,00100: CLZ std6_std6 -------- */
diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c
index d964f9b..0b3bdb2 100644
--- a/priv/host_arm64_defs.c
+++ b/priv/host_arm64_defs.c
@@ -929,6 +929,22 @@
case ARM64vecb_SMULL2DSS: *nm = "smull"; *ar = "2dss"; return;
case ARM64vecb_SMULL4SHH: *nm = "smull"; *ar = "4shh"; return;
case ARM64vecb_SMULL8HBB: *nm = "smull"; *ar = "8hbb"; return;
+ case ARM64vecb_SQADD64x2: *nm = "sqadd"; *ar = "2d"; return;
+ case ARM64vecb_SQADD32x4: *nm = "sqadd"; *ar = "4s"; return;
+ case ARM64vecb_SQADD16x8: *nm = "sqadd"; *ar = "8h"; return;
+ case ARM64vecb_SQADD8x16: *nm = "sqadd"; *ar = "16b"; return;
+ case ARM64vecb_UQADD64x2: *nm = "uqadd"; *ar = "2d"; return;
+ case ARM64vecb_UQADD32x4: *nm = "uqadd"; *ar = "4s"; return;
+ case ARM64vecb_UQADD16x8: *nm = "uqadd"; *ar = "8h"; return;
+ case ARM64vecb_UQADD8x16: *nm = "uqadd"; *ar = "16b"; return;
+ case ARM64vecb_SQSUB64x2: *nm = "sqsub"; *ar = "2d"; return;
+ case ARM64vecb_SQSUB32x4: *nm = "sqsub"; *ar = "4s"; return;
+ case ARM64vecb_SQSUB16x8: *nm = "sqsub"; *ar = "8h"; return;
+ case ARM64vecb_SQSUB8x16: *nm = "sqsub"; *ar = "16b"; return;
+ case ARM64vecb_UQSUB64x2: *nm = "uqsub"; *ar = "2d"; return;
+ case ARM64vecb_UQSUB32x4: *nm = "uqsub"; *ar = "4s"; return;
+ case ARM64vecb_UQSUB16x8: *nm = "uqsub"; *ar = "8h"; return;
+ case ARM64vecb_UQSUB8x16: *nm = "uqsub"; *ar = "16b"; return;
default: vpanic("showARM64VecBinOp");
}
}
@@ -3461,12 +3477,14 @@
#define X000000 BITS8(0,0, 0,0,0,0,0,0)
#define X000001 BITS8(0,0, 0,0,0,0,0,1)
#define X000010 BITS8(0,0, 0,0,0,0,1,0)
+#define X000011 BITS8(0,0, 0,0,0,0,1,1)
#define X000100 BITS8(0,0, 0,0,0,1,0,0)
#define X000110 BITS8(0,0, 0,0,0,1,1,0)
#define X000111 BITS8(0,0, 0,0,0,1,1,1)
#define X001000 BITS8(0,0, 0,0,1,0,0,0)
#define X001001 BITS8(0,0, 0,0,1,0,0,1)
#define X001010 BITS8(0,0, 0,0,1,0,1,0)
+#define X001011 BITS8(0,0, 0,0,1,0,1,1)
#define X001101 BITS8(0,0, 0,0,1,1,0,1)
#define X001110 BITS8(0,0, 0,0,1,1,1,0)
#define X001111 BITS8(0,0, 0,0,1,1,1,1)
@@ -5151,6 +5169,26 @@
000 01110 10 1 m 110000 n d SMULL Vd.2d, Vn.2s, Vm.2s
000 01110 01 1 m 110000 n d SMULL Vd.4s, Vn.4h, Vm.4h
000 01110 00 1 m 110000 n d SMULL Vd.8h, Vn.8b, Vm.8b
+
+ 010 01110 11 1 m 000011 n d SQADD Vd.2d, Vn.2d, Vm.2d
+ 010 01110 10 1 m 000011 n d SQADD Vd.4s, Vn.4s, Vm.4s
+ 010 01110 01 1 m 000011 n d SQADD Vd.8h, Vn.8h, Vm.8h
+ 010 01110 00 1 m 000011 n d SQADD Vd.16b, Vn.16b, Vm.16b
+
+ 011 01110 11 1 m 000011 n d UQADD Vd.2d, Vn.2d, Vm.2d
+ 011 01110 10 1 m 000011 n d UQADD Vd.4s, Vn.4s, Vm.4s
+ 011 01110 01 1 m 000011 n d UQADD Vd.8h, Vn.8h, Vm.8h
+ 011 01110 00 1 m 000011 n d UQADD Vd.16b, Vn.16b, Vm.16b
+
+ 010 01110 11 1 m 001011 n d SQSUB Vd.2d, Vn.2d, Vm.2d
+ 010 01110 10 1 m 001011 n d SQSUB Vd.4s, Vn.4s, Vm.4s
+ 010 01110 01 1 m 001011 n d SQSUB Vd.8h, Vn.8h, Vm.8h
+ 010 01110 00 1 m 001011 n d SQSUB Vd.16b, Vn.16b, Vm.16b
+
+ 011 01110 11 1 m 001011 n d UQSUB Vd.2d, Vn.2d, Vm.2d
+ 011 01110 10 1 m 001011 n d UQSUB Vd.4s, Vn.4s, Vm.4s
+ 011 01110 01 1 m 001011 n d UQSUB Vd.8h, Vn.8h, Vm.8h
+ 011 01110 00 1 m 001011 n d UQSUB Vd.16b, Vn.16b, Vm.16b
*/
UInt vD = qregNo(i->ARM64in.VBinV.dst);
UInt vN = qregNo(i->ARM64in.VBinV.argL);
@@ -5402,6 +5440,58 @@
*p++ = X_3_8_5_6_5_5(X000, X01110001, vM, X110000, vN, vD);
break;
+ case ARM64vecb_SQADD64x2:
+ *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X000011, vN, vD);
+ break;
+ case ARM64vecb_SQADD32x4:
+ *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X000011, vN, vD);
+ break;
+ case ARM64vecb_SQADD16x8:
+ *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X000011, vN, vD);
+ break;
+ case ARM64vecb_SQADD8x16:
+ *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X000011, vN, vD);
+ break;
+
+ case ARM64vecb_UQADD64x2:
+ *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X000011, vN, vD);
+ break;
+ case ARM64vecb_UQADD32x4:
+ *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X000011, vN, vD);
+ break;
+ case ARM64vecb_UQADD16x8:
+ *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X000011, vN, vD);
+ break;
+ case ARM64vecb_UQADD8x16:
+ *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X000011, vN, vD);
+ break;
+
+ case ARM64vecb_SQSUB64x2:
+ *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X001011, vN, vD);
+ break;
+ case ARM64vecb_SQSUB32x4:
+ *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X001011, vN, vD);
+ break;
+ case ARM64vecb_SQSUB16x8:
+ *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X001011, vN, vD);
+ break;
+ case ARM64vecb_SQSUB8x16:
+ *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X001011, vN, vD);
+ break;
+
+ case ARM64vecb_UQSUB64x2:
+ *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X001011, vN, vD);
+ break;
+ case ARM64vecb_UQSUB32x4:
+ *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X001011, vN, vD);
+ break;
+ case ARM64vecb_UQSUB16x8:
+ *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X001011, vN, vD);
+ break;
+ case ARM64vecb_UQSUB8x16:
+ *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X001011, vN, vD);
+ break;
+
default:
goto bad;
}
diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h
index 1f7c10f..3795c27 100644
--- a/priv/host_arm64_defs.h
+++ b/priv/host_arm64_defs.h
@@ -350,6 +350,14 @@
ARM64vecb_UMULL4SHH, ARM64vecb_UMULL8HBB,
ARM64vecb_SMULL2DSS,
ARM64vecb_SMULL4SHH, ARM64vecb_SMULL8HBB,
+ ARM64vecb_SQADD64x2, ARM64vecb_SQADD32x4,
+ ARM64vecb_SQADD16x8, ARM64vecb_SQADD8x16,
+ ARM64vecb_UQADD64x2, ARM64vecb_UQADD32x4,
+ ARM64vecb_UQADD16x8, ARM64vecb_UQADD8x16,
+ ARM64vecb_SQSUB64x2, ARM64vecb_SQSUB32x4,
+ ARM64vecb_SQSUB16x8, ARM64vecb_SQSUB8x16,
+ ARM64vecb_UQSUB64x2, ARM64vecb_UQSUB32x4,
+ ARM64vecb_UQSUB16x8, ARM64vecb_UQSUB8x16,
ARM64vecb_INVALID
}
ARM64VecBinOp;
diff --git a/priv/host_arm64_isel.c b/priv/host_arm64_isel.c
index b63548b..8720aa9 100644
--- a/priv/host_arm64_isel.c
+++ b/priv/host_arm64_isel.c
@@ -4414,8 +4414,8 @@
case Iop_Neg64Fx2: case Iop_Neg32Fx4:
case Iop_Abs64x2: case Iop_Abs32x4:
case Iop_Abs16x8: case Iop_Abs8x16:
- case Iop_Cls32x4: case Iop_Cls16x8: case Iop_Cls8x16:
- case Iop_Clz32x4: case Iop_Clz16x8: case Iop_Clz8x16:
+ case Iop_Cls32x4: case Iop_Cls16x8: case Iop_Cls8x16:
+ case Iop_Clz32x4: case Iop_Clz16x8: case Iop_Clz8x16:
case Iop_Cnt8x16:
case Iop_Reverse1sIn8_x16:
case Iop_Reverse8sIn16_x8:
@@ -4912,93 +4912,45 @@
addInstr(env, ARM64Instr_VQfromXX(res, argL, argR));
return res;
}
-//ZZ case Iop_AndV128: {
-//ZZ HReg res = newVRegV(env);
-//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
-//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
-//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VAND,
-//ZZ res, argL, argR, 4, True));
-//ZZ return res;
-//ZZ }
-//ZZ case Iop_OrV128: {
-//ZZ HReg res = newVRegV(env);
-//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
-//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
-//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VORR,
-//ZZ res, argL, argR, 4, True));
-//ZZ return res;
-//ZZ }
-//ZZ case Iop_XorV128: {
-//ZZ HReg res = newVRegV(env);
-//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
-//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
-//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VXOR,
-//ZZ res, argL, argR, 4, True));
-//ZZ return res;
-//ZZ }
-//ZZ case Iop_Add8x16:
-//ZZ case Iop_Add16x8:
-//ZZ case Iop_Add32x4:
case Iop_AndV128:
case Iop_OrV128:
case Iop_XorV128:
- case Iop_Max32Ux4:
- case Iop_Max16Ux8:
- case Iop_Max8Ux16:
- case Iop_Min32Ux4:
- case Iop_Min16Ux8:
- case Iop_Min8Ux16:
- case Iop_Max32Sx4:
- case Iop_Max16Sx8:
- case Iop_Max8Sx16:
- case Iop_Min32Sx4:
- case Iop_Min16Sx8:
- case Iop_Min8Sx16:
- case Iop_Add64x2:
- case Iop_Add32x4:
- case Iop_Add16x8:
- case Iop_Add8x16:
- case Iop_Sub64x2:
- case Iop_Sub32x4:
- case Iop_Sub16x8:
- case Iop_Sub8x16:
- case Iop_Mul32x4:
- case Iop_Mul16x8:
- case Iop_Mul8x16:
- case Iop_CmpEQ64x2:
- case Iop_CmpEQ32x4:
- case Iop_CmpEQ16x8:
- case Iop_CmpEQ8x16:
- case Iop_CmpGT64Ux2:
- case Iop_CmpGT32Ux4:
- case Iop_CmpGT16Ux8:
- case Iop_CmpGT8Ux16:
- case Iop_CmpGT64Sx2:
- case Iop_CmpGT32Sx4:
- case Iop_CmpGT16Sx8:
- case Iop_CmpGT8Sx16:
- case Iop_CmpEQ64Fx2:
- case Iop_CmpEQ32Fx4:
- case Iop_CmpLE64Fx2:
- case Iop_CmpLE32Fx4:
- case Iop_CmpLT64Fx2:
- case Iop_CmpLT32Fx4:
+ case Iop_Max32Ux4: case Iop_Max16Ux8: case Iop_Max8Ux16:
+ case Iop_Min32Ux4: case Iop_Min16Ux8: case Iop_Min8Ux16:
+ case Iop_Max32Sx4: case Iop_Max16Sx8: case Iop_Max8Sx16:
+ case Iop_Min32Sx4: case Iop_Min16Sx8: case Iop_Min8Sx16:
+ case Iop_Add64x2: case Iop_Add32x4:
+ case Iop_Add16x8: case Iop_Add8x16:
+ case Iop_Sub64x2: case Iop_Sub32x4:
+ case Iop_Sub16x8: case Iop_Sub8x16:
+ case Iop_Mul32x4: case Iop_Mul16x8: case Iop_Mul8x16:
+ case Iop_CmpEQ64x2: case Iop_CmpEQ32x4:
+ case Iop_CmpEQ16x8: case Iop_CmpEQ8x16:
+ case Iop_CmpGT64Ux2: case Iop_CmpGT32Ux4:
+ case Iop_CmpGT16Ux8: case Iop_CmpGT8Ux16:
+ case Iop_CmpGT64Sx2: case Iop_CmpGT32Sx4:
+ case Iop_CmpGT16Sx8: case Iop_CmpGT8Sx16:
+ case Iop_CmpEQ64Fx2: case Iop_CmpEQ32Fx4:
+ case Iop_CmpLE64Fx2: case Iop_CmpLE32Fx4:
+ case Iop_CmpLT64Fx2: case Iop_CmpLT32Fx4:
case Iop_Perm8x16:
- case Iop_InterleaveLO64x2:
- case Iop_CatEvenLanes32x4:
- case Iop_CatEvenLanes16x8:
- case Iop_CatEvenLanes8x16:
- case Iop_InterleaveHI64x2:
- case Iop_CatOddLanes32x4:
- case Iop_CatOddLanes16x8:
- case Iop_CatOddLanes8x16:
+ case Iop_InterleaveLO64x2: case Iop_CatEvenLanes32x4:
+ case Iop_CatEvenLanes16x8: case Iop_CatEvenLanes8x16:
+ case Iop_InterleaveHI64x2: case Iop_CatOddLanes32x4:
+ case Iop_CatOddLanes16x8: case Iop_CatOddLanes8x16:
case Iop_InterleaveHI32x4:
- case Iop_InterleaveHI16x8:
- case Iop_InterleaveHI8x16:
+ case Iop_InterleaveHI16x8: case Iop_InterleaveHI8x16:
case Iop_InterleaveLO32x4:
- case Iop_InterleaveLO16x8:
- case Iop_InterleaveLO8x16:
+ case Iop_InterleaveLO16x8: case Iop_InterleaveLO8x16:
case Iop_PolynomialMul8x16:
+ case Iop_QAdd64Sx2: case Iop_QAdd32Sx4:
+ case Iop_QAdd16Sx8: case Iop_QAdd8Sx16:
+ case Iop_QAdd64Ux2: case Iop_QAdd32Ux4:
+ case Iop_QAdd16Ux8: case Iop_QAdd8Ux16:
+ case Iop_QSub64Sx2: case Iop_QSub32Sx4:
+ case Iop_QSub16Sx8: case Iop_QSub8Sx16:
+ case Iop_QSub64Ux2: case Iop_QSub32Ux4:
+ case Iop_QSub16Ux8: case Iop_QSub8Ux16:
{
HReg res = newVRegV(env);
HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
@@ -5080,6 +5032,22 @@
case Iop_InterleaveLO8x16: op = ARM64vecb_ZIP18x16; sw = True;
break;
case Iop_PolynomialMul8x16: op = ARM64vecb_PMUL8x16; break;
+ case Iop_QAdd64Sx2: op = ARM64vecb_SQADD64x2; break;
+ case Iop_QAdd32Sx4: op = ARM64vecb_SQADD32x4; break;
+ case Iop_QAdd16Sx8: op = ARM64vecb_SQADD16x8; break;
+ case Iop_QAdd8Sx16: op = ARM64vecb_SQADD8x16; break;
+ case Iop_QAdd64Ux2: op = ARM64vecb_UQADD64x2; break;
+ case Iop_QAdd32Ux4: op = ARM64vecb_UQADD32x4; break;
+ case Iop_QAdd16Ux8: op = ARM64vecb_UQADD16x8; break;
+ case Iop_QAdd8Ux16: op = ARM64vecb_UQADD8x16; break;
+ case Iop_QSub64Sx2: op = ARM64vecb_SQSUB64x2; break;
+ case Iop_QSub32Sx4: op = ARM64vecb_SQSUB32x4; break;
+ case Iop_QSub16Sx8: op = ARM64vecb_SQSUB16x8; break;
+ case Iop_QSub8Sx16: op = ARM64vecb_SQSUB8x16; break;
+ case Iop_QSub64Ux2: op = ARM64vecb_UQSUB64x2; break;
+ case Iop_QSub32Ux4: op = ARM64vecb_UQSUB32x4; break;
+ case Iop_QSub16Ux8: op = ARM64vecb_UQSUB16x8; break;
+ case Iop_QSub8Ux16: op = ARM64vecb_UQSUB8x16; break;
default: vassert(0);
}
if (sw) {
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index b81a96e..27ccf8f 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -2818,14 +2818,14 @@
case Iop_Add8x16: case Iop_Add16x8:
case Iop_Add32x4: case Iop_Add64x2:
case Iop_QAdd8Ux16: case Iop_QAdd16Ux8:
- case Iop_QAdd32Ux4: //case Iop_QAdd64Ux2:
+ case Iop_QAdd32Ux4: case Iop_QAdd64Ux2:
case Iop_QAdd8Sx16: case Iop_QAdd16Sx8:
case Iop_QAdd32Sx4: case Iop_QAdd64Sx2:
case Iop_PwAdd8x16: case Iop_PwAdd16x8: case Iop_PwAdd32x4:
case Iop_Sub8x16: case Iop_Sub16x8:
case Iop_Sub32x4: case Iop_Sub64x2:
case Iop_QSub8Ux16: case Iop_QSub16Ux8:
- case Iop_QSub32Ux4: //case Iop_QSub64Ux2:
+ case Iop_QSub32Ux4: case Iop_QSub64Ux2:
case Iop_QSub8Sx16: case Iop_QSub16Sx8:
case Iop_QSub32Sx4: case Iop_QSub64Sx2:
case Iop_Mul8x16: case Iop_Mul16x8: case Iop_Mul32x4: