arm64: implement: abs d_d, neg d_d, abs std7_std7, addhn{2}, subhn{2}, raddhn{2}, rsubhn{2}
git-svn-id: svn://svn.valgrind.org/vex/trunk@2877 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index 71e20aa..141b456 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -273,6 +273,12 @@
return IRExpr_Const(IRConst_U32(i));
}
+static IRExpr* mkU16 ( UInt i )
+{
+ vassert(i < 65536);
+ return IRExpr_Const(IRConst_U16(i));
+}
+
static IRExpr* mkU8 ( UInt i )
{
vassert(i < 256);
@@ -3183,7 +3189,7 @@
{
vassert(bitQ <= 1 && size <= 3);
const HChar* nms[8]
- = { "2d", "4s", "8h", "16b", "1d", "2s", "4h", "8b" };
+ = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" };
UInt ix = (bitQ << 2) | size;
vassert(ix < 8);
return nms[ix];
@@ -5516,6 +5522,36 @@
}
+/* Let |new64| be a V128 in which only the lower 64 bits are interesting,
+ and the upper half may contain any value -- it is ignored. If |is2| is False,
+ generate IR to put |new64| in the lower half of vector reg |dd| and zero
+ the upper half. If |is2| is True, generate IR to put |new64| in the upper
+ half of vector reg |dd| and leave the lower half unchanged. This
+ simulates the behaviour of the "foo/foo2" instructions in which the
+ destination is half the width of the sources, for example addhn/addhn2.
+*/
+static
+void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 )
+{
+ if (is2) {
+ /* Zero the upper half of the old contents of Vdd, move the low
+ 64 bits of |new64| into the upper half, and OR the two together. */
+ IRTemp t_zero_oldLO = newTemp(Ity_V128);
+ assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
+ IRTemp t_newHI_zero = newTemp(Ity_V128);
+ assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64),
+ mkV128(0x0000)));
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO),
+ mkexpr(t_newHI_zero)));
+ putQReg128(dd, mkexpr(res));
+ } else {
+ /* The simple case: |new64| in the lower half, upper half zeroed. */
+ putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64)));
+ }
+}
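+
+/* Example (a sketch, not generated code): if the lower 64 bits of
+ |new64| are 0x1122334455667788, then is2 == False leaves Vdd holding
+ 0x0000000000000000:1122334455667788, while is2 == True leaves Vdd
+ holding 0x1122334455667788:<old lower half>. */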
+
+
static
Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn)
{
@@ -6153,6 +6189,22 @@
return True;
}
+ if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
+ /* -------- 0,11,01011 ABS d_d -------- */
+ putQReg128(dd, unop(Iop_ZeroHI64ofV128,
+ unop(Iop_Abs64x2, getQReg128(nn))));
+ DIP("abs d%u, d%u\n", dd, nn);
+ return True;
+ }
+
+ if (bitU == 1 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
+ /* -------- 1,11,01011 NEG d_d -------- */
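+ /* Synthesise NEG as 0 - x, using Iop_Sub64x2 against a zero vector. */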
+ putQReg128(dd, unop(Iop_ZeroHI64ofV128,
+ binop(Iop_Sub64x2, mkV128(0x0000), getQReg128(nn))));
+ DIP("neg d%u, d%u\n", dd, nn);
+ return True;
+ }
+
# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
return False;
# undef INSN
@@ -6338,7 +6390,80 @@
static
Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
{
+ /* 31 30 29 28 23 21 20 15 11 9 4
+ 0 Q U 01110 size 1 m opcode 00 n d
+ Decode fields: u,opcode
+ */
# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
+ if (INSN(31,31) != 0
+ || INSN(28,24) != BITS5(0,1,1,1,0)
+ || INSN(21,21) != 1
+ || INSN(11,10) != BITS2(0,0)) {
+ return False;
+ }
+ UInt bitQ = INSN(30,30);
+ UInt bitU = INSN(29,29);
+ UInt size = INSN(23,22);
+ UInt mm = INSN(20,16);
+ UInt opcode = INSN(15,12);
+ UInt nn = INSN(9,5);
+ UInt dd = INSN(4,0);
+ vassert(size < 4);
+ Bool is2 = bitQ == 1;
+
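+ /* For the narrowing ops below, bitQ selects between the base form,
+ which writes the lower half of Vd and zeroes the upper half, and the
+ "2" form, which writes only the upper half. */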
+ if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
+ /* -------- 0,0100 ADDHN{2} -------- */
+ /* -------- 1,0100 RADDHN{2} -------- */
+ /* -------- 0,0110 SUBHN{2} -------- */
+ /* -------- 1,0110 RSUBHN{2} -------- */
+ /* Narrows, and size refers to the narrowed lanes. */
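+ /* For example, "addhn v0.8b, v1.8h, v2.8h" (size == X00) adds the
+ 16-bit lanes of v1 and v2 and keeps the top 8 bits of each sum. */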
+ if (size == X11) return False;
+ vassert(size <= 2);
+ const IROp opADD[3] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
+ const IROp opSUB[3] = { Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
+ const IROp opSHR[3] = { Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
+ const UInt shift[3] = { 8, 16, 32 };
+ const IROp opCAT[3] = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
+ Iop_CatEvenLanes32x4 };
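+ /* The tables are indexed by the narrow-lane size; opADD, opSUB and
+ opSHR act on the wide (2x) lanes, and shift[] is the number of bits
+ discarded from each wide result. */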
+ Bool isADD = opcode == BITS4(0,1,0,0);
+ Bool isR = bitU == 1;
+ /* Combined elements in wide lanes */
+ IRTemp wide = newTemp(Ity_V128);
+ IRExpr* wideE = binop(isADD ? opADD[size] : opSUB[size],
+ getQReg128(nn), getQReg128(mm));
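+ /* The rounding forms add 1 << (shift-1) -- half the weight of the
+ bits about to be discarded -- to each wide lane before the shift,
+ giving round-to-nearest. */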
+ if (isR) {
+ IRType ty = Ity_INVALID;
+ IRTemp rcS = IRTemp_INVALID;
+ switch (size) {
+ case X00: ty = Ity_I16;
+ rcS = newTemp(ty); assign(rcS, mkU16(0x80)); break;
+ case X01: ty = Ity_I32;
+ rcS = newTemp(ty); assign(rcS, mkU32(0x8000)); break;
+ case X10: ty = Ity_I64;
+ rcS = newTemp(ty); assign(rcS, mkU64(0x80000000)); break;
+ default: vassert(0);
+ }
+ IRTemp rcV = math_DUP_TO_V128(rcS, ty);
+ wideE = binop(opADD[size], wideE, mkexpr(rcV));
+ }
+ assign(wide, wideE);
+ /* Top halves of elements, still in wide lanes */
+ IRTemp shrd = newTemp(Ity_V128);
+ assign(shrd, binop(opSHR[size], mkexpr(wide), mkU8(shift[size])));
+ /* Elements now compacted into lower 64 bits */
+ IRTemp new64 = newTemp(Ity_V128);
+ assign(new64, binop(opCAT[size], mkexpr(shrd), mkexpr(shrd)));
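+ /* CatEvenLanes duplicates the narrowed lanes into both 64-bit halves
+ of |new64|; only the lower half is used. */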
+ putLO64andZUorPutHI64(is2, dd, new64);
+ const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+ const HChar* arrWide = nameArr_Q_SZ(1, size+1);
+ const HChar* nm = isADD ? (isR ? "raddhn" : "addhn")
+ : (isR ? "rsubhn" : "subhn");
+ DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
+ nameQReg128(dd), arrNarrow,
+ nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
+ return True;
+ }
+
return False;
# undef INSN
}
@@ -6858,6 +6983,20 @@
return True;
}
+ if (bitU == 0 && opcode == BITS5(0,1,0,1,1)) {
+ /* -------- 0,xx,01011: ABS std7_std7 -------- */
+ if (bitQ == 0 && size == X11) return False; // implied 1d case
+ const IROp opABS[4]
+ = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 };
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, unop(opABS[size], getQReg128(nn)));
+ putQReg128(dd, bitQ == 0 ? unop(Iop_ZeroHI64ofV128, mkexpr(res))
+ : mkexpr(res));
+ const HChar* arr = nameArr_Q_SZ(bitQ, size);
+ DIP("abs %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
+ return True;
+ }
+
if (bitU == 1 && opcode == BITS5(0,1,0,1,1)) {
/* -------- 1,xx,01011: NEG std7_std7 -------- */
if (bitQ == 0 && size == X11) return False; // implied 1d case
diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c
index 29f78db..5c5988a 100644
--- a/priv/host_arm64_defs.c
+++ b/priv/host_arm64_defs.c
@@ -929,6 +929,10 @@
case ARM64vecu_FNEG32x4: *nm = "fneg "; *ar = "4s"; return;
case ARM64vecu_FABS64x2: *nm = "fabs "; *ar = "2d"; return;
case ARM64vecu_FABS32x4: *nm = "fabs "; *ar = "4s"; return;
+ case ARM64vecu_ABS64x2: *nm = "abs "; *ar = "2d"; return;
+ case ARM64vecu_ABS32x4: *nm = "abs "; *ar = "4s"; return;
+ case ARM64vecu_ABS16x8: *nm = "abs "; *ar = "8h"; return;
+ case ARM64vecu_ABS8x16: *nm = "abs "; *ar = "16b"; return;
case ARM64vecu_NOT: *nm = "not "; *ar = "all"; return;
default: vpanic("showARM64VecUnaryOp");
}
@@ -3422,6 +3426,7 @@
#define X100101 BITS8(0,0, 1,0,0,1,0,1)
#define X100110 BITS8(0,0, 1,0,0,1,1,0)
#define X100111 BITS8(0,0, 1,0,0,1,1,1)
+#define X101110 BITS8(0,0, 1,0,1,1,1,0)
#define X110000 BITS8(0,0, 1,1,0,0,0,0)
#define X110001 BITS8(0,0, 1,1,0,0,0,1)
#define X110101 BITS8(0,0, 1,1,0,1,0,1)
@@ -5309,6 +5314,11 @@
011 01110 11 1 00000 111110 n d FNEG Vd.2d, Vn.2d
011 01110 10 1 00000 111110 n d FNEG Vd.4s, Vn.4s
011 01110 00 1 00000 010110 n d NOT Vd.16b, Vn.16b
+
+ 010 01110 11 1 00000 101110 n d ABS Vd.2d, Vn.2d
+ 010 01110 10 1 00000 101110 n d ABS Vd.4s, Vn.4s
+ 010 01110 01 1 00000 101110 n d ABS Vd.8h, Vn.8h
+ 010 01110 00 1 00000 101110 n d ABS Vd.16b, Vn.16b
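+
+ (For example, ABS Vd.2d, Vn.2d with n == 1 and d == 2 should assemble
+ to 0x4EE0B822, going by the field layout above.)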
*/
UInt vD = qregNo(i->ARM64in.VUnaryV.dst);
UInt vN = qregNo(i->ARM64in.VUnaryV.arg);
@@ -5328,6 +5338,18 @@
case ARM64vecu_NOT:
*p++ = X_3_8_5_6_5_5(X011, X01110001, X00000, X010110, vN, vD);
break;
+ case ARM64vecu_ABS64x2:
+ *p++ = X_3_8_5_6_5_5(X010, X01110111, X00000, X101110, vN, vD);
+ break;
+ case ARM64vecu_ABS32x4:
+ *p++ = X_3_8_5_6_5_5(X010, X01110101, X00000, X101110, vN, vD);
+ break;
+ case ARM64vecu_ABS16x8:
+ *p++ = X_3_8_5_6_5_5(X010, X01110011, X00000, X101110, vN, vD);
+ break;
+ case ARM64vecu_ABS8x16:
+ *p++ = X_3_8_5_6_5_5(X010, X01110001, X00000, X101110, vN, vD);
+ break;
default:
goto bad;
}
diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h
index baec464..38b2910 100644
--- a/priv/host_arm64_defs.h
+++ b/priv/host_arm64_defs.h
@@ -307,102 +307,67 @@
typedef
enum {
- ARM64vecb_ADD64x2=120,
- ARM64vecb_ADD32x4,
- ARM64vecb_ADD16x8,
- ARM64vecb_ADD8x16,
- ARM64vecb_SUB64x2,
- ARM64vecb_SUB32x4,
- ARM64vecb_SUB16x8,
- ARM64vecb_SUB8x16,
- ARM64vecb_MUL32x4,
- ARM64vecb_MUL16x8,
- ARM64vecb_MUL8x16,
- ARM64vecb_FADD64x2,
- ARM64vecb_FSUB64x2,
- ARM64vecb_FMUL64x2,
- ARM64vecb_FDIV64x2,
- ARM64vecb_FADD32x4,
- ARM64vecb_FSUB32x4,
- ARM64vecb_FMUL32x4,
- ARM64vecb_FDIV32x4,
- ARM64vecb_UMAX32x4,
- ARM64vecb_UMAX16x8,
- ARM64vecb_UMAX8x16,
- ARM64vecb_UMIN32x4,
- ARM64vecb_UMIN16x8,
- ARM64vecb_UMIN8x16,
- ARM64vecb_SMAX32x4,
- ARM64vecb_SMAX16x8,
- ARM64vecb_SMAX8x16,
- ARM64vecb_SMIN32x4,
- ARM64vecb_SMIN16x8,
- ARM64vecb_SMIN8x16,
+ ARM64vecb_ADD64x2=120, ARM64vecb_ADD32x4,
+ ARM64vecb_ADD16x8, ARM64vecb_ADD8x16,
+ ARM64vecb_SUB64x2, ARM64vecb_SUB32x4,
+ ARM64vecb_SUB16x8, ARM64vecb_SUB8x16,
+ ARM64vecb_MUL32x4,
+ ARM64vecb_MUL16x8, ARM64vecb_MUL8x16,
+ ARM64vecb_FADD64x2, ARM64vecb_FADD32x4,
+ ARM64vecb_FSUB64x2, ARM64vecb_FSUB32x4,
+ ARM64vecb_FMUL64x2, ARM64vecb_FMUL32x4,
+ ARM64vecb_FDIV64x2, ARM64vecb_FDIV32x4,
+ ARM64vecb_UMAX32x4,
+ ARM64vecb_UMAX16x8, ARM64vecb_UMAX8x16,
+ ARM64vecb_UMIN32x4,
+ ARM64vecb_UMIN16x8, ARM64vecb_UMIN8x16,
+ ARM64vecb_SMAX32x4,
+ ARM64vecb_SMAX16x8, ARM64vecb_SMAX8x16,
+ ARM64vecb_SMIN32x4,
+ ARM64vecb_SMIN16x8, ARM64vecb_SMIN8x16,
ARM64vecb_AND,
ARM64vecb_ORR,
ARM64vecb_XOR,
- ARM64vecb_CMEQ64x2,
- ARM64vecb_CMEQ32x4,
- ARM64vecb_CMEQ16x8,
- ARM64vecb_CMEQ8x16,
- ARM64vecb_CMHI64x2, /* >u */
- ARM64vecb_CMHI32x4,
- ARM64vecb_CMHI16x8,
- ARM64vecb_CMHI8x16,
- ARM64vecb_CMGT64x2, /* >s */
- ARM64vecb_CMGT32x4,
- ARM64vecb_CMGT16x8,
- ARM64vecb_CMGT8x16,
- ARM64vecb_FCMEQ64x2,
- ARM64vecb_FCMEQ32x4,
- ARM64vecb_FCMGE64x2,
- ARM64vecb_FCMGE32x4,
- ARM64vecb_FCMGT64x2,
- ARM64vecb_FCMGT32x4,
+ ARM64vecb_CMEQ64x2, ARM64vecb_CMEQ32x4,
+ ARM64vecb_CMEQ16x8, ARM64vecb_CMEQ8x16,
+ ARM64vecb_CMHI64x2, ARM64vecb_CMHI32x4, /* >u */
+ ARM64vecb_CMHI16x8, ARM64vecb_CMHI8x16,
+ ARM64vecb_CMGT64x2, ARM64vecb_CMGT32x4, /* >s */
+ ARM64vecb_CMGT16x8, ARM64vecb_CMGT8x16,
+ ARM64vecb_FCMEQ64x2, ARM64vecb_FCMEQ32x4,
+ ARM64vecb_FCMGE64x2, ARM64vecb_FCMGE32x4,
+ ARM64vecb_FCMGT64x2, ARM64vecb_FCMGT32x4,
ARM64vecb_TBL1,
- ARM64vecb_UZP164x2,
- ARM64vecb_UZP132x4,
- ARM64vecb_UZP116x8,
- ARM64vecb_UZP18x16,
- ARM64vecb_UZP264x2,
- ARM64vecb_UZP232x4,
- ARM64vecb_UZP216x8,
- ARM64vecb_UZP28x16,
- ARM64vecb_ZIP132x4,
- ARM64vecb_ZIP116x8,
- ARM64vecb_ZIP18x16,
- ARM64vecb_ZIP232x4,
- ARM64vecb_ZIP216x8,
- ARM64vecb_ZIP28x16,
+ ARM64vecb_UZP164x2, ARM64vecb_UZP132x4,
+ ARM64vecb_UZP116x8, ARM64vecb_UZP18x16,
+ ARM64vecb_UZP264x2, ARM64vecb_UZP232x4,
+ ARM64vecb_UZP216x8, ARM64vecb_UZP28x16,
+ ARM64vecb_ZIP132x4, ARM64vecb_ZIP116x8,
+ ARM64vecb_ZIP18x16, ARM64vecb_ZIP232x4,
+ ARM64vecb_ZIP216x8, ARM64vecb_ZIP28x16,
ARM64vecb_INVALID
}
ARM64VecBinOp;
typedef
enum {
- ARM64vecu_FNEG64x2=300,
- ARM64vecu_FNEG32x4,
- ARM64vecu_FABS64x2,
- ARM64vecu_FABS32x4,
+ ARM64vecu_FNEG64x2=300, ARM64vecu_FNEG32x4,
+ ARM64vecu_FABS64x2, ARM64vecu_FABS32x4,
ARM64vecu_NOT,
+ ARM64vecu_ABS64x2, ARM64vecu_ABS32x4,
+ ARM64vecu_ABS16x8, ARM64vecu_ABS8x16,
ARM64vecu_INVALID
}
ARM64VecUnaryOp;
typedef
enum {
- ARM64vecsh_USHR64x2=350,
- ARM64vecsh_USHR32x4,
- ARM64vecsh_USHR16x8,
- ARM64vecsh_USHR8x16,
- ARM64vecsh_SSHR64x2,
- ARM64vecsh_SSHR32x4,
- ARM64vecsh_SSHR16x8,
- ARM64vecsh_SSHR8x16,
- ARM64vecsh_SHL64x2,
- ARM64vecsh_SHL32x4,
- ARM64vecsh_SHL16x8,
- ARM64vecsh_SHL8x16,
+ ARM64vecsh_USHR64x2=350, ARM64vecsh_USHR32x4,
+ ARM64vecsh_USHR16x8, ARM64vecsh_USHR8x16,
+ ARM64vecsh_SSHR64x2, ARM64vecsh_SSHR32x4,
+ ARM64vecsh_SSHR16x8, ARM64vecsh_SSHR8x16,
+ ARM64vecsh_SHL64x2, ARM64vecsh_SHL32x4,
+ ARM64vecsh_SHL16x8, ARM64vecsh_SHL8x16,
ARM64vecsh_INVALID
}
ARM64VecShiftOp;
diff --git a/priv/host_arm64_isel.c b/priv/host_arm64_isel.c
index 17d76e4..d12c72d 100644
--- a/priv/host_arm64_isel.c
+++ b/priv/host_arm64_isel.c
@@ -4413,7 +4413,12 @@
case Iop_Abs64Fx2:
case Iop_Abs32Fx4:
case Iop_Neg64Fx2:
- case Iop_Neg32Fx4: {
+ case Iop_Neg32Fx4:
+ case Iop_Abs64x2:
+ case Iop_Abs32x4:
+ case Iop_Abs16x8:
+ case Iop_Abs8x16:
+ {
HReg res = newVRegV(env);
HReg arg = iselV128Expr(env, e->Iex.Unop.arg);
ARM64VecUnaryOp op = ARM64vecu_INVALID;
@@ -4423,6 +4428,10 @@
case Iop_Abs32Fx4: op = ARM64vecu_FABS32x4; break;
case Iop_Neg64Fx2: op = ARM64vecu_FNEG64x2; break;
case Iop_Neg32Fx4: op = ARM64vecu_FNEG32x4; break;
+ case Iop_Abs64x2: op = ARM64vecu_ABS64x2; break;
+ case Iop_Abs32x4: op = ARM64vecu_ABS32x4; break;
+ case Iop_Abs16x8: op = ARM64vecu_ABS16x8; break;
+ case Iop_Abs8x16: op = ARM64vecu_ABS8x16; break;
default: vassert(0);
}
addInstr(env, ARM64Instr_VUnaryV(op, res, arg));
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index fc3ef47..4d65daf 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -726,6 +726,7 @@
case Iop_Abs8x16: vex_printf("Abs8x16"); return;
case Iop_Abs16x8: vex_printf("Abs16x8"); return;
case Iop_Abs32x4: vex_printf("Abs32x4"); return;
+ case Iop_Abs64x2: vex_printf("Abs64x2"); return;
case Iop_Add8x16: vex_printf("Add8x16"); return;
case Iop_Add16x8: vex_printf("Add16x8"); return;
@@ -2910,7 +2911,7 @@
case Iop_Reverse32_8x16: case Iop_Reverse32_16x8:
case Iop_Reverse16_8x16:
case Iop_Neg64Fx2: case Iop_Neg32Fx4:
- case Iop_Abs8x16: case Iop_Abs16x8: case Iop_Abs32x4:
+ case Iop_Abs8x16: case Iop_Abs16x8: case Iop_Abs32x4: case Iop_Abs64x2:
case Iop_CipherSV128:
case Iop_PwBitMtxXpose64x2:
case Iop_ZeroHI64ofV128: case Iop_ZeroHI96ofV128:
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index 7bc68c8..c61ce23 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -1483,7 +1483,7 @@
Iop_PwBitMtxXpose64x2,
/* ABSOLUTE VALUE */
- Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4,
+ Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2,
/* AVERAGING: note: (arg1 + arg2 + 1) >>u 1 */
Iop_Avg8Ux16, Iop_Avg16Ux8, Iop_Avg32Ux4,