arm64: implement: abs d_d, neg d_d, abs std7_std7, addhn, subhn, raddhn, rsubhn


git-svn-id: svn://svn.valgrind.org/vex/trunk@2877 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index 71e20aa..141b456 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -273,6 +273,12 @@
    return IRExpr_Const(IRConst_U32(i));
 }
 
+static IRExpr* mkU16 ( UInt i )
+{
+   vassert(i < 65536);
+   return IRExpr_Const(IRConst_U16(i));
+}
+
 static IRExpr* mkU8 ( UInt i )
 {
    vassert(i < 256);
@@ -3183,7 +3189,7 @@
 {
    vassert(bitQ <= 1 && size <= 3);
    const HChar* nms[8]
-      = { "2d", "4s", "8h", "16b", "1d", "2s", "4h", "8b" };
+      = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" };
    UInt ix = (bitQ << 2) | size;
    vassert(ix < 8);
    return nms[ix];
@@ -5516,6 +5522,36 @@
 }
 
 
+/* Let |new64| be a V128 in which only the lower 64 bits are interesting,
+   and the upper can contain any value -- it is ignored.  If |is2| is False,
+   generate IR to put |new64| in the lower half of vector reg |dd| and zero
+   the upper half.  If |is2| is True, generate IR to put |new64| in the upper
+   half of vector reg |dd| and leave the lower half unchanged.  This
+   simulates the behaviour of the "foo/foo2" instructions, in which the
+   destination lanes are half the width of the source lanes, for example
+   addhn/addhn2.
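+
+   Resulting layout, writing each 128-bit register as [ hi64 : lo64 ]:
+      is2 == False:  Vd = [ 0        : new64.lo ]
+      is2 == True:   Vd = [ new64.lo : old Vd.lo ]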
+*/
+static
+void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 )
+{
+   if (is2) {
+      /* Keep only the lower 64 bits of the old contents of Vdd, and OR
+         in |new64| as the new upper 64 bits. */
+      IRTemp t_zero_oldLO = newTemp(Ity_V128);
+      assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
+      IRTemp t_newHI_zero = newTemp(Ity_V128);
+      assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64),
+                                                       mkV128(0x0000)));
+      IRTemp res = newTemp(Ity_V128);
+      assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO),
+                                    mkexpr(t_newHI_zero)));
+      putQReg128(dd, mkexpr(res));
+   } else {
+      /* Put |new64| in the lower half and zero the upper half. */
+      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64)));
+   }
+}
+
+
 static
 Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn)
 {
@@ -6153,6 +6189,22 @@
       return True;
    }
 
+   if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
+      /* -------- 0,11,01011 ABS d_d -------- */
+      putQReg128(dd, unop(Iop_ZeroHI64ofV128,
+                          unop(Iop_Abs64x2, getQReg128(nn))));
+      DIP("abs d%u, d%u\n", dd, nn);
+      return True;
+   }
+
+   if (bitU == 1 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
+      /* -------- 1,11,01011 NEG d_d -------- */
+      putQReg128(dd, unop(Iop_ZeroHI64ofV128,
+                          binop(Iop_Sub64x2, mkV128(0x0000), getQReg128(nn))));
+      DIP("neg d%u, d%u\n", dd, nn);
+      return True;
+   }
+
 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
    return False;
 #  undef INSN
@@ -6338,7 +6390,80 @@
 static
 Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
 {
+   /* 31 30 29 28    23   21 20 15     11 9 4
+      0  Q  U  01110 size 1  m  opcode 00 n d
+      Decode fields: u,opcode
+   */
 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
+   if (INSN(31,31) != 0
+       || INSN(28,24) != BITS5(0,1,1,1,0)
+       || INSN(21,21) != 1
+       || INSN(11,10) != BITS2(0,0)) {
+      return False;
+   }
+   UInt bitQ   = INSN(30,30);
+   UInt bitU   = INSN(29,29);
+   UInt size   = INSN(23,22);
+   UInt mm     = INSN(20,16);
+   UInt opcode = INSN(15,12);
+   UInt nn     = INSN(9,5);
+   UInt dd     = INSN(4,0);
+   vassert(size < 4);
+   Bool is2    = bitQ == 1;
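+   /* For the narrowing ops below, Q selects the "2" form, which writes
+      the upper half of Vd and leaves the lower half unchanged. */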
+
+   if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
+      /* -------- 0,0100  ADDHN{2} -------- */
+      /* -------- 1,0100 RADDHN{2} -------- */
+      /* -------- 0,0110  SUBHN{2} -------- */
+      /* -------- 1,0110 RSUBHN{2} -------- */
+      /* Narrows, and size refers to the narrowed lanes. */
+      if (size == X11) return False;
+      vassert(size <= 2);
+      const IROp opADD[3] = { Iop_Add16x8,  Iop_Add32x4,  Iop_Add64x2  };
+      const IROp opSUB[3] = { Iop_Sub16x8,  Iop_Sub32x4,  Iop_Sub64x2  };
+      const IROp opSHR[3] = { Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
+      const UInt shift[3] = { 8,            16,           32           };
+      const IROp opCAT[3] = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
+                              Iop_CatEvenLanes32x4 };
+      Bool isADD = opcode == BITS4(0,1,0,0);
+      Bool isR   = bitU == 1;
+      /* Combined elements in wide lanes */
+      IRTemp  wide  = newTemp(Ity_V128);
+      IRExpr* wideE = binop(isADD ? opADD[size] : opSUB[size],
+                            getQReg128(nn), getQReg128(mm));
+      if (isR) {
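+         /* The rounding forms add a constant of 1 << (shift[size]-1) to
+            each wide lane before the top half is taken. */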
+         IRType ty  = Ity_INVALID;
+         IRTemp rcS = IRTemp_INVALID;
+         switch (size) {
+            case X00: ty  = Ity_I16;
+                      rcS = newTemp(ty); assign(rcS, mkU16(0x80)); break;
+            case X01: ty  = Ity_I32;
+                      rcS = newTemp(ty); assign(rcS, mkU32(0x8000)); break;
+            case X10: ty  = Ity_I64;
+                      rcS = newTemp(ty); assign(rcS, mkU64(0x80000000)); break;
+            default:  vassert(0);
+         }
+         IRTemp rcV = math_DUP_TO_V128(rcS, ty);
+         wideE = binop(opADD[size], wideE, mkexpr(rcV));
+      }
+      assign(wide, wideE);
+      /* Top halves of elements, still in wide lanes */
+      IRTemp shrd = newTemp(Ity_V128);
+      assign(shrd, binop(opSHR[size], mkexpr(wide), mkU8(shift[size])));
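+      /* After the shift each narrowed value occupies the lower
+         (even-numbered) narrow lane of its wide lane, so CatEvenLanes of
+         |shrd| with itself compacts them. */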
+      /* Elements now compacted into lower 64 bits */
+      IRTemp new64 = newTemp(Ity_V128);
+      assign(new64, binop(opCAT[size], mkexpr(shrd), mkexpr(shrd)));
+      putLO64andZUorPutHI64(is2, dd, new64);
+      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
+      const HChar* nm = isADD ? (isR ? "raddhn" : "addhn")
+                              : (isR ? "rsubhn" : "subhn");
+      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
+          nameQReg128(dd), arrNarrow,
+          nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
+      return True;
+   }
+
    return False;
 #  undef INSN
 }
@@ -6858,6 +6983,20 @@
       return True;
    }
 
+   if (bitU == 0 && opcode == BITS5(0,1,0,1,1)) {
+      /* -------- 0,xx,01011: ABS std7_std7 -------- */
+      if (bitQ == 0 && size == X11) return False; // implied 1d case
+      const IROp opABS[4]
+         = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 };
+      IRTemp res = newTemp(Ity_V128);
+      assign(res, unop(opABS[size], getQReg128(nn)));
+      putQReg128(dd, bitQ == 0 ? unop(Iop_ZeroHI64ofV128, mkexpr(res))
+                               : mkexpr(res));
+      const HChar* arr = nameArr_Q_SZ(bitQ, size);
+      DIP("abs %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
+      return True;
+   }
+
    if (bitU == 1 && opcode == BITS5(0,1,0,1,1)) {
       /* -------- 1,xx,01011: NEG std7_std7 -------- */
       if (bitQ == 0 && size == X11) return False; // implied 1d case
diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c
index 29f78db..5c5988a 100644
--- a/priv/host_arm64_defs.c
+++ b/priv/host_arm64_defs.c
@@ -929,6 +929,10 @@
       case ARM64vecu_FNEG32x4: *nm = "fneg "; *ar = "4s";  return;
       case ARM64vecu_FABS64x2: *nm = "fabs "; *ar = "2d";  return;
       case ARM64vecu_FABS32x4: *nm = "fabs "; *ar = "4s";  return;
+      case ARM64vecu_ABS64x2:  *nm = "abs  "; *ar = "2d";  return;
+      case ARM64vecu_ABS32x4:  *nm = "abs  "; *ar = "4s";  return;
+      case ARM64vecu_ABS16x8:  *nm = "abs  "; *ar = "8h";  return;
+      case ARM64vecu_ABS8x16:  *nm = "abs  "; *ar = "16b"; return;
       case ARM64vecu_NOT:      *nm = "not  "; *ar = "all"; return;
       default: vpanic("showARM64VecUnaryOp");
    }
@@ -3422,6 +3426,7 @@
 #define X100101  BITS8(0,0, 1,0,0,1,0,1)
 #define X100110  BITS8(0,0, 1,0,0,1,1,0)
 #define X100111  BITS8(0,0, 1,0,0,1,1,1)
+#define X101110  BITS8(0,0, 1,0,1,1,1,0)
 #define X110000  BITS8(0,0, 1,1,0,0,0,0)
 #define X110001  BITS8(0,0, 1,1,0,0,0,1)
 #define X110101  BITS8(0,0, 1,1,0,1,0,1)
@@ -5309,6 +5314,11 @@
             011 01110 11 1 00000 111110 n d  FNEG Vd.2d,  Vn.2d
             011 01110 10 1 00000 111110 n d  FNEG Vd.4s,  Vn.4s
             011 01110 00 1 00000 010110 n d  NOT  Vd.16b, Vn.16b
+
+            010 01110 11 1 00000 101110 n d  ABS  Vd.2d,  Vn.2d
+            010 01110 10 1 00000 101110 n d  ABS  Vd.4s,  Vn.4s
+            010 01110 01 1 00000 101110 n d  ABS  Vd.8h,  Vn.8h
+            010 01110 00 1 00000 101110 n d  ABS  Vd.16b, Vn.16b
          */
          UInt vD = qregNo(i->ARM64in.VUnaryV.dst);
          UInt vN = qregNo(i->ARM64in.VUnaryV.arg);
@@ -5328,6 +5338,18 @@
             case ARM64vecu_NOT:
                *p++ = X_3_8_5_6_5_5(X011, X01110001, X00000, X010110, vN, vD);
                break;
+            case ARM64vecu_ABS64x2:
+               *p++ = X_3_8_5_6_5_5(X010, X01110111, X00000, X101110, vN, vD);
+               break;
+            case ARM64vecu_ABS32x4:
+               *p++ = X_3_8_5_6_5_5(X010, X01110101, X00000, X101110, vN, vD);
+               break;
+            case ARM64vecu_ABS16x8:
+               *p++ = X_3_8_5_6_5_5(X010, X01110011, X00000, X101110, vN, vD);
+               break;
+            case ARM64vecu_ABS8x16:
+               *p++ = X_3_8_5_6_5_5(X010, X01110001, X00000, X101110, vN, vD);
+               break;
             default:
                goto bad;
          }
diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h
index baec464..38b2910 100644
--- a/priv/host_arm64_defs.h
+++ b/priv/host_arm64_defs.h
@@ -307,102 +307,67 @@
 
 typedef
    enum {
-      ARM64vecb_ADD64x2=120,
-      ARM64vecb_ADD32x4,
-      ARM64vecb_ADD16x8,
-      ARM64vecb_ADD8x16,
-      ARM64vecb_SUB64x2,
-      ARM64vecb_SUB32x4,
-      ARM64vecb_SUB16x8,
-      ARM64vecb_SUB8x16,
-      ARM64vecb_MUL32x4,
-      ARM64vecb_MUL16x8,
-      ARM64vecb_MUL8x16,
-      ARM64vecb_FADD64x2,
-      ARM64vecb_FSUB64x2,
-      ARM64vecb_FMUL64x2,
-      ARM64vecb_FDIV64x2,
-      ARM64vecb_FADD32x4,
-      ARM64vecb_FSUB32x4,
-      ARM64vecb_FMUL32x4,
-      ARM64vecb_FDIV32x4,
-      ARM64vecb_UMAX32x4,
-      ARM64vecb_UMAX16x8,
-      ARM64vecb_UMAX8x16,
-      ARM64vecb_UMIN32x4,
-      ARM64vecb_UMIN16x8,
-      ARM64vecb_UMIN8x16,
-      ARM64vecb_SMAX32x4,
-      ARM64vecb_SMAX16x8,
-      ARM64vecb_SMAX8x16,
-      ARM64vecb_SMIN32x4,
-      ARM64vecb_SMIN16x8,
-      ARM64vecb_SMIN8x16,
+      ARM64vecb_ADD64x2=120, ARM64vecb_ADD32x4,
+      ARM64vecb_ADD16x8,     ARM64vecb_ADD8x16,
+      ARM64vecb_SUB64x2,     ARM64vecb_SUB32x4,
+      ARM64vecb_SUB16x8,     ARM64vecb_SUB8x16,
+                             ARM64vecb_MUL32x4,
+      ARM64vecb_MUL16x8,     ARM64vecb_MUL8x16,
+      ARM64vecb_FADD64x2,    ARM64vecb_FADD32x4,
+      ARM64vecb_FSUB64x2,    ARM64vecb_FSUB32x4,
+      ARM64vecb_FMUL64x2,    ARM64vecb_FMUL32x4,
+      ARM64vecb_FDIV64x2,    ARM64vecb_FDIV32x4,
+                             ARM64vecb_UMAX32x4,
+      ARM64vecb_UMAX16x8,    ARM64vecb_UMAX8x16,
+                             ARM64vecb_UMIN32x4,
+      ARM64vecb_UMIN16x8,    ARM64vecb_UMIN8x16,
+                             ARM64vecb_SMAX32x4,
+      ARM64vecb_SMAX16x8,    ARM64vecb_SMAX8x16,
+                             ARM64vecb_SMIN32x4,
+      ARM64vecb_SMIN16x8,    ARM64vecb_SMIN8x16,
       ARM64vecb_AND,
       ARM64vecb_ORR,
       ARM64vecb_XOR,
-      ARM64vecb_CMEQ64x2,
-      ARM64vecb_CMEQ32x4,
-      ARM64vecb_CMEQ16x8,
-      ARM64vecb_CMEQ8x16,
-      ARM64vecb_CMHI64x2, /* >u */
-      ARM64vecb_CMHI32x4,
-      ARM64vecb_CMHI16x8,
-      ARM64vecb_CMHI8x16,
-      ARM64vecb_CMGT64x2, /* >s */
-      ARM64vecb_CMGT32x4,
-      ARM64vecb_CMGT16x8,
-      ARM64vecb_CMGT8x16,
-      ARM64vecb_FCMEQ64x2,
-      ARM64vecb_FCMEQ32x4,
-      ARM64vecb_FCMGE64x2,
-      ARM64vecb_FCMGE32x4,
-      ARM64vecb_FCMGT64x2,
-      ARM64vecb_FCMGT32x4,
+      ARM64vecb_CMEQ64x2,    ARM64vecb_CMEQ32x4,
+      ARM64vecb_CMEQ16x8,    ARM64vecb_CMEQ8x16,
+      ARM64vecb_CMHI64x2,    ARM64vecb_CMHI32x4, /* >u */
+      ARM64vecb_CMHI16x8,    ARM64vecb_CMHI8x16,
+      ARM64vecb_CMGT64x2,    ARM64vecb_CMGT32x4, /* >s */
+      ARM64vecb_CMGT16x8,    ARM64vecb_CMGT8x16,
+      ARM64vecb_FCMEQ64x2,   ARM64vecb_FCMEQ32x4,
+      ARM64vecb_FCMGE64x2,   ARM64vecb_FCMGE32x4,
+      ARM64vecb_FCMGT64x2,   ARM64vecb_FCMGT32x4,
       ARM64vecb_TBL1,
-      ARM64vecb_UZP164x2,
-      ARM64vecb_UZP132x4,
-      ARM64vecb_UZP116x8,
-      ARM64vecb_UZP18x16,
-      ARM64vecb_UZP264x2,
-      ARM64vecb_UZP232x4,
-      ARM64vecb_UZP216x8,
-      ARM64vecb_UZP28x16,
-      ARM64vecb_ZIP132x4,
-      ARM64vecb_ZIP116x8,
-      ARM64vecb_ZIP18x16,
-      ARM64vecb_ZIP232x4,
-      ARM64vecb_ZIP216x8,
-      ARM64vecb_ZIP28x16,
+      ARM64vecb_UZP164x2,    ARM64vecb_UZP132x4,
+      ARM64vecb_UZP116x8,    ARM64vecb_UZP18x16,
+      ARM64vecb_UZP264x2,    ARM64vecb_UZP232x4,
+      ARM64vecb_UZP216x8,    ARM64vecb_UZP28x16,
+      ARM64vecb_ZIP132x4,    ARM64vecb_ZIP116x8,
+      ARM64vecb_ZIP18x16,    ARM64vecb_ZIP232x4,
+      ARM64vecb_ZIP216x8,    ARM64vecb_ZIP28x16,
       ARM64vecb_INVALID
    }
    ARM64VecBinOp;
 
 typedef
    enum {
-      ARM64vecu_FNEG64x2=300,
-      ARM64vecu_FNEG32x4,
-      ARM64vecu_FABS64x2,
-      ARM64vecu_FABS32x4,
+      ARM64vecu_FNEG64x2=300, ARM64vecu_FNEG32x4,
+      ARM64vecu_FABS64x2,     ARM64vecu_FABS32x4,
       ARM64vecu_NOT,
+      ARM64vecu_ABS64x2,      ARM64vecu_ABS32x4,
+      ARM64vecu_ABS16x8,      ARM64vecu_ABS8x16,
       ARM64vecu_INVALID
    }
    ARM64VecUnaryOp;
 
 typedef
    enum {
-      ARM64vecsh_USHR64x2=350,
-      ARM64vecsh_USHR32x4,
-      ARM64vecsh_USHR16x8,
-      ARM64vecsh_USHR8x16,
-      ARM64vecsh_SSHR64x2,
-      ARM64vecsh_SSHR32x4,
-      ARM64vecsh_SSHR16x8,
-      ARM64vecsh_SSHR8x16,
-      ARM64vecsh_SHL64x2,
-      ARM64vecsh_SHL32x4,
-      ARM64vecsh_SHL16x8,
-      ARM64vecsh_SHL8x16,
+      ARM64vecsh_USHR64x2=350, ARM64vecsh_USHR32x4,
+      ARM64vecsh_USHR16x8,     ARM64vecsh_USHR8x16,
+      ARM64vecsh_SSHR64x2,     ARM64vecsh_SSHR32x4,
+      ARM64vecsh_SSHR16x8,     ARM64vecsh_SSHR8x16,
+      ARM64vecsh_SHL64x2,      ARM64vecsh_SHL32x4,
+      ARM64vecsh_SHL16x8,      ARM64vecsh_SHL8x16,
       ARM64vecsh_INVALID
    }
    ARM64VecShiftOp;
diff --git a/priv/host_arm64_isel.c b/priv/host_arm64_isel.c
index 17d76e4..d12c72d 100644
--- a/priv/host_arm64_isel.c
+++ b/priv/host_arm64_isel.c
@@ -4413,7 +4413,12 @@
          case Iop_Abs64Fx2:
          case Iop_Abs32Fx4:
          case Iop_Neg64Fx2:
-         case Iop_Neg32Fx4: {
+         case Iop_Neg32Fx4:
+         case Iop_Abs64x2:
+         case Iop_Abs32x4:
+         case Iop_Abs16x8:
+         case Iop_Abs8x16:
+         {
             HReg res = newVRegV(env);
             HReg arg = iselV128Expr(env, e->Iex.Unop.arg);
             ARM64VecUnaryOp op = ARM64vecu_INVALID;
@@ -4423,6 +4428,10 @@
                case Iop_Abs32Fx4: op = ARM64vecu_FABS32x4; break;
                case Iop_Neg64Fx2: op = ARM64vecu_FNEG64x2; break;
                case Iop_Neg32Fx4: op = ARM64vecu_FNEG32x4; break;
+               case Iop_Abs64x2:  op = ARM64vecu_ABS64x2;  break;
+               case Iop_Abs32x4:  op = ARM64vecu_ABS32x4;  break;
+               case Iop_Abs16x8:  op = ARM64vecu_ABS16x8;  break;
+               case Iop_Abs8x16:  op = ARM64vecu_ABS8x16;  break;
                default: vassert(0);
             }
             addInstr(env, ARM64Instr_VUnaryV(op, res, arg));
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index fc3ef47..4d65daf 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -726,6 +726,7 @@
       case Iop_Abs8x16: vex_printf("Abs8x16"); return;
       case Iop_Abs16x8: vex_printf("Abs16x8"); return;
       case Iop_Abs32x4: vex_printf("Abs32x4"); return;
+      case Iop_Abs64x2: vex_printf("Abs64x2"); return;
 
       case Iop_Add8x16:   vex_printf("Add8x16"); return;
       case Iop_Add16x8:   vex_printf("Add16x8"); return;
@@ -2910,7 +2911,7 @@
       case Iop_Reverse32_8x16: case Iop_Reverse32_16x8:
       case Iop_Reverse16_8x16:
       case Iop_Neg64Fx2: case Iop_Neg32Fx4:
-      case Iop_Abs8x16: case Iop_Abs16x8: case Iop_Abs32x4:
+      case Iop_Abs8x16: case Iop_Abs16x8: case Iop_Abs32x4: case Iop_Abs64x2:
       case Iop_CipherSV128:
       case Iop_PwBitMtxXpose64x2:
       case Iop_ZeroHI64ofV128:  case Iop_ZeroHI96ofV128:
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index 7bc68c8..c61ce23 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -1483,7 +1483,7 @@
       Iop_PwBitMtxXpose64x2,
 
       /* ABSOLUTE VALUE */
-      Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4,
+      Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2,
 
       /* AVERAGING: note: (arg1 + arg2 + 1) >>u 1 */
       Iop_Avg8Ux16, Iop_Avg16Ux8, Iop_Avg32Ux4,