arm64: implement:
{sqdmlal,sqdmlsl,sqdmull}{d_s_s[],s_h_h[]}
{sqdmlal,sqdmlsl,sqdmull}{d_s_s,s_h_h}
{sqdmlal,sqdmlsl,sqdmull}{2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)}
{sqdmulh,sqrdmulh} 4s,2s,8h,4h (vector)


git-svn-id: svn://svn.valgrind.org/vex/trunk@2909 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index e779c01..f3b8d54 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -392,7 +392,6 @@
    *t2 = newTempV128();
 }
 
-/* Initialise V128 temporaries en masse. */
 static
 void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
 {
@@ -404,6 +403,19 @@
    *t3 = newTempV128();
 }
 
+//static
+//void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
+//{
+//   vassert(t1 && *t1 == IRTemp_INVALID);
+//   vassert(t2 && *t2 == IRTemp_INVALID);
+//   vassert(t3 && *t3 == IRTemp_INVALID);
+//   vassert(t4 && *t4 == IRTemp_INVALID);
+//   *t1 = newTempV128();
+//   *t2 = newTempV128();
+//   *t3 = newTempV128();
+//   *t4 = newTempV128();
+//}
+
 static
 void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
                     IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
@@ -782,6 +794,20 @@
    }
 }
 
+static IROp mkVecQDMULHIS ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
+   vassert(size < 4);
+   return ops[size];
+}
+
+static IROp mkVecQRDMULHIS ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
+   vassert(size < 4);
+   return ops[size];
+}
+
 /* Generate IR to create 'arg rotated right by imm', for sane values
    of 'ty' and 'imm'. */
 static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
@@ -6362,22 +6388,105 @@
 }
 
 
-/* QCFLAG tracks the SIMD sticky saturation status.  Update the status
-   thusly: if |qres| and |nres| hold the same value, leave QCFLAG
-   unchanged.  Otherwise, set it (implicitly) to 1. */
+/* Generate IR for widening signed vector multiplies.  The operands are
+   split into their low and high 64-bit halves, the lanes of each half
+   are signedly widened and multiplied at the wider width, and the two
+   wide results are returned in new IRTemps |*resHI| and |*resLO|. */
 static
-void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
+void math_MULLS ( /*OUT*/IRTemp* resHI, /*OUT*/IRTemp* resLO,
+                  UInt sizeNarrow, IRTemp argL, IRTemp argR )
+{
+   vassert(sizeNarrow <= 2);
+   newTempsV128_2(resHI, resLO);
+   IRTemp argLhi = newTemp(Ity_I64);
+   IRTemp argLlo = newTemp(Ity_I64);
+   IRTemp argRhi = newTemp(Ity_I64);
+   IRTemp argRlo = newTemp(Ity_I64);
+   assign(argLhi, unop(Iop_V128HIto64, mkexpr(argL)));
+   assign(argLlo, unop(Iop_V128to64,   mkexpr(argL)));
+   assign(argRhi, unop(Iop_V128HIto64, mkexpr(argR)));
+   assign(argRlo, unop(Iop_V128to64,   mkexpr(argR)));
+   IROp opMulls = mkVecMULLS(sizeNarrow);
+   assign(*resHI, binop(opMulls, mkexpr(argLhi), mkexpr(argRhi)));
+   assign(*resLO, binop(opMulls, mkexpr(argLlo), mkexpr(argRlo)));
+}
+
+
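+/* Generate IR for SQDMULH and (when |isR| is True) SQRDMULH.  |*res| and
+   |*sat1q| get the saturating result, computed with the Iop_Q(R)DMulHi ops.
+   |*sat1n| recomputes the same value non-saturatingly: a widening signed
+   multiply, doubled with wide adds, plus a rounding constant for the R
+   variant, keeping only the odd (high-half) narrow lanes.  The caller
+   compares |*sat1q| against |*sat1n| to decide whether to set QCFLAG. */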
+static
+void math_SQDMULH ( /*OUT*/IRTemp* res,
+                    /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
+                    Bool isR, UInt size, IRTemp vN, IRTemp vM )
+{
+   vassert(size == X01 || size == X10); /* s or h only */
+
+   newTempsV128_3(res, sat1q, sat1n);
+
+   IRTemp mullsHI = IRTemp_INVALID, mullsLO = IRTemp_INVALID;
+   math_MULLS(&mullsHI, &mullsLO, size, vN, vM);
+
+   IROp addWide = mkVecADD(size+1);
+
+   if (isR) {
+      assign(*sat1q, binop(mkVecQRDMULHIS(size), mkexpr(vN), mkexpr(vM)));
+
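+      // Rounding constant: 1 << (narrow lane width - 1), replicated
+      // into each wide lane.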
+      Int    rcShift    = size == X01 ? 15 : 31;
+      IRTemp roundConst = math_VEC_DUP_IMM(size+1, 1ULL << rcShift);
+      assign(*sat1n,
+             binop(mkVecCATODDLANES(size),
+                   binop(addWide,
+                         binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
+                         mkexpr(roundConst)),
+                   binop(addWide,
+                         binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO)),
+                         mkexpr(roundConst))));
+   } else {
+      assign(*sat1q, binop(mkVecQDMULHIS(size), mkexpr(vN), mkexpr(vM)));
+
+      assign(*sat1n,
+             binop(mkVecCATODDLANES(size),
+                   binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
+                   binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO))));
+   }
+
+   assign(*res, mkexpr(*sat1q));
+}
+
+
+/* QCFLAG tracks the SIMD sticky saturation status.  Update the status
+   thusly: if, after application of |opZHI| to both |qres| and |nres|,
+   they have the same value, leave QCFLAG unchanged.  Otherwise, set it
+   (implicitly) to 1.  |opZHI| may only be one of the Iop_ZeroHIxxofV128
+   operators, or Iop_INVALID, in which case |qres| and |nres| are used
+   unmodified.  The presence of |opZHI| means this function can be used to
+   generate QCFLAG update code for both scalar and vector SIMD operations.
+*/
+static
+void updateQCFLAGwithDifferenceZHI ( IRTemp qres, IRTemp nres, IROp opZHI )
 {
    IRTemp diff      = newTempV128();
    IRTemp oldQCFLAG = newTempV128();
    IRTemp newQCFLAG = newTempV128();
-   assign(diff,      binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
+   if (opZHI == Iop_INVALID) {
+      assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
+   } else {
+      vassert(opZHI == Iop_ZeroHI64ofV128 || opZHI == Iop_ZeroHI96ofV128);
+      assign(diff, unop(opZHI, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres))));
+   }
    assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
    assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
    stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
 }
 
 
+/* A variant of updateQCFLAGwithDifferenceZHI in which |qres| and |nres|
+   are used unmodified, hence suitable for QCFLAG updates for whole-vector
+   operations. */
+static
+void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
+{
+   updateQCFLAGwithDifferenceZHI(qres, nres, Iop_INVALID);
+}
+
+
 /*------------------------------------------------------------*/
 /*--- SIMD and FP instructions                             ---*/
 /*------------------------------------------------------------*/
@@ -7270,7 +7379,67 @@
 static
 Bool dis_AdvSIMD_scalar_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
 {
+   /* 31 29 28    23   21 20 15     11 9 4
+      01 U  11110 size 1  m  opcode 00 n d
+      Decode fields: u,opcode
+   */
 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
+   if (INSN(31,30) != BITS2(0,1)
+       || INSN(28,24) != BITS5(1,1,1,1,0)
+       || INSN(21,21) != 1
+       || INSN(11,10) != BITS2(0,0)) {
+      return False;
+   }
+   UInt bitU   = INSN(29,29);
+   UInt size   = INSN(23,22);
+   UInt mm     = INSN(20,16);
+   UInt opcode = INSN(15,12);
+   UInt nn     = INSN(9,5);
+   UInt dd     = INSN(4,0);
+   vassert(size < 4);
+
+   if (bitU == 0
+       && (opcode == BITS4(1,1,0,1)
+           || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
+      /* -------- 0,1101  SQDMULL -------- */ // 0 (ks)
+      /* -------- 0,1001  SQDMLAL -------- */ // 1
+      /* -------- 0,1011  SQDMLSL -------- */ // 2
+      /* Widens, and size refers to the narrowed lanes. */
+      UInt ks = 3;
+      switch (opcode) {
+         case BITS4(1,1,0,1): ks = 0; break;
+         case BITS4(1,0,0,1): ks = 1; break;
+         case BITS4(1,0,1,1): ks = 2; break;
+         default: vassert(0);
+      }
+      vassert(ks >= 0 && ks <= 2);
+      if (size == X00 || size == X11) return False;
+      vassert(size <= 2);
+      IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
+      vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
+      newTempsV128_3(&vecN, &vecM, &vecD);
+      assign(vecN, getQReg128(nn));
+      assign(vecM, getQReg128(mm));
+      assign(vecD, getQReg128(dd));
+      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
+                       False/*!is2*/, size, "mas"[ks],
+                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
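+      // The scalar result lives in the lowest wide lane only.  Zero the
+      // rest of the register, and restrict the saturation comparison to
+      // that lane too.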
+      IROp opZHI = mkVecZEROHIxxOFV128(size+1);
+      putQReg128(dd, unop(opZHI, mkexpr(res)));
+      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
+      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
+      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
+         updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
+      }
+      const HChar* nm        = ks == 0 ? "sqdmull"
+                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
+      const HChar  arrNarrow = "bhsd"[size];
+      const HChar  arrWide   = "bhsd"[size+1];
+      DIP("%s %c%d, %c%d, %c%d\n",
+          nm, arrWide, dd, arrNarrow, nn, arrNarrow, mm);
+      return True;
+   }
+
    return False;
 #  undef INSN
 }
@@ -7523,7 +7692,84 @@
 static
 Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
 {
+   /* 31   28    23   21 20 19 15     11   9 4
+      01 U 11111 size L  M  m  opcode H  0 n d
+      Decode fields are: u,size,opcode
+      M is really part of the mm register number.  Individual 
+      cases need to inspect L and H though.
+   */
 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
+   if (INSN(31,30) != BITS2(0,1)
+       || INSN(28,24) != BITS5(1,1,1,1,1) || INSN(10,10) != 0) {
+      return False;
+   }
+   UInt bitU   = INSN(29,29);
+   UInt size   = INSN(23,22);
+   UInt bitL   = INSN(21,21);
+   UInt bitM   = INSN(20,20);
+   UInt mmLO4  = INSN(19,16);
+   UInt opcode = INSN(15,12);
+   UInt bitH   = INSN(11,11);
+   UInt nn     = INSN(9,5);
+   UInt dd     = INSN(4,0);
+   vassert(size < 4);
+   vassert(bitH < 2 && bitM < 2 && bitL < 2);
+
+   if (bitU == 0 
+       && (opcode == BITS4(1,0,1,1)
+           || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
+      /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
+      /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
+      /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
+      /* Widens, and size refers to the narrowed lanes. */
+      UInt ks = 3;
+      switch (opcode) {
+         case BITS4(1,0,1,1): ks = 0; break;
+         case BITS4(0,0,1,1): ks = 1; break;
+         case BITS4(0,1,1,1): ks = 2; break;
+         default: vassert(0);
+      }
+      vassert(ks >= 0 && ks <= 2);
+      UInt mm  = 32; // invalid
+      UInt ix  = 16; // invalid
+      switch (size) {
+         case X00:
+            return False; // h_b_b[] case is not allowed
+         case X01:
+            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
+         case X10:
+            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
+         case X11:
+            return False; // q_d_d[] case is not allowed
+         default:
+            vassert(0);
+      }
+      vassert(mm < 32 && ix < 16);
+      IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
+      vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
+      newTempsV128_2(&vecN, &vecD);
+      assign(vecN, getQReg128(nn));
+      IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
+      assign(vecD, getQReg128(dd));
+      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
+                       False/*!is2*/, size, "mas"[ks],
+                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
+      IROp opZHI = mkVecZEROHIxxOFV128(size+1);
+      putQReg128(dd, unop(opZHI, mkexpr(res)));
+      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
+      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
+      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
+         updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
+      }
+      const HChar* nm        = ks == 0 ? "sqdmull"
+                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
+      const HChar  arrNarrow = "bhsd"[size];
+      const HChar  arrWide   = "bhsd"[size+1];
+      DIP("%s %c%d, %c%d, v%d.%c[%u]\n",
+          nm, arrWide, dd, arrNarrow, nn, dd, arrNarrow, ix);
+      return True;
+   }
+
    return False;
 #  undef INSN
 }
@@ -7814,8 +8060,6 @@
       /* -------- 0,0010 SSUBL{2} -------- */
       /* -------- 1,0010 USUBL{2} -------- */
       /* Widens, and size refers to the narrowed lanes. */
-      const IROp opADD[3] = { Iop_Add16x8,  Iop_Add32x4,  Iop_Add64x2 };
-      const IROp opSUB[3] = { Iop_Sub16x8,  Iop_Sub32x4,  Iop_Sub64x2 };
       if (size == X11) return False;
       vassert(size <= 2);
       Bool   isU   = bitU == 1;
@@ -7823,7 +8067,7 @@
       IRTemp argL  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
       IRTemp argR  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
       IRTemp res   = newTempV128();
-      assign(res, binop(isADD ? opADD[size] : opSUB[size],
+      assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
                         mkexpr(argL), mkexpr(argR)));
       putQReg128(dd, mkexpr(res));
       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
@@ -7966,6 +8210,48 @@
       return True;
    }
 
+   if (bitU == 0
+       && (opcode == BITS4(1,1,0,1)
+           || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
+      /* -------- 0,1101  SQDMULL{2} -------- */ // 0 (ks)
+      /* -------- 0,1001  SQDMLAL{2} -------- */ // 1
+      /* -------- 0,1011  SQDMLSL{2} -------- */ // 2
+      /* Widens, and size refers to the narrowed lanes. */
+      UInt ks = 3;
+      switch (opcode) {
+         case BITS4(1,1,0,1): ks = 0; break;
+         case BITS4(1,0,0,1): ks = 1; break;
+         case BITS4(1,0,1,1): ks = 2; break;
+         default: vassert(0);
+      }
+      vassert(ks >= 0 && ks <= 2);
+      if (size == X00 || size == X11) return False;
+      vassert(size <= 2);
+      IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
+      vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
+      newTempsV128_3(&vecN, &vecM, &vecD);
+      assign(vecN, getQReg128(nn));
+      assign(vecM, getQReg128(mm));
+      assign(vecD, getQReg128(dd));
+      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
+                       is2, size, "mas"[ks],
+                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
+      putQReg128(dd, mkexpr(res));
+      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
+      updateQCFLAGwithDifference(sat1q, sat1n);
+      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
+         updateQCFLAGwithDifference(sat2q, sat2n);
+      }
+      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
+      const HChar* nm        = ks == 0 ? "sqdmull"
+                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
+      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
+          nameQReg128(dd), arrWide,
+          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
+      return True;
+   }
+
    if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
       /* -------- 0,1110  PMULL{2} -------- */
       /* Widens, and size refers to the narrowed lanes. */
@@ -8351,6 +8637,27 @@
       return True;
    }
 
+   if (opcode == BITS5(1,0,1,1,0)) {
+      /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
+      /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
+      if (size == X00 || size == X11) return False;
+      Bool isR = bitU == 1;
+      IRTemp res, sat1q, sat1n, vN, vM;
+      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
+      newTempsV128_2(&vN, &vM);
+      assign(vN, getQReg128(nn));
+      assign(vM, getQReg128(mm));
+      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
+      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
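+      // For the 64-bit forms (bitQ == 0) only the lower half of the
+      // vectors is significant, so ignore the upper half when checking
+      // for saturation.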
+      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
+      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
+      const HChar* arr = nameArr_Q_SZ(bitQ, size);
+      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
+      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
+          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+      return True;
+   }
+
    if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
       /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
       if (bitQ == 0 && size == X11) return False; // implied 1d case
@@ -9080,7 +9387,7 @@
       if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
          updateQCFLAGwithDifference(sat2q, sat2n);
       }
-      const HChar* nm        = ks == 0 ? "sqmull"
+      const HChar* nm        = ks == 0 ? "sqdmull"
                                        : (ks == 1 ? "sqdmlal" : "sqdmlsl");
       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c
index c636982..e55430f 100644
--- a/priv/host_arm64_defs.c
+++ b/priv/host_arm64_defs.c
@@ -947,6 +947,10 @@
       case ARM64vecb_UQSUB8x16:    *nm = "uqsub";    *ar = "16b";  return;
       case ARM64vecb_SQDMULL2DSS:  *nm = "sqdmull";  *ar = "2dss"; return;
       case ARM64vecb_SQDMULL4SHH:  *nm = "sqdmull";  *ar = "4shh"; return;
+      case ARM64vecb_SQDMULH32x4:  *nm = "sqdmulh";  *ar = "4s";   return;
+      case ARM64vecb_SQDMULH16x8:  *nm = "sqdmulh";  *ar = "8h";   return;
+      case ARM64vecb_SQRDMULH32x4: *nm = "sqrdmulh"; *ar = "4s";   return;
+      case ARM64vecb_SQRDMULH16x8: *nm = "sqrdmulh"; *ar = "8h";   return;
       default: vpanic("showARM64VecBinOp");
    }
 }
@@ -3506,6 +3510,7 @@
 #define X100101  BITS8(0,0, 1,0,0,1,0,1)
 #define X100110  BITS8(0,0, 1,0,0,1,1,0)
 #define X100111  BITS8(0,0, 1,0,0,1,1,1)
+#define X101101  BITS8(0,0, 1,0,1,1,0,1)
 #define X101110  BITS8(0,0, 1,0,1,1,1,0)
 #define X110000  BITS8(0,0, 1,1,0,0,0,0)
 #define X110001  BITS8(0,0, 1,1,0,0,0,1)
@@ -5195,6 +5200,11 @@
 
             000 01110 10 1 m  110100 n d   SQDMULL Vd.2d, Vn.2s, Vm.2s
             000 01110 01 1 m  110100 n d   SQDMULL Vd.4s, Vn.4h, Vm.4h
+
+            010 01110 10 1 m  101101 n d   SQDMULH   Vd.4s,  Vn.4s,  Vm.4s
+            010 01110 01 1 m  101101 n d   SQDMULH   Vd.8h,  Vn.8h,  Vm.8h
+            011 01110 10 1 m  101101 n d   SQRDMULH  Vd.4s,  Vn.4s,  Vm.4s
+            011 01110 01 1 m  101101 n d   SQRDMULH  Vd.8h,  Vn.8h,  Vm.8h
          */
          UInt vD = qregNo(i->ARM64in.VBinV.dst);
          UInt vN = qregNo(i->ARM64in.VBinV.argL);
@@ -5505,6 +5515,19 @@
                *p++ = X_3_8_5_6_5_5(X000, X01110011, vM, X110100, vN, vD);
                break;
 
+            case ARM64vecb_SQDMULH32x4:
+               *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X101101, vN, vD);
+               break;
+            case ARM64vecb_SQDMULH16x8:
+               *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X101101, vN, vD);
+               break;
+            case ARM64vecb_SQRDMULH32x4:
+               *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X101101, vN, vD);
+               break;
+            case ARM64vecb_SQRDMULH16x8:
+               *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X101101, vN, vD);
+               break;
+
             default:
                goto bad;
          }
diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h
index bce1911..90bf4c1 100644
--- a/priv/host_arm64_defs.h
+++ b/priv/host_arm64_defs.h
@@ -360,7 +360,10 @@
       ARM64vecb_UQSUB16x8,   ARM64vecb_UQSUB8x16,
                              ARM64vecb_SQDMULL2DSS,
       ARM64vecb_SQDMULL4SHH,
-
+                             ARM64vecb_SQDMULH32x4,
+      ARM64vecb_SQDMULH16x8,
+                             ARM64vecb_SQRDMULH32x4,
+      ARM64vecb_SQRDMULH16x8,
       ARM64vecb_INVALID
    }
    ARM64VecBinOp;
diff --git a/priv/host_arm64_isel.c b/priv/host_arm64_isel.c
index 97cfde0..dfbe146 100644
--- a/priv/host_arm64_isel.c
+++ b/priv/host_arm64_isel.c
@@ -4951,6 +4951,8 @@
          case Iop_QSub16Sx8: case Iop_QSub8Sx16:
          case Iop_QSub64Ux2: case Iop_QSub32Ux4:
          case Iop_QSub16Ux8: case Iop_QSub8Ux16:
+         case Iop_QDMulHi32Sx4:  case Iop_QDMulHi16Sx8: 
+         case Iop_QRDMulHi32Sx4: case Iop_QRDMulHi16Sx8: 
          {
             HReg res  = newVRegV(env);
             HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
@@ -5032,22 +5034,26 @@
                case Iop_InterleaveLO8x16: op = ARM64vecb_ZIP18x16; sw = True;
                                           break;
                case Iop_PolynomialMul8x16: op = ARM64vecb_PMUL8x16; break;
-               case Iop_QAdd64Sx2:  op = ARM64vecb_SQADD64x2; break;
-               case Iop_QAdd32Sx4:  op = ARM64vecb_SQADD32x4; break;
-               case Iop_QAdd16Sx8:  op = ARM64vecb_SQADD16x8; break;
-               case Iop_QAdd8Sx16:  op = ARM64vecb_SQADD8x16; break;
-               case Iop_QAdd64Ux2:  op = ARM64vecb_UQADD64x2; break;
-               case Iop_QAdd32Ux4:  op = ARM64vecb_UQADD32x4; break;
-               case Iop_QAdd16Ux8:  op = ARM64vecb_UQADD16x8; break;
-               case Iop_QAdd8Ux16:  op = ARM64vecb_UQADD8x16; break;
-               case Iop_QSub64Sx2:  op = ARM64vecb_SQSUB64x2; break;
-               case Iop_QSub32Sx4:  op = ARM64vecb_SQSUB32x4; break;
-               case Iop_QSub16Sx8:  op = ARM64vecb_SQSUB16x8; break;
-               case Iop_QSub8Sx16:  op = ARM64vecb_SQSUB8x16; break;
-               case Iop_QSub64Ux2:  op = ARM64vecb_UQSUB64x2; break;
-               case Iop_QSub32Ux4:  op = ARM64vecb_UQSUB32x4; break;
-               case Iop_QSub16Ux8:  op = ARM64vecb_UQSUB16x8; break;
-               case Iop_QSub8Ux16:  op = ARM64vecb_UQSUB8x16; break;
+               case Iop_QAdd64Sx2:      op = ARM64vecb_SQADD64x2; break;
+               case Iop_QAdd32Sx4:      op = ARM64vecb_SQADD32x4; break;
+               case Iop_QAdd16Sx8:      op = ARM64vecb_SQADD16x8; break;
+               case Iop_QAdd8Sx16:      op = ARM64vecb_SQADD8x16; break;
+               case Iop_QAdd64Ux2:      op = ARM64vecb_UQADD64x2; break;
+               case Iop_QAdd32Ux4:      op = ARM64vecb_UQADD32x4; break;
+               case Iop_QAdd16Ux8:      op = ARM64vecb_UQADD16x8; break;
+               case Iop_QAdd8Ux16:      op = ARM64vecb_UQADD8x16; break;
+               case Iop_QSub64Sx2:      op = ARM64vecb_SQSUB64x2; break;
+               case Iop_QSub32Sx4:      op = ARM64vecb_SQSUB32x4; break;
+               case Iop_QSub16Sx8:      op = ARM64vecb_SQSUB16x8; break;
+               case Iop_QSub8Sx16:      op = ARM64vecb_SQSUB8x16; break;
+               case Iop_QSub64Ux2:      op = ARM64vecb_UQSUB64x2; break;
+               case Iop_QSub32Ux4:      op = ARM64vecb_UQSUB32x4; break;
+               case Iop_QSub16Ux8:      op = ARM64vecb_UQSUB16x8; break;
+               case Iop_QSub8Ux16:      op = ARM64vecb_UQSUB8x16; break;
+               case Iop_QDMulHi32Sx4:   op = ARM64vecb_SQDMULH32x4; break;
+               case Iop_QDMulHi16Sx8:   op = ARM64vecb_SQDMULH16x8; break;
+               case Iop_QRDMulHi32Sx4:  op = ARM64vecb_SQRDMULH32x4; break;
+               case Iop_QRDMulHi16Sx8:  op = ARM64vecb_SQRDMULH16x8; break;
                default: vassert(0);
             }
             if (sw) {