arm64: implement: sqneg, {u,s}q{add,sub} (scalar),
{sqdmlal,sqdmlsl,sqdmull} (vector x element)

As part of this, rename Iop_QDMulLong* to Iop_QDMull* so as to be
consistent with their non-saturating equivalents.



git-svn-id: svn://svn.valgrind.org/vex/trunk@2907 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index 1b91c61..e779c01 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -384,6 +384,27 @@
 
 /* Initialise V128 temporaries en masse. */
 static
+void newTempsV128_2(IRTemp* t1, IRTemp* t2)
+{
+   vassert(t1 && *t1 == IRTemp_INVALID);
+   vassert(t2 && *t2 == IRTemp_INVALID);
+   *t1 = newTempV128();
+   *t2 = newTempV128();
+}
+
+/* Initialise V128 temporaries en masse. */
+static
+void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
+{
+   vassert(t1 && *t1 == IRTemp_INVALID);
+   vassert(t2 && *t2 == IRTemp_INVALID);
+   vassert(t3 && *t3 == IRTemp_INVALID);
+   *t1 = newTempV128();
+   *t2 = newTempV128();
+   *t3 = newTempV128();
+}
+
+static
 void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
                     IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
 {
@@ -710,6 +731,13 @@
    return ops[sizeNarrow];
 }
 
+static IROp mkVecQDMULLS ( UInt sizeNarrow ) {
+   const IROp ops[4]
+      = { Iop_INVALID, Iop_QDMull16Sx4, Iop_QDMull32Sx2, Iop_INVALID };
+   vassert(sizeNarrow < 3);
+   return ops[sizeNarrow];
+}
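+/* (Only the h->s and s->d widenings exist for the QDMull family; the
+   8-bit and 64-bit narrow-lane slots are deliberately Iop_INVALID.) */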
+
 static IROp mkVecCMPEQ ( UInt size ) {
    const IROp ops[4]
       = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, Iop_CmpEQ64x2 };
@@ -6223,15 +6251,117 @@
 }
 
 
-static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( IRTemp src, UInt size )
+/* Compute vector SQNEG at lane size |size| for |srcE|, returning
+   the q result in |*qneg| and the normal result in |*nneg|. */
+static
+void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg,
+                  IRExpr* srcE, UInt size )
+{
+   IRTemp src = IRTemp_INVALID;
+   newTempsV128_3(&src, nneg, qneg);
+   assign(src,   srcE);
+   assign(*nneg, binop(mkVecSUB(size),   mkV128(0x0000), mkexpr(src)));
+   assign(*qneg, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
+}
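+/* Note: the q and n results can only differ when a lane holds the most
+   negative representable value (e.g. 0x80 for byte lanes): the saturating
+   negation gives the maximum positive value, whereas the plain negation
+   wraps back to the input.  That difference is what
+   updateQCFLAGwithDifference detects in the callers below. */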
+
+
+static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( IRExpr* srcE, UInt size )
 {
    vassert(size < 4);
    IRTemp t = newTempV128();
-   assign(t, unop(mkVecZEROHIxxOFV128(size), mkexpr(src)));
+   assign(t, unop(mkVecZEROHIxxOFV128(size), srcE));
    return t;
 }
 
 
+/* Generate IR to compute vector widening MULL from either the lower
+   (is2==False) or upper (is2==True) halves of vecN and vecM.  The
+   widening multiplies are unsigned when isU==True and signed when
+   isU==False.  |size| indicates the narrow lane size.  Optionally,
+   the product may be added to or subtracted from vecD, at the wide lane
+   size.  This happens when |mas| is 'a' (add) or 's' (sub).  When |mas|
+   is 'm' (only multiply) then the accumulate part does not happen, and
+   |vecD| is expected to == IRTemp_INVALID.
+
+   Only size==0 (h_b_b), size==1 (s_h_h) and size==2 (d_s_s) variants
+   are allowed.  The result is computed into a new IRTemp, which is
+   returned in *res. */
+static
+void math_MULL_ACC ( /*OUT*/IRTemp* res,
+                     Bool is2, Bool isU, UInt size, HChar mas,
+                     IRTemp vecN, IRTemp vecM, IRTemp vecD )
+{
+   vassert(res && *res == IRTemp_INVALID);
+   vassert(size <= 2);
+   vassert(mas == 'm' || mas == 'a' || mas == 's');
+   if (mas == 'm') vassert(vecD == IRTemp_INVALID);
+   IROp   mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
+   IROp   accOp = (mas == 'a') ? mkVecADD(size+1) 
+                  : (mas == 's' ? mkVecSUB(size+1)
+                  : Iop_INVALID);
+   IRTemp mul   = math_BINARY_WIDENING_V128(is2, mulOp, 
+                                            mkexpr(vecN), mkexpr(vecM));
+   *res = newTempV128();
+   assign(*res, mas == 'm' ? mkexpr(mul) 
+                           : binop(accOp, mkexpr(vecD), mkexpr(mul)));
+}
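+/* math_MULL_ACC is used by both the three-different (vector x vector) and
+   the vector x indexed-element MULL/MLAL/MLSL decode cases below. */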
+
+
+/* Same as math_MULL_ACC, except that the multiply is always signed and
+   widening, the product is doubled before being added to or subtracted
+   from the accumulated value, and all operations saturate.  Saturation
+   residuals are returned via (sat1q, sat1n) in all cases, and via
+   (sat2q, sat2n) too in the accumulate cases.  All results are returned
+   in new temporaries.  In the no-accumulate case, *sat2q and *sat2n are
+   never instantiated, so the caller can tell this has happened. */
+static
+void math_SQDMULL_ACC ( /*OUT*/IRTemp* res,
+                        /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
+                        /*OUT*/IRTemp* sat2q, /*OUT*/IRTemp* sat2n,
+                        Bool is2, UInt size, HChar mas,
+                        IRTemp vecN, IRTemp vecM, IRTemp vecD )
+{
+   vassert(size <= 2);
+   vassert(mas == 'm' || mas == 'a' || mas == 's');
+   /* Compute
+         sat1q = vecN.D[is2] *sq vecM.D[is2] *q 2
+         sat1n = vecN.D[is2] *s  vecM.D[is2] *  2
+      IOW take either the low or high halves of vecN and vecM, signed widen,
+      multiply, double that, and signedly saturate.  Also compute the same
+      but without saturation.
+   */
+   vassert(sat2q && *sat2q == IRTemp_INVALID);
+   vassert(sat2n && *sat2n == IRTemp_INVALID);
+   newTempsV128_3(sat1q, sat1n, res);
+   IRTemp tq = math_BINARY_WIDENING_V128(is2, mkVecQDMULLS(size),
+                                         mkexpr(vecN), mkexpr(vecM));
+   IRTemp tn = math_BINARY_WIDENING_V128(is2, mkVecMULLS(size),
+                                         mkexpr(vecN), mkexpr(vecM));
+   assign(*sat1q, mkexpr(tq));
+   assign(*sat1n, binop(mkVecADD(size+1), mkexpr(tn), mkexpr(tn)));
+
+   /* If there is no accumulation, the final result is sat1q,
+      and there's no assignment to sat2q or sat2n. */
+   if (mas == 'm') {
+      assign(*res, mkexpr(*sat1q));
+      return;
+   }
+
+   /* Compute
+         sat2q  = vecD +sq/-sq sat1q
+         sat2n  = vecD +/-     sat1n
+         result = sat2q
+   */
+   newTempsV128_2(sat2q, sat2n);
+   assign(*sat2q, binop(mas == 'a' ? mkVecQADDS(size+1) : mkVecQSUBS(size+1),
+                        mkexpr(vecD), mkexpr(*sat1q)));
+   assign(*sat2n, binop(mas == 'a' ? mkVecADD(size+1) : mkVecSUB(size+1),
+                        mkexpr(vecD), mkexpr(*sat1n)));
+   assign(*res, mkexpr(*sat2q));
+}
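+/* Callers of math_SQDMULL_ACC are expected to compare (sat1q, sat1n), and
+   in the accumulate cases also (sat2q, sat2n), using
+   updateQCFLAGwithDifference, since saturation can occur both in the
+   doubling multiply and in the accumulating add/subtract. */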
+
+
 /* QCFLAG tracks the SIMD sticky saturation status.  Update the status
    thusly: if |qres| and |nres| hold the same value, leave QCFLAG
    unchanged.  Otherwise, set it (implicitly) to 1. */
@@ -7151,6 +7281,7 @@
 {
    /* 31 29 28    23   21 20 15     10 9 4
       01 U  11110 size 1  m  opcode 1  n d
+      Decode fields: u,size,opcode
    */
 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
    if (INSN(31,30) != BITS2(0,1)
@@ -7167,6 +7298,42 @@
    UInt dd     = INSN(4,0);
    vassert(size < 4);
 
+   if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
+      /* -------- 0,xx,00001 SQADD std4_std4_std4 -------- */
+      /* -------- 1,xx,00001 UQADD std4_std4_std4 -------- */
+      /* -------- 0,xx,00101 SQSUB std4_std4_std4 -------- */
+      /* -------- 1,xx,00101 UQSUB std4_std4_std4 -------- */
+      Bool isADD = opcode == BITS5(0,0,0,0,1);
+      Bool isU   = bitU == 1;
+      IROp qop   = Iop_INVALID;
+      IROp nop   = Iop_INVALID;
+      if (isADD) {
+         qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
+         nop = mkVecADD(size);
+      } else {
+         qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
+         nop = mkVecSUB(size);
+      }
+      IRTemp argL = newTempV128();
+      IRTemp argR = newTempV128();
+      IRTemp qres = newTempV128();
+      IRTemp nres = newTempV128();
+      assign(argL, getQReg128(nn));
+      assign(argR, getQReg128(mm));
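+      /* Only the lowest lane of a scalar result is architecturally
+         significant, so zero out the other lanes both in the value
+         written back and in the values compared for the QC flag update. */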
+      assign(qres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
+                             binop(qop, mkexpr(argL), mkexpr(argR)), size)));
+      assign(nres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
+                             binop(nop, mkexpr(argL), mkexpr(argR)), size)));
+      putQReg128(dd, mkexpr(qres));
+      updateQCFLAGwithDifference(qres, nres);
+      const HChar* nm  = isADD ? (isU ? "uqadd" : "sqadd") 
+                               : (isU ? "uqsub" : "sqsub");
+      const HChar  arr = "bhsd"[size];
+      DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
+      return True;
+   }
+
    if (size == X11 && opcode == BITS5(0,0,1,1,0)) {
       /* -------- 0,11,00110 CMGT d_d_d -------- */ // >s
       /* -------- 1,11,00110 CMHI d_d_d -------- */ // >u
@@ -7277,16 +7444,19 @@
    UInt dd     = INSN(4,0);
    vassert(size < 4);
 
-   if (bitU == 0 && opcode == BITS5(0,0,1,1,1)) {
+   if (opcode == BITS5(0,0,1,1,1)) {
       /* -------- 0,xx,00111 SQABS std4_std4 -------- */
-      IRTemp qabs = IRTemp_INVALID, nabs = IRTemp_INVALID;
-      math_SQABS(&qabs, &nabs, getQReg128(nn), size);
-      IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(qabs, size);
-      IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(nabs, size);
+      /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
+      Bool isNEG = bitU == 1;
+      IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
+      (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
+                                         getQReg128(nn), size );
+      IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(mkexpr(qresFW), size);
+      IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(mkexpr(nresFW), size);
       putQReg128(dd, mkexpr(qres));
       updateQCFLAGwithDifference(qres, nres);
       const HChar arr = "bhsd"[size];
-      DIP("%s %c%u, %c%u\n", "sqabs", arr, dd, arr, nn);
+      DIP("%s %c%u, %c%u\n", isNEG ? "sqneg" : "sqabs", arr, dd, arr, nn);
       return True;
    }
 
@@ -7776,15 +7946,16 @@
       vassert(ks >= 0 && ks <= 2);
       if (size == X11) return False;
       vassert(size <= 2);
-      Bool   isU   = bitU == 1;
-      IROp   mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
-      IROp   accOp = (ks == 1) ? mkVecADD(size+1)
-                               : (ks == 2 ? mkVecSUB(size+1) : Iop_INVALID);
-      IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp,
-                                             getQReg128(nn), getQReg128(mm));
-      IRTemp res = newTempV128();
-      assign(res, ks == 0 ? mkexpr(mul) 
-                          : binop(accOp, getQReg128(dd), mkexpr(mul)));
+      Bool   isU  = bitU == 1;
+      IRTemp vecN = newTempV128();
+      IRTemp vecM = newTempV128();
+      IRTemp vecD = newTempV128();
+      assign(vecN, getQReg128(nn));
+      assign(vecM, getQReg128(mm));
+      assign(vecD, getQReg128(dd));
+      IRTemp res = IRTemp_INVALID;
+      math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
+                    vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
       putQReg128(dd, mkexpr(res));
       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
@@ -8493,18 +8664,21 @@
       return True;
    }
 
-   if (bitU == 0 && opcode == BITS5(0,0,1,1,1)) {
+   if (opcode == BITS5(0,0,1,1,1)) {
       /* -------- 0,xx,00111 SQABS std7_std7 -------- */
+      /* -------- 1,xx,00111 SQNEG std7_std7 -------- */
       if (bitQ == 0 && size == X11) return False; // implied 1d case
-      IRTemp qabs = IRTemp_INVALID, nabs = IRTemp_INVALID;
-      math_SQABS(&qabs, &nabs, getQReg128(nn), size);
+      Bool   isNEG  = bitU == 1;
+      IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
+      (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
+                                         getQReg128(nn), size );
       IRTemp qres = newTempV128(), nres = newTempV128();
-      assign(qres, math_MAYBE_ZERO_HI64(bitQ, qabs));
-      assign(nres, math_MAYBE_ZERO_HI64(bitQ, nabs));
+      assign(qres, math_MAYBE_ZERO_HI64(bitQ, qresFW));
+      assign(nres, math_MAYBE_ZERO_HI64(bitQ, nresFW));
       putQReg128(dd, mkexpr(qres));
       updateQCFLAGwithDifference(qres, nres);
       const HChar* arr = nameArr_Q_SZ(bitQ, size);
-      DIP("%s %s.%s, %s.%s\n", "sqabs",
+      DIP("%s %s.%s, %s.%s\n", isNEG ? "sqneg" : "sqabs",
           nameQReg128(dd), arr, nameQReg128(nn), arr);
       return True;
    }
@@ -8840,19 +9014,14 @@
             vassert(0);
       }
       vassert(mm < 32 && ix < 16);
-      IROp   mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
-      IROp   accOp = (ks == 1) ? mkVecADD(size+1) 
-                               : (ks == 2 ? mkVecSUB(size+1) : Iop_INVALID);
+      IRTemp vecN  = newTempV128();
       IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
       IRTemp vecD  = newTempV128();
-      IRTemp vecN  = newTempV128();
-      assign(vecD, getQReg128(dd));
       assign(vecN, getQReg128(nn));
-      IRTemp mul   = math_BINARY_WIDENING_V128(is2, mulOp, 
-                                             mkexpr(vecN), mkexpr(vecM));
-      IRTemp res = newTempV128();
-      assign(res, ks == 0 ? mkexpr(mul) 
-                          : binop(accOp, getQReg128(dd), mkexpr(mul)));
+      assign(vecD, getQReg128(dd));
+      IRTemp res = IRTemp_INVALID;
+      math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
+                    vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
       putQReg128(dd, mkexpr(res));
       const HChar* nm        = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
@@ -8865,6 +9034,64 @@
       return True;
    }
 
+   if (bitU == 0 
+       && (opcode == BITS4(1,0,1,1)
+           || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
+      /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
+      /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
+      /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
+      /* Widens, and size refers to the narrowed lanes. */
+      UInt ks = 3;
+      switch (opcode) {
+         case BITS4(1,0,1,1): ks = 0; break;
+         case BITS4(0,0,1,1): ks = 1; break;
+         case BITS4(0,1,1,1): ks = 2; break;
+         default: vassert(0);
+      }
+      vassert(ks >= 0 && ks <= 2);
+      Bool is2 = bitQ == 1;
+      UInt mm  = 32; // invalid
+      UInt ix  = 16; // invalid
+      switch (size) {
+         case X00:
+            return False; // h_b_b[] case is not allowed
+         case X01:
+            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
+         case X10:
+            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
+         case X11:
+            return False; // q_d_d[] case is not allowed
+         default:
+            vassert(0);
+      }
+      vassert(mm < 32 && ix < 16);
+      IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
+      vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
+      newTempsV128_2(&vecN, &vecD);
+      assign(vecN, getQReg128(nn));
+      IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
+      assign(vecD, getQReg128(dd));
+      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
+                       is2, size, "mas"[ks],
+                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
+      putQReg128(dd, mkexpr(res));
+      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
+      updateQCFLAGwithDifference(sat1q, sat1n);
+      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
+         updateQCFLAGwithDifference(sat2q, sat2n);
+      }
+      const HChar* nm        = ks == 0 ? "sqdmull"
+                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
+      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
+      HChar ch               = size == X01 ? 'h' : 's';
+      DIP("%s%s %s.%s, %s.%s, %s.%c[%u]\n",
+          nm, is2 ? "2" : "",
+          nameQReg128(dd), arrWide,
+          nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
+      return True;
+   }
+
    return False;
 #  undef INSN
 }
diff --git a/priv/guest_arm_toIR.c b/priv/guest_arm_toIR.c
index c0bf0a7..7a53059 100644
--- a/priv/guest_arm_toIR.c
+++ b/priv/guest_arm_toIR.c
@@ -5132,7 +5132,7 @@
             case 0: case 3:
                return False;
             case 1:
-               op = Iop_QDMulLong16Sx4;
+               op = Iop_QDMull16Sx4;
                cmp = Iop_CmpEQ16x4;
                add = P ? Iop_QSub32Sx4 : Iop_QAdd32Sx4;
                op2 = P ? Iop_Sub32x4 : Iop_Add32x4;
@@ -5141,7 +5141,7 @@
                imm = (imm << 32) | imm;
                break;
             case 2:
-               op = Iop_QDMulLong32Sx2;
+               op = Iop_QDMull32Sx2;
                cmp = Iop_CmpEQ32x2;
                add = P ? Iop_QSub64Sx2 : Iop_QAdd64Sx2;
                op2 = P ? Iop_Sub64x2 : Iop_Add64x2;
@@ -5206,14 +5206,14 @@
             case 3:
                return False;
             case 1:
-               op = Iop_QDMulLong16Sx4;
+               op = Iop_QDMull16Sx4;
                op2 = Iop_CmpEQ16x4;
                imm = 1LL << 15;
                imm = (imm << 16) | imm;
                imm = (imm << 32) | imm;
                break;
             case 2:
-               op = Iop_QDMulLong32Sx2;
+               op = Iop_QDMull32Sx2;
                op2 = Iop_CmpEQ32x2;
                imm = 1LL << 31;
                imm = (imm << 32) | imm;
@@ -5454,7 +5454,7 @@
          case 3:
             return False;
          case 1:
-            op = Iop_QDMulLong16Sx4;
+            op = Iop_QDMull16Sx4;
             cmp = Iop_CmpEQ16x4;
             add = P ? Iop_QSub32Sx4 : Iop_QAdd32Sx4;
             op2 = P ? Iop_Sub32x4 : Iop_Add32x4;
@@ -5463,7 +5463,7 @@
             imm = (imm << 32) | imm;
             break;
          case 2:
-            op = Iop_QDMulLong32Sx2;
+            op = Iop_QDMull32Sx2;
             cmp = Iop_CmpEQ32x2;
             add = P ? Iop_QSub64Sx2 : Iop_QAdd64Sx2;
             op2 = P ? Iop_Sub64x2 : Iop_Add64x2;
@@ -5667,14 +5667,14 @@
          case 3:
             return False;
          case 1:
-            op = Iop_QDMulLong16Sx4;
+            op = Iop_QDMull16Sx4;
             op2 = Iop_CmpEQ16x4;
             imm = 1LL << 15;
             imm = (imm << 16) | imm;
             imm = (imm << 32) | imm;
             break;
          case 2:
-            op = Iop_QDMulLong32Sx2;
+            op = Iop_QDMull32Sx2;
             op2 = Iop_CmpEQ32x2;
             imm = 1LL << 31;
             imm = (imm << 32) | imm;
diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c
index b55be22..c636982 100644
--- a/priv/host_arm64_defs.c
+++ b/priv/host_arm64_defs.c
@@ -854,97 +854,99 @@
 static void showARM64VecBinOp(/*OUT*/const HChar** nm,
                               /*OUT*/const HChar** ar, ARM64VecBinOp op ) {
    switch (op) {
-      case ARM64vecb_ADD64x2:    *nm = "add  ";  *ar = "2d";   return;
-      case ARM64vecb_ADD32x4:    *nm = "add  ";  *ar = "4s";   return;
-      case ARM64vecb_ADD16x8:    *nm = "add  ";  *ar = "8h";   return;
-      case ARM64vecb_ADD8x16:    *nm = "add  ";  *ar = "16b";  return;
-      case ARM64vecb_SUB64x2:    *nm = "sub  ";  *ar = "2d";   return;
-      case ARM64vecb_SUB32x4:    *nm = "sub  ";  *ar = "4s";   return;
-      case ARM64vecb_SUB16x8:    *nm = "sub  ";  *ar = "8h";   return;
-      case ARM64vecb_SUB8x16:    *nm = "sub  ";  *ar = "16b";  return;
-      case ARM64vecb_MUL32x4:    *nm = "mul  ";  *ar = "4s";   return;
-      case ARM64vecb_MUL16x8:    *nm = "mul  ";  *ar = "8h";   return;
-      case ARM64vecb_MUL8x16:    *nm = "mul  ";  *ar = "16b";  return;
-      case ARM64vecb_FADD64x2:   *nm = "fadd ";  *ar = "2d";   return;
-      case ARM64vecb_FSUB64x2:   *nm = "fsub ";  *ar = "2d";   return;
-      case ARM64vecb_FMUL64x2:   *nm = "fmul ";  *ar = "2d";   return;
-      case ARM64vecb_FDIV64x2:   *nm = "fdiv ";  *ar = "2d";   return;
-      case ARM64vecb_FADD32x4:   *nm = "fadd ";  *ar = "4s";   return;
-      case ARM64vecb_FSUB32x4:   *nm = "fsub ";  *ar = "4s";   return;
-      case ARM64vecb_FMUL32x4:   *nm = "fmul ";  *ar = "4s";   return;
-      case ARM64vecb_FDIV32x4:   *nm = "fdiv ";  *ar = "4s";   return;
-      case ARM64vecb_UMAX32x4:   *nm = "umax ";  *ar = "4s";   return;
-      case ARM64vecb_UMAX16x8:   *nm = "umax ";  *ar = "8h";   return;
-      case ARM64vecb_UMAX8x16:   *nm = "umax ";  *ar = "16b";  return;
-      case ARM64vecb_UMIN32x4:   *nm = "umin ";  *ar = "4s";   return;
-      case ARM64vecb_UMIN16x8:   *nm = "umin ";  *ar = "8h";   return;
-      case ARM64vecb_UMIN8x16:   *nm = "umin ";  *ar = "16b";  return;
-      case ARM64vecb_SMAX32x4:   *nm = "smax ";  *ar = "4s";   return;
-      case ARM64vecb_SMAX16x8:   *nm = "smax ";  *ar = "8h";   return;
-      case ARM64vecb_SMAX8x16:   *nm = "smax ";  *ar = "16b";  return;
-      case ARM64vecb_SMIN32x4:   *nm = "smin ";  *ar = "4s";   return;
-      case ARM64vecb_SMIN16x8:   *nm = "smin ";  *ar = "8h";   return;
-      case ARM64vecb_SMIN8x16:   *nm = "smin ";  *ar = "16b";  return;
-      case ARM64vecb_AND:        *nm = "and  ";  *ar = "16b";  return;
-      case ARM64vecb_ORR:        *nm = "orr  ";  *ar = "16b";  return;
-      case ARM64vecb_XOR:        *nm = "eor  ";  *ar = "16b";  return;
-      case ARM64vecb_CMEQ64x2:   *nm = "cmeq ";  *ar = "2d";   return;
-      case ARM64vecb_CMEQ32x4:   *nm = "cmeq ";  *ar = "4s";   return;
-      case ARM64vecb_CMEQ16x8:   *nm = "cmeq ";  *ar = "8h";   return;
-      case ARM64vecb_CMEQ8x16:   *nm = "cmeq ";  *ar = "16b";  return;
-      case ARM64vecb_CMHI64x2:   *nm = "cmhi ";  *ar = "2d";   return;
-      case ARM64vecb_CMHI32x4:   *nm = "cmhi ";  *ar = "4s";   return;
-      case ARM64vecb_CMHI16x8:   *nm = "cmhi ";  *ar = "8h";   return;
-      case ARM64vecb_CMHI8x16:   *nm = "cmhi ";  *ar = "16b";  return;
-      case ARM64vecb_CMGT64x2:   *nm = "cmgt ";  *ar = "2d";   return;
-      case ARM64vecb_CMGT32x4:   *nm = "cmgt ";  *ar = "4s";   return;
-      case ARM64vecb_CMGT16x8:   *nm = "cmgt ";  *ar = "8h";   return;
-      case ARM64vecb_CMGT8x16:   *nm = "cmgt ";  *ar = "16b";  return;
-      case ARM64vecb_FCMEQ64x2:  *nm = "fcmeq";  *ar = "2d";   return;
-      case ARM64vecb_FCMEQ32x4:  *nm = "fcmeq";  *ar = "4s";   return;
-      case ARM64vecb_FCMGE64x2:  *nm = "fcmge";  *ar = "2d";   return;
-      case ARM64vecb_FCMGE32x4:  *nm = "fcmge";  *ar = "4s";   return;
-      case ARM64vecb_FCMGT64x2:  *nm = "fcmgt";  *ar = "2d";   return;
-      case ARM64vecb_FCMGT32x4:  *nm = "fcmgt";  *ar = "4s";   return;
-      case ARM64vecb_TBL1:       *nm = "tbl  ";  *ar = "16b";  return;
-      case ARM64vecb_UZP164x2:   *nm = "uzp1 ";  *ar = "2d";   return;
-      case ARM64vecb_UZP132x4:   *nm = "uzp1 ";  *ar = "4s";   return;
-      case ARM64vecb_UZP116x8:   *nm = "uzp1 ";  *ar = "8h";   return;
-      case ARM64vecb_UZP18x16:   *nm = "uzp1 ";  *ar = "16b";  return;
-      case ARM64vecb_UZP264x2:   *nm = "uzp2 ";  *ar = "2d";   return;
-      case ARM64vecb_UZP232x4:   *nm = "uzp2 ";  *ar = "4s";   return;
-      case ARM64vecb_UZP216x8:   *nm = "uzp2 ";  *ar = "8h";   return;
-      case ARM64vecb_UZP28x16:   *nm = "uzp2 ";  *ar = "16b";  return;
-      case ARM64vecb_ZIP132x4:   *nm = "zip1 ";  *ar = "4s";   return;
-      case ARM64vecb_ZIP116x8:   *nm = "zip1 ";  *ar = "8h";   return;
-      case ARM64vecb_ZIP18x16:   *nm = "zip1 ";  *ar = "16b";  return;
-      case ARM64vecb_ZIP232x4:   *nm = "zip2 ";  *ar = "4s";   return;
-      case ARM64vecb_ZIP216x8:   *nm = "zip2 ";  *ar = "8h";   return;
-      case ARM64vecb_ZIP28x16:   *nm = "zip2 ";  *ar = "16b";  return;
-      case ARM64vecb_PMUL8x16:   *nm = "pmul ";  *ar = "16b";  return;
-      case ARM64vecb_PMULL8x8:   *nm = "pmull";  *ar = "8hbb"; return;
-      case ARM64vecb_UMULL2DSS:  *nm = "umull";  *ar = "2dss"; return;
-      case ARM64vecb_UMULL4SHH:  *nm = "umull";  *ar = "4shh"; return;
-      case ARM64vecb_UMULL8HBB:  *nm = "umull";  *ar = "8hbb"; return;
-      case ARM64vecb_SMULL2DSS:  *nm = "smull";  *ar = "2dss"; return;
-      case ARM64vecb_SMULL4SHH:  *nm = "smull";  *ar = "4shh"; return;
-      case ARM64vecb_SMULL8HBB:  *nm = "smull";  *ar = "8hbb"; return;
-      case ARM64vecb_SQADD64x2:  *nm = "sqadd";  *ar = "2d";   return;
-      case ARM64vecb_SQADD32x4:  *nm = "sqadd";  *ar = "4s";   return;
-      case ARM64vecb_SQADD16x8:  *nm = "sqadd";  *ar = "8h";   return;
-      case ARM64vecb_SQADD8x16:  *nm = "sqadd";  *ar = "16b";  return;
-      case ARM64vecb_UQADD64x2:  *nm = "uqadd";  *ar = "2d";   return;
-      case ARM64vecb_UQADD32x4:  *nm = "uqadd";  *ar = "4s";   return;
-      case ARM64vecb_UQADD16x8:  *nm = "uqadd";  *ar = "8h";   return;
-      case ARM64vecb_UQADD8x16:  *nm = "uqadd";  *ar = "16b";  return;
-      case ARM64vecb_SQSUB64x2:  *nm = "sqsub";  *ar = "2d";   return;
-      case ARM64vecb_SQSUB32x4:  *nm = "sqsub";  *ar = "4s";   return;
-      case ARM64vecb_SQSUB16x8:  *nm = "sqsub";  *ar = "8h";   return;
-      case ARM64vecb_SQSUB8x16:  *nm = "sqsub";  *ar = "16b";  return;
-      case ARM64vecb_UQSUB64x2:  *nm = "uqsub";  *ar = "2d";   return;
-      case ARM64vecb_UQSUB32x4:  *nm = "uqsub";  *ar = "4s";   return;
-      case ARM64vecb_UQSUB16x8:  *nm = "uqsub";  *ar = "8h";   return;
-      case ARM64vecb_UQSUB8x16:  *nm = "uqsub";  *ar = "16b";  return;
+      case ARM64vecb_ADD64x2:      *nm = "add  ";    *ar = "2d";   return;
+      case ARM64vecb_ADD32x4:      *nm = "add  ";    *ar = "4s";   return;
+      case ARM64vecb_ADD16x8:      *nm = "add  ";    *ar = "8h";   return;
+      case ARM64vecb_ADD8x16:      *nm = "add  ";    *ar = "16b";  return;
+      case ARM64vecb_SUB64x2:      *nm = "sub  ";    *ar = "2d";   return;
+      case ARM64vecb_SUB32x4:      *nm = "sub  ";    *ar = "4s";   return;
+      case ARM64vecb_SUB16x8:      *nm = "sub  ";    *ar = "8h";   return;
+      case ARM64vecb_SUB8x16:      *nm = "sub  ";    *ar = "16b";  return;
+      case ARM64vecb_MUL32x4:      *nm = "mul  ";    *ar = "4s";   return;
+      case ARM64vecb_MUL16x8:      *nm = "mul  ";    *ar = "8h";   return;
+      case ARM64vecb_MUL8x16:      *nm = "mul  ";    *ar = "16b";  return;
+      case ARM64vecb_FADD64x2:     *nm = "fadd ";    *ar = "2d";   return;
+      case ARM64vecb_FSUB64x2:     *nm = "fsub ";    *ar = "2d";   return;
+      case ARM64vecb_FMUL64x2:     *nm = "fmul ";    *ar = "2d";   return;
+      case ARM64vecb_FDIV64x2:     *nm = "fdiv ";    *ar = "2d";   return;
+      case ARM64vecb_FADD32x4:     *nm = "fadd ";    *ar = "4s";   return;
+      case ARM64vecb_FSUB32x4:     *nm = "fsub ";    *ar = "4s";   return;
+      case ARM64vecb_FMUL32x4:     *nm = "fmul ";    *ar = "4s";   return;
+      case ARM64vecb_FDIV32x4:     *nm = "fdiv ";    *ar = "4s";   return;
+      case ARM64vecb_UMAX32x4:     *nm = "umax ";    *ar = "4s";   return;
+      case ARM64vecb_UMAX16x8:     *nm = "umax ";    *ar = "8h";   return;
+      case ARM64vecb_UMAX8x16:     *nm = "umax ";    *ar = "16b";  return;
+      case ARM64vecb_UMIN32x4:     *nm = "umin ";    *ar = "4s";   return;
+      case ARM64vecb_UMIN16x8:     *nm = "umin ";    *ar = "8h";   return;
+      case ARM64vecb_UMIN8x16:     *nm = "umin ";    *ar = "16b";  return;
+      case ARM64vecb_SMAX32x4:     *nm = "smax ";    *ar = "4s";   return;
+      case ARM64vecb_SMAX16x8:     *nm = "smax ";    *ar = "8h";   return;
+      case ARM64vecb_SMAX8x16:     *nm = "smax ";    *ar = "16b";  return;
+      case ARM64vecb_SMIN32x4:     *nm = "smin ";    *ar = "4s";   return;
+      case ARM64vecb_SMIN16x8:     *nm = "smin ";    *ar = "8h";   return;
+      case ARM64vecb_SMIN8x16:     *nm = "smin ";    *ar = "16b";  return;
+      case ARM64vecb_AND:          *nm = "and  ";    *ar = "16b";  return;
+      case ARM64vecb_ORR:          *nm = "orr  ";    *ar = "16b";  return;
+      case ARM64vecb_XOR:          *nm = "eor  ";    *ar = "16b";  return;
+      case ARM64vecb_CMEQ64x2:     *nm = "cmeq ";    *ar = "2d";   return;
+      case ARM64vecb_CMEQ32x4:     *nm = "cmeq ";    *ar = "4s";   return;
+      case ARM64vecb_CMEQ16x8:     *nm = "cmeq ";    *ar = "8h";   return;
+      case ARM64vecb_CMEQ8x16:     *nm = "cmeq ";    *ar = "16b";  return;
+      case ARM64vecb_CMHI64x2:     *nm = "cmhi ";    *ar = "2d";   return;
+      case ARM64vecb_CMHI32x4:     *nm = "cmhi ";    *ar = "4s";   return;
+      case ARM64vecb_CMHI16x8:     *nm = "cmhi ";    *ar = "8h";   return;
+      case ARM64vecb_CMHI8x16:     *nm = "cmhi ";    *ar = "16b";  return;
+      case ARM64vecb_CMGT64x2:     *nm = "cmgt ";    *ar = "2d";   return;
+      case ARM64vecb_CMGT32x4:     *nm = "cmgt ";    *ar = "4s";   return;
+      case ARM64vecb_CMGT16x8:     *nm = "cmgt ";    *ar = "8h";   return;
+      case ARM64vecb_CMGT8x16:     *nm = "cmgt ";    *ar = "16b";  return;
+      case ARM64vecb_FCMEQ64x2:    *nm = "fcmeq";    *ar = "2d";   return;
+      case ARM64vecb_FCMEQ32x4:    *nm = "fcmeq";    *ar = "4s";   return;
+      case ARM64vecb_FCMGE64x2:    *nm = "fcmge";    *ar = "2d";   return;
+      case ARM64vecb_FCMGE32x4:    *nm = "fcmge";    *ar = "4s";   return;
+      case ARM64vecb_FCMGT64x2:    *nm = "fcmgt";    *ar = "2d";   return;
+      case ARM64vecb_FCMGT32x4:    *nm = "fcmgt";    *ar = "4s";   return;
+      case ARM64vecb_TBL1:         *nm = "tbl  ";    *ar = "16b";  return;
+      case ARM64vecb_UZP164x2:     *nm = "uzp1 ";    *ar = "2d";   return;
+      case ARM64vecb_UZP132x4:     *nm = "uzp1 ";    *ar = "4s";   return;
+      case ARM64vecb_UZP116x8:     *nm = "uzp1 ";    *ar = "8h";   return;
+      case ARM64vecb_UZP18x16:     *nm = "uzp1 ";    *ar = "16b";  return;
+      case ARM64vecb_UZP264x2:     *nm = "uzp2 ";    *ar = "2d";   return;
+      case ARM64vecb_UZP232x4:     *nm = "uzp2 ";    *ar = "4s";   return;
+      case ARM64vecb_UZP216x8:     *nm = "uzp2 ";    *ar = "8h";   return;
+      case ARM64vecb_UZP28x16:     *nm = "uzp2 ";    *ar = "16b";  return;
+      case ARM64vecb_ZIP132x4:     *nm = "zip1 ";    *ar = "4s";   return;
+      case ARM64vecb_ZIP116x8:     *nm = "zip1 ";    *ar = "8h";   return;
+      case ARM64vecb_ZIP18x16:     *nm = "zip1 ";    *ar = "16b";  return;
+      case ARM64vecb_ZIP232x4:     *nm = "zip2 ";    *ar = "4s";   return;
+      case ARM64vecb_ZIP216x8:     *nm = "zip2 ";    *ar = "8h";   return;
+      case ARM64vecb_ZIP28x16:     *nm = "zip2 ";    *ar = "16b";  return;
+      case ARM64vecb_PMUL8x16:     *nm = "pmul ";    *ar = "16b";  return;
+      case ARM64vecb_PMULL8x8:     *nm = "pmull";    *ar = "8hbb"; return;
+      case ARM64vecb_UMULL2DSS:    *nm = "umull";    *ar = "2dss"; return;
+      case ARM64vecb_UMULL4SHH:    *nm = "umull";    *ar = "4shh"; return;
+      case ARM64vecb_UMULL8HBB:    *nm = "umull";    *ar = "8hbb"; return;
+      case ARM64vecb_SMULL2DSS:    *nm = "smull";    *ar = "2dss"; return;
+      case ARM64vecb_SMULL4SHH:    *nm = "smull";    *ar = "4shh"; return;
+      case ARM64vecb_SMULL8HBB:    *nm = "smull";    *ar = "8hbb"; return;
+      case ARM64vecb_SQADD64x2:    *nm = "sqadd";    *ar = "2d";   return;
+      case ARM64vecb_SQADD32x4:    *nm = "sqadd";    *ar = "4s";   return;
+      case ARM64vecb_SQADD16x8:    *nm = "sqadd";    *ar = "8h";   return;
+      case ARM64vecb_SQADD8x16:    *nm = "sqadd";    *ar = "16b";  return;
+      case ARM64vecb_UQADD64x2:    *nm = "uqadd";    *ar = "2d";   return;
+      case ARM64vecb_UQADD32x4:    *nm = "uqadd";    *ar = "4s";   return;
+      case ARM64vecb_UQADD16x8:    *nm = "uqadd";    *ar = "8h";   return;
+      case ARM64vecb_UQADD8x16:    *nm = "uqadd";    *ar = "16b";  return;
+      case ARM64vecb_SQSUB64x2:    *nm = "sqsub";    *ar = "2d";   return;
+      case ARM64vecb_SQSUB32x4:    *nm = "sqsub";    *ar = "4s";   return;
+      case ARM64vecb_SQSUB16x8:    *nm = "sqsub";    *ar = "8h";   return;
+      case ARM64vecb_SQSUB8x16:    *nm = "sqsub";    *ar = "16b";  return;
+      case ARM64vecb_UQSUB64x2:    *nm = "uqsub";    *ar = "2d";   return;
+      case ARM64vecb_UQSUB32x4:    *nm = "uqsub";    *ar = "4s";   return;
+      case ARM64vecb_UQSUB16x8:    *nm = "uqsub";    *ar = "8h";   return;
+      case ARM64vecb_UQSUB8x16:    *nm = "uqsub";    *ar = "16b";  return;
+      case ARM64vecb_SQDMULL2DSS:  *nm = "sqdmull";  *ar = "2dss"; return;
+      case ARM64vecb_SQDMULL4SHH:  *nm = "sqdmull";  *ar = "4shh"; return;
       default: vpanic("showARM64VecBinOp");
    }
 }
@@ -3507,6 +3509,7 @@
 #define X101110  BITS8(0,0, 1,0,1,1,1,0)
 #define X110000  BITS8(0,0, 1,1,0,0,0,0)
 #define X110001  BITS8(0,0, 1,1,0,0,0,1)
+#define X110100  BITS8(0,0, 1,1,0,1,0,0)
 #define X110101  BITS8(0,0, 1,1,0,1,0,1)
 #define X110111  BITS8(0,0, 1,1,0,1,1,1)
 #define X111000  BITS8(0,0, 1,1,1,0,0,0)
@@ -5189,6 +5192,9 @@
             011 01110 10 1 m  001011 n d   UQSUB Vd.4s,  Vn.4s,  Vm.4s
             011 01110 01 1 m  001011 n d   UQSUB Vd.8h,  Vn.8h,  Vm.8h
             011 01110 00 1 m  001011 n d   UQSUB Vd.16b, Vn.16b, Vm.16b
+
+            000 01110 10 1 m  110100 n d   SQDMULL Vd.2d, Vn.2s, Vm.2s
+            000 01110 01 1 m  110100 n d   SQDMULL Vd.4s, Vn.4h, Vm.4h
          */
          UInt vD = qregNo(i->ARM64in.VBinV.dst);
          UInt vN = qregNo(i->ARM64in.VBinV.argL);
@@ -5492,6 +5498,13 @@
                *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X001011, vN, vD);
                break;
 
+            case ARM64vecb_SQDMULL2DSS:
+               *p++ = X_3_8_5_6_5_5(X000, X01110101, vM, X110100, vN, vD);
+               break;
+            case ARM64vecb_SQDMULL4SHH:
+               *p++ = X_3_8_5_6_5_5(X000, X01110011, vM, X110100, vN, vD);
+               break;
+
             default:
                goto bad;
          }
diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h
index 3795c27..bce1911 100644
--- a/priv/host_arm64_defs.h
+++ b/priv/host_arm64_defs.h
@@ -350,14 +350,17 @@
       ARM64vecb_UMULL4SHH,   ARM64vecb_UMULL8HBB,
                              ARM64vecb_SMULL2DSS,
       ARM64vecb_SMULL4SHH,   ARM64vecb_SMULL8HBB,
-      ARM64vecb_SQADD64x2,     ARM64vecb_SQADD32x4,
-      ARM64vecb_SQADD16x8,     ARM64vecb_SQADD8x16,
-      ARM64vecb_UQADD64x2,     ARM64vecb_UQADD32x4,
-      ARM64vecb_UQADD16x8,     ARM64vecb_UQADD8x16,
-      ARM64vecb_SQSUB64x2,     ARM64vecb_SQSUB32x4,
-      ARM64vecb_SQSUB16x8,     ARM64vecb_SQSUB8x16,
-      ARM64vecb_UQSUB64x2,     ARM64vecb_UQSUB32x4,
-      ARM64vecb_UQSUB16x8,     ARM64vecb_UQSUB8x16,
+      ARM64vecb_SQADD64x2,   ARM64vecb_SQADD32x4,
+      ARM64vecb_SQADD16x8,   ARM64vecb_SQADD8x16,
+      ARM64vecb_UQADD64x2,   ARM64vecb_UQADD32x4,
+      ARM64vecb_UQADD16x8,   ARM64vecb_UQADD8x16,
+      ARM64vecb_SQSUB64x2,   ARM64vecb_SQSUB32x4,
+      ARM64vecb_SQSUB16x8,   ARM64vecb_SQSUB8x16,
+      ARM64vecb_UQSUB64x2,   ARM64vecb_UQSUB32x4,
+      ARM64vecb_UQSUB16x8,   ARM64vecb_UQSUB8x16,
+                             ARM64vecb_SQDMULL2DSS,
+      ARM64vecb_SQDMULL4SHH,
+
       ARM64vecb_INVALID
    }
    ARM64VecBinOp;
diff --git a/priv/host_arm64_isel.c b/priv/host_arm64_isel.c
index eafd8e3..97cfde0 100644
--- a/priv/host_arm64_isel.c
+++ b/priv/host_arm64_isel.c
@@ -5658,6 +5658,8 @@
          case Iop_Mull32Sx2:
          case Iop_Mull16Sx4:
          case Iop_Mull8Sx8:
+         case Iop_QDMull32Sx2:
+         case Iop_QDMull16Sx4:
          {
             HReg iSrcL = iselIntExpr_R(env, e->Iex.Binop.arg1);
             HReg iSrcR = iselIntExpr_R(env, e->Iex.Binop.arg2);
@@ -5666,13 +5668,15 @@
             HReg dst   = newVRegV(env);
             ARM64VecBinOp op = ARM64vecb_INVALID;
             switch (e->Iex.Binop.op) {
-               case Iop_PolynomialMull8x8: op = ARM64vecb_PMULL8x8;  break;
-               case Iop_Mull32Ux2:         op = ARM64vecb_UMULL2DSS; break;
-               case Iop_Mull16Ux4:         op = ARM64vecb_UMULL4SHH; break;
-               case Iop_Mull8Ux8:          op = ARM64vecb_UMULL8HBB; break;
-               case Iop_Mull32Sx2:         op = ARM64vecb_SMULL2DSS; break;
-               case Iop_Mull16Sx4:         op = ARM64vecb_SMULL4SHH; break;
-               case Iop_Mull8Sx8:          op = ARM64vecb_SMULL8HBB; break;
+               case Iop_PolynomialMull8x8: op = ARM64vecb_PMULL8x8;    break;
+               case Iop_Mull32Ux2:         op = ARM64vecb_UMULL2DSS;   break;
+               case Iop_Mull16Ux4:         op = ARM64vecb_UMULL4SHH;   break;
+               case Iop_Mull8Ux8:          op = ARM64vecb_UMULL8HBB;   break;
+               case Iop_Mull32Sx2:         op = ARM64vecb_SMULL2DSS;   break;
+               case Iop_Mull16Sx4:         op = ARM64vecb_SMULL4SHH;   break;
+               case Iop_Mull8Sx8:          op = ARM64vecb_SMULL8HBB;   break;
+               case Iop_QDMull32Sx2:       op = ARM64vecb_SQDMULL2DSS; break;
+               case Iop_QDMull16Sx4:       op = ARM64vecb_SQDMULL4SHH; break;
                default: vassert(0);
             }
             addInstr(env, ARM64Instr_VQfromXX(vSrcL, iSrcL, iSrcL));
diff --git a/priv/host_arm_isel.c b/priv/host_arm_isel.c
index 75723b3..537a355 100644
--- a/priv/host_arm_isel.c
+++ b/priv/host_arm_isel.c
@@ -5118,15 +5118,15 @@
             return res;
          }
 
-         case Iop_QDMulLong16Sx4:
-         case Iop_QDMulLong32Sx2: {
+         case Iop_QDMull16Sx4:
+         case Iop_QDMull32Sx2: {
             HReg res = newVRegV(env);
             HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
             HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
             UInt size = 0;
             switch(e->Iex.Binop.op) {
-               case Iop_QDMulLong16Sx4: size = 1; break;
-               case Iop_QDMulLong32Sx2: size = 2; break;
+               case Iop_QDMull16Sx4: size = 1; break;
+               case Iop_QDMull32Sx2: size = 2; break;
                default: vassert(0);
             }
             addInstr(env, ARMInstr_NBinary(ARMneon_VQDMULL,
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index 27ccf8f..71c8f28 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -470,8 +470,8 @@
       case Iop_QDMulHi32Sx2: vex_printf("QDMulHi32Sx2"); return;
       case Iop_QRDMulHi16Sx4: vex_printf("QRDMulHi16Sx4"); return;
       case Iop_QRDMulHi32Sx2: vex_printf("QRDMulHi32Sx2"); return;
-      case Iop_QDMulLong16Sx4: vex_printf("QDMulLong16Sx4"); return;
-      case Iop_QDMulLong32Sx2: vex_printf("QDMulLong32Sx2"); return;
+      case Iop_QDMull16Sx4: vex_printf("QDMull16Sx4"); return;
+      case Iop_QDMull32Sx2: vex_printf("QDMull32Sx2"); return;
       case Iop_Avg8Ux8: vex_printf("Avg8Ux8"); return;
       case Iop_Avg16Ux4: vex_printf("Avg16Ux4"); return;
       case Iop_Max8Sx8: vex_printf("Max8Sx8"); return;
@@ -2979,7 +2979,7 @@
       case Iop_BCDAdd:
       case Iop_BCDSub:
          TERNARY(Ity_V128,Ity_V128, Ity_I8, Ity_V128);
-      case Iop_QDMulLong16Sx4: case Iop_QDMulLong32Sx2:
+      case Iop_QDMull16Sx4: case Iop_QDMull32Sx2:
          BINARY(Ity_I64, Ity_I64, Ity_V128);
 
       /* s390 specific */
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index a4fb614..843ca09 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -1411,20 +1411,24 @@
       /* (widening signed/unsigned of even lanes, with lowest lane=zero) */
       Iop_MullEven8Ux16, Iop_MullEven16Ux8, Iop_MullEven32Ux4,
       Iop_MullEven8Sx16, Iop_MullEven16Sx8, Iop_MullEven32Sx4,
-      /* FIXME: document these */
+
+      /* Widening multiplies, all of the form (I64, I64) -> V128 */
       Iop_Mull8Ux8, Iop_Mull8Sx8,
       Iop_Mull16Ux4, Iop_Mull16Sx4,
       Iop_Mull32Ux2, Iop_Mull32Sx2,
+
+      /* Signed doubling saturating widening multiplies, (I64, I64) -> V128 */
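+      /* (These saturate only when both narrow lane inputs are the most
+         negative value, since that is the only case in which the doubled
+         product overflows the wide lane.) */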
+      Iop_QDMull16Sx4, Iop_QDMull32Sx2,
+
       /* Vector Saturating Doubling Multiply Returning High Half and
          Vector Saturating Rounding Doubling Multiply Returning High Half */
-      /* These IROp's multiply corresponding elements in two vectors, double
+      /* These IROps multiply corresponding elements in two vectors, double
          the results, and place the most significant half of the final results
          in the destination vector. The results are truncated or rounded. If
          any of the results overflow, they are saturated. */
       Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4,
       Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4,
-      /* Doubling saturating multiplication (long) (I64, I64) -> V128 */
-      Iop_QDMulLong16Sx4, Iop_QDMulLong32Sx2,
+
       /* Polynomial multiplication treats its arguments as
          coefficients of polynomials over {0, 1}. */
       Iop_PolynomialMul8x16, /* (V128, V128) -> V128 */