arm64: implement shll #imm, shrn #imm, rshrn #imm, and
{smlal,umlal,smlsl,umlsl,smull,umull} (by element)


git-svn-id: svn://svn.valgrind.org/vex/trunk@2898 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index c084abe..8c4858f 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -618,6 +618,22 @@
    return ops[size];
 }
 
+static IROp mkVecINTERLEAVELO ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8,
+          Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 };
+   vassert(size < 4);
+   return ops[size];
+}
+
+static IROp mkVecINTERLEAVEHI ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8,
+          Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 };
+   vassert(size < 4);
+   return ops[size];
+}
+
 static IROp mkVecMAXU ( UInt size ) {
    const IROp ops[4]
       = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 };
@@ -646,6 +662,27 @@
    return ops[size];
 }
 
+static IROp mkVecMUL ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, Iop_INVALID };
+   vassert(size < 3);
+   return ops[size];
+}
+
+static IROp mkVecMULLU ( UInt sizeNarrow ) {
+   const IROp ops[4]
+      = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2, Iop_INVALID };
+   vassert(sizeNarrow < 3);
+   return ops[sizeNarrow];
+}
+
+static IROp mkVecMULLS ( UInt sizeNarrow ) {
+   const IROp ops[4]
+      = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2, Iop_INVALID };
+   vassert(sizeNarrow < 3);
+   return ops[sizeNarrow];
+}
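+
+/* For the two MULL helpers above, |sizeNarrow| names the narrow
+   (source) lane size; each result lane is twice that width, so a
+   V128 result is produced from 64 bits of each source operand. */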
+
 static IRExpr* mkU ( IRType ty, ULong imm ) {
    switch (ty) {
       case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
@@ -5899,6 +5936,130 @@
 }
 
 
+/* Return a temp which holds the vector dup of the lane of width
+   (1 << size) bytes obtained from src[laneNo]. */
+static
+IRTemp math_DUP_VEC_ELEM ( IRExpr* src, UInt size, UInt laneNo )
+{
+   vassert(size <= 3);
+   /* Normalise |laneNo| into a byte index |ix|, of the form
+      x000 for D, xx00 for S, xxx0 for H, and xxxx for B lanes.
+      This puts the bits we need to inspect at fixed offsets
+      regardless of the value of |size|.
+   */
+   UInt ix = laneNo << size;
+   vassert(ix <= 15);
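+   /* Example: size == 1 (H lanes), laneNo == 5 gives ix == 10 == 0b1010.
+      The loop below then applies Iop_InterleaveHI64x2, then
+      Iop_CatEvenLanes32x4, then Iop_CatOddLanes16x8, each step halving
+      the set of candidate lanes until lane 5 fills the whole vector. */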
+   IROp ops[4] = { Iop_INVALID, Iop_INVALID, Iop_INVALID, Iop_INVALID };
+   switch (size) {
+      case 0: /* B */
+         ops[0] = (ix & 1) ? Iop_CatOddLanes8x16 : Iop_CatEvenLanes8x16;
+         /* fallthrough */
+      case 1: /* H */
+         ops[1] = (ix & 2) ? Iop_CatOddLanes16x8 : Iop_CatEvenLanes16x8;
+         /* fallthrough */
+      case 2: /* S */
+         ops[2] = (ix & 4) ? Iop_CatOddLanes32x4 : Iop_CatEvenLanes32x4;
+         /* fallthrough */
+      case 3: /* D */
+         ops[3] = (ix & 8) ? Iop_InterleaveHI64x2 : Iop_InterleaveLO64x2;
+         break;
+      default:
+         vassert(0);
+   }
+   IRTemp res = newTemp(Ity_V128);
+   assign(res, src);
+   Int i;
+   for (i = 3; i >= 0; i--) {
+      if (ops[i] == Iop_INVALID)
+         break;
+      IRTemp tmp = newTemp(Ity_V128);
+      assign(tmp, binop(ops[i], mkexpr(res), mkexpr(res)));
+      res = tmp;
+   }
+   return res;
+}
+
+
+/* Let |srcV| be a V128 value, and let |imm5| be a lane-and-size
+   selector encoded as shown below.  Return a temp holding the
+   selected lane of |srcV| dup'd out to a full V128, and return the
+   lane number, log2 of the lane size in bytes, and width character
+   via *laneNo, *laneSzLg2 and *laneCh respectively.  If |imm5| is an
+   invalid selector, return IRTemp_INVALID, 0, 0 and '?' respectively.
+
+   imm5 = xxxx1   signifies .b[xxxx]
+        = xxx10   .h[xxx]
+        = xx100   .s[xx]
+        = x1000   .d[x]
+        otherwise invalid
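+
+   For example, imm5 = 01010 selects .h[2].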
+*/
+static
+IRTemp handle_DUP_VEC_ELEM ( /*OUT*/UInt* laneNo,
+                             /*OUT*/UInt* laneSzLg2, /*OUT*/HChar* laneCh,
+                             IRExpr* srcV, UInt imm5 )
+{
+   *laneNo    = 0;
+   *laneSzLg2 = 0;
+   *laneCh    = '?';
+
+   if (imm5 & 1) {
+      *laneNo    = (imm5 >> 1) & 15;
+      *laneSzLg2 = 0;
+      *laneCh    = 'b';
+   }
+   else if (imm5 & 2) {
+      *laneNo    = (imm5 >> 2) & 7;
+      *laneSzLg2 = 1;
+      *laneCh    = 'h';
+   }
+   else if (imm5 & 4) {
+      *laneNo    = (imm5 >> 3) & 3;
+      *laneSzLg2 = 2;
+      *laneCh    = 's';
+   }
+   else if (imm5 & 8) {
+      *laneNo    = (imm5 >> 4) & 1;
+      *laneSzLg2 = 3;
+      *laneCh    = 'd';
+   }
+   else {
+      /* invalid */
+      return IRTemp_INVALID;
+   }
+
+   return math_DUP_VEC_ELEM(srcV, *laneSzLg2, *laneNo);
+}
+
+
+/* Duplicate |imm| into every lane of a V128, with lane size
+   (1 << size) bytes.  Byte lanes (size == 0) are not supported. */
+static
+IRTemp math_VEC_DUP_IMM ( UInt size, ULong imm )
+{
+   IRType ty  = Ity_INVALID;
+   IRTemp rcS = IRTemp_INVALID;
+   switch (size) {
+      case X01:
+         vassert(imm <= 0xFFFFULL);
+         ty  = Ity_I16;
+         rcS = newTemp(ty); assign(rcS, mkU16( (UShort)imm ));
+         break;
+      case X10:
+         vassert(imm <= 0xFFFFFFFFULL);
+         ty  = Ity_I32;
+         rcS = newTemp(ty); assign(rcS, mkU32( (UInt)imm ));
+         break;
+      case X11:
+         ty  = Ity_I64;
+         rcS = newTemp(ty); assign(rcS, mkU64(imm)); break;
+      default:
+         vassert(0);
+   }
+   IRTemp rcV = math_DUP_TO_V128(rcS, ty);
+   return rcV;
+}
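+
+/* Note: math_VEC_DUP_IMM is used below to materialise the rounding
+   constant 1 << (lane width - 1) for RSHRN and for the rounding forms
+   of {ADD,SUB}HN. */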
+
+
 /* Let |new64| be a V128 in which only the lower 64 bits are interesting,
    and the upper can contain any value -- it is ignored.  If |is2| is False,
    generate IR to put |new64| in the lower half of vector reg |dd| and zero
@@ -6222,55 +6383,20 @@
       0q0 01110000 imm5 000001 n d  DUP Vd.T, Vn.Ts[index]
    */
    if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
-      Bool   isQ  = bitQ == 1;
-      IRTemp w0   = newTemp(Ity_I64);
-      const HChar* arT  = "??";
-      const HChar* arTs = "??";
-      IRType laneTy = Ity_INVALID;
-      UInt   laneNo = 16; /* invalid */
-      if (imm5 & 1) {
-         arT    = isQ ? "16b" : "8b";
-         arTs   = "b";
-         laneNo = (imm5 >> 1) & 15;
-         laneTy = Ity_I8;
-         assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy)));
-      }
-      else if (imm5 & 2) {
-         arT    = isQ ? "8h" : "4h";
-         arTs   = "h";
-         laneNo = (imm5 >> 2) & 7;
-         laneTy = Ity_I16;
-         assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy)));
-      }
-      else if (imm5 & 4) {
-         arT    = isQ ? "4s" : "2s";
-         arTs   = "s";
-         laneNo = (imm5 >> 3) & 3;
-         laneTy = Ity_I32;
-         assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy)));
-      }
-      else if ((imm5 & 8) && isQ) {
-         arT  = "2d";
-         arTs = "d";
-         laneNo = (imm5 >> 4) & 1;
-         laneTy = Ity_I64;
-         assign(w0, getQRegLane(nn, laneNo, laneTy));
-      }
-      else {
-         /* invalid; leave laneTy unchanged. */
-      }
-      /* */
-      if (laneTy != Ity_INVALID) {
-         vassert(laneNo < 16);
-         IRTemp w1 = math_DUP_TO_64(w0, laneTy);
-         putQReg128(dd, binop(Iop_64HLtoV128,
-                              isQ ? mkexpr(w1) : mkU64(0), mkexpr(w1)));
-         DIP("dup %s.%s, %s.%s[%u]\n",
-             nameQReg128(dd), arT, nameQReg128(nn), arTs, laneNo);
-         return True;
-      }
-      /* invalid */
-      return False;
+      UInt   laneNo    = 0;
+      UInt   laneSzLg2 = 0;
+      HChar  laneCh    = '?';
+      IRTemp res       = handle_DUP_VEC_ELEM(&laneNo, &laneSzLg2, &laneCh,
+                                             getQReg128(nn), imm5);
+      if (res == IRTemp_INVALID)
+         return False;
+      if (bitQ == 0 && laneSzLg2 == X11)
+         return False; /* .1d case */
+      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
+      const HChar* arT = nameArr_Q_SZ(bitQ, laneSzLg2);
+      DIP("dup %s.%s, %s.%c[%u]\n",
+          nameQReg128(dd), arT, nameQReg128(nn), laneCh, laneNo);
+      return True;
    }
 
    /* -------- x,0,0001: DUP (general, vector) -------- */
@@ -7060,10 +7186,6 @@
                          1xxx:xxx -> D, SHR:64-xxxxxx
                          other    -> invalid
       */
-      const IROp opsSHRN[4]
-         = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
-      const IROp opsSARN[4]
-         = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
       UInt size  = 0;
       UInt shift = 0;
       Bool isQ   = bitQ == 1;
@@ -7072,7 +7194,7 @@
       vassert(size >= 0 && size <= 3);
       if (ok && size < 4 && shift > 0 && shift < (8 << size)
           && !(size == 3/*64bit*/ && !isQ)) {
-         IROp    op  = isU ? opsSHRN[size] : opsSARN[size];
+         IROp    op  = isU ? mkVecSHRN(size) : mkVecSARN(size);
          IRExpr* src = getQReg128(nn);
          IRTemp  res = newTemp(Ity_V128);
          assign(res, binop(op, src, mkU8(shift)));
@@ -7125,6 +7247,36 @@
       return False;
    }
 
+   if (bitU == 0
+       && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
+      /* -------- 0,10000  SHRN{,2} #imm -------- */
+      /* -------- 0,10001 RSHRN{,2} #imm -------- */
+      /* Narrows, and size is the narrow size. */
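+      /* RSHRN rounds to nearest by adding 1 << (shift-1) in the wide
+         lanes before the right shift; SHRN simply truncates. */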
+      UInt size  = 0;
+      UInt shift = 0;
+      Bool is2   = bitQ == 1;
+      Bool isR   = opcode == BITS5(1,0,0,0,1);
+      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
+      if (!ok || size == X11) return False;
+      vassert(shift >= 1);
+      IRTemp t1 = newTemp(Ity_V128);
+      IRTemp t2 = newTemp(Ity_V128);
+      IRTemp t3 = newTemp(Ity_V128);
+      assign(t1, getQReg128(nn));
+      assign(t2, isR ? binop(mkVecADD(size+1),
+                             mkexpr(t1),
+                             mkexpr(math_VEC_DUP_IMM(size+1, 1ULL<<(shift-1))))
+                     : mkexpr(t1));
+      assign(t3, binop(mkVecSHRN(size+1), mkexpr(t2), mkU8(shift)));
+      IRTemp t4 = math_NARROW_LANES(t3, t3, size);
+      putLO64andZUorPutHI64(is2, dd, t4);
+      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
+      DIP("%s %s.%s, %s.%s, #%u\n", isR ? "rshrn" : "shrn",
+          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
+      return True;
+   }
+
    if (opcode == BITS5(1,0,1,0,0)) {
       /* -------- 0,10100 SSHLL{,2} #imm -------- */
       /* -------- 1,10100 USHLL{,2} #imm -------- */
@@ -7284,40 +7436,26 @@
       /* Narrows, and size refers to the narrowed lanes. */
       if (size == X11) return False;
       vassert(size <= 2);
-      const IROp opADD[3] = { Iop_Add16x8,  Iop_Add32x4,  Iop_Add64x2  };
-      const IROp opSUB[3] = { Iop_Sub16x8,  Iop_Sub32x4,  Iop_Sub64x2  };
-      const IROp opSHR[3] = { Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
-      const UInt shift[3] = { 8,            16,           32           };
-      const IROp opCAT[3] = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
-                              Iop_CatEvenLanes32x4 };
+      const UInt shift[3] = { 8, 16, 32 };
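+      /* shift[size] is the narrow lane width in bits; shifting the
+         wide sum/difference right by it isolates its top half, which
+         is the result {ADD,SUB}HN keeps. */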
       Bool isADD = opcode == BITS4(0,1,0,0);
       Bool isR   = bitU == 1;
       /* Combined elements in wide lanes */
       IRTemp  wide  = newTemp(Ity_V128);
-      IRExpr* wideE = binop(isADD ? opADD[size] : opSUB[size],
+      IRExpr* wideE = binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
                             getQReg128(nn), getQReg128(mm));
       if (isR) {
-         IRType ty  = Ity_INVALID;
-         IRTemp rcS = IRTemp_INVALID;
-         switch (size) {
-            case X00: ty  = Ity_I16;
-                      rcS = newTemp(ty); assign(rcS, mkU16(0x80)); break;
-            case X01: ty  = Ity_I32;
-                      rcS = newTemp(ty); assign(rcS, mkU32(0x8000)); break;
-            case X10: ty  = Ity_I64;
-                      rcS = newTemp(ty); assign(rcS, mkU64(0x80000000)); break;
-            default:  vassert(0);
-         }
-         IRTemp rcV = math_DUP_TO_V128(rcS, ty);
-         wideE = binop(opADD[size], wideE, mkexpr(rcV));
+         wideE = binop(mkVecADD(size+1),
+                       wideE,
+                       mkexpr(math_VEC_DUP_IMM(size+1,
+                                               1ULL << (shift[size]-1))));
       }
       assign(wide, wideE);
       /* Top halves of elements, still in wide lanes */
       IRTemp shrd = newTemp(Ity_V128);
-      assign(shrd, binop(opSHR[size], mkexpr(wide), mkU8(shift[size])));
+      assign(shrd, binop(mkVecSHRN(size+1), mkexpr(wide), mkU8(shift[size])));
       /* Elements now compacted into lower 64 bits */
       IRTemp new64 = newTemp(Ity_V128);
-      assign(new64, binop(opCAT[size], mkexpr(shrd), mkexpr(shrd)));
+      assign(new64, binop(mkVecCATEVENLANES(size), mkexpr(shrd), mkexpr(shrd)));
       putLO64andZUorPutHI64(is2, dd, new64);
       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
@@ -7359,40 +7497,36 @@
 
    if (opcode == BITS4(1,1,0,0)
        || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) {
-      /* -------- 0,1100  SMULL{2} -------- */ // 0 (ix)
+      /* -------- 0,1100  SMULL{2} -------- */ // 0 (ks)
       /* -------- 1,1100  UMULL{2} -------- */ // 0
       /* -------- 0,1000  SMLAL{2} -------- */ // 1
       /* -------- 1,1000  UMLAL{2} -------- */ // 1
       /* -------- 0,1010  SMLSL{2} -------- */ // 2
       /* -------- 1,1010  UMLSL{2} -------- */ // 2
       /* Widens, and size refers to the narrowed lanes. */
-      UInt ix = 3;
+      UInt ks = 3;
       switch (opcode) {
-         case BITS4(1,1,0,0): ix = 0; break;
-         case BITS4(1,0,0,0): ix = 1; break;
-         case BITS4(1,0,1,0): ix = 2; break;
+         case BITS4(1,1,0,0): ks = 0; break;
+         case BITS4(1,0,0,0): ks = 1; break;
+         case BITS4(1,0,1,0): ks = 2; break;
          default: vassert(0);
       }
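+      /* ks encodes the accumulation: 0 = none (MULL), 1 = add (MLAL),
+         2 = subtract (MLSL). */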
-      vassert(ix >= 0 && ix <= 2);
-      const IROp opMULLU[3] = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2 };
-      const IROp opMULLS[3] = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2 };
-      const IROp opADD[3]   = { Iop_Add16x8,  Iop_Add32x4,   Iop_Add64x2   };
-      const IROp opSUB[3]   = { Iop_Sub16x8,  Iop_Sub32x4,   Iop_Sub64x2   };
+      vassert(ks <= 2);
       if (size == X11) return False;
       vassert(size <= 2);
       Bool   isU   = bitU == 1;
-      IROp   mulOp = isU ? opMULLU[size] : opMULLS[size];
-      IROp   accOp = (ix == 1) ? opADD[size] 
-                               : (ix == 2 ? opSUB[size] : Iop_INVALID);
+      IROp   mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
+      IROp   accOp = (ks == 1) ? mkVecADD(size+1)
+                               : (ks == 2 ? mkVecSUB(size+1) : Iop_INVALID);
       IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp,
                                              getQReg128(nn), getQReg128(mm));
       IRTemp res = newTemp(Ity_V128);
-      assign(res, ix == 0 ? mkexpr(mul) 
+      assign(res, ks == 0 ? mkexpr(mul) 
                           : binop(accOp, getQReg128(dd), mkexpr(mul)));
       putQReg128(dd, mkexpr(res));
       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
-      const HChar* nm        = ix == 0 ? "mull" : (ix == 1 ? "mlal" : "mlsl");
+      const HChar* nm        = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
       DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "",
           nameQReg128(dd), arrWide,
           nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
@@ -8259,6 +8393,26 @@
       return False;
    }
 
+   if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
+      /* -------- 1,xx,10011 SHLL{2} #lane-width -------- */
+      /* Widens, and size is the narrow size. */
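+      /* Interleaving |src| with itself places each selected narrow
+         lane in both halves of a wide lane; shifting left by the
+         narrow lane width (8 << size) then zeroes the low half, giving
+         exactly SHLL's widen-and-shift result. */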
+      if (size == X11) return False;
+      Bool is2   = bitQ == 1;
+      IROp opINT = is2 ? mkVecINTERLEAVEHI(size) : mkVecINTERLEAVELO(size);
+      IROp opSHL = mkVecSHLN(size+1);
+      IRTemp src = newTemp(Ity_V128);
+      IRTemp res = newTemp(Ity_V128);
+      assign(src, getQReg128(nn));
+      assign(res, binop(opSHL, binop(opINT, mkexpr(src), mkexpr(src)),
+                               mkU8(8 << size)));
+      putQReg128(dd, mkexpr(res));
+      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
+      DIP("shll%s %s.%s, %s.%s, #%u\n", is2 ? "2" : "", 
+          nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow, 8 << size);
+      return True;
+   }
+
    if (bitU == 0 && size == X01 && opcode == BITS5(1,0,1,1,0)) {
       /* -------- 0,01,10110: FCVTN 2s/4s_2d -------- */
       IRTemp  rm    = mk_get_IR_rounding_mode();
@@ -8391,14 +8545,11 @@
             vassert(0);
       }
       vassert(mm < 32 && ix < 16);
-      IROp   opMUL = size == X01 ? Iop_Mul16x8 : Iop_Mul32x4;
-      IROp   opADD = size == X01 ? Iop_Add16x8 : Iop_Add32x4;
-      IROp   opSUB = size == X01 ? Iop_Sub16x8 : Iop_Sub32x4;
-      IRType ity   = size == X01 ? Ity_I16 : Ity_I32;
+      IROp   opMUL = mkVecMUL(size);
+      IROp   opADD = mkVecADD(size);
+      IROp   opSUB = mkVecSUB(size);
       HChar  ch    = size == X01 ? 'h' : 's';
-      IRTemp elemM = newTemp(ity);
-      assign(elemM, getQRegLane(mm, ix, ity));
-      IRTemp vecM  = math_DUP_TO_V128(elemM, ity);
+      IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
       IRTemp vecD  = newTemp(Ity_V128);
       IRTemp vecN  = newTemp(Ity_V128);
       IRTemp res   = newTemp(Ity_V128);
@@ -8419,6 +8570,65 @@
       return True;
    }
 
+   if (opcode == BITS4(1,0,1,0)
+       || opcode == BITS4(0,0,1,0) || opcode == BITS4(0,1,1,0)) {
+      /* -------- 0,xx,1010 SMULL s/h variants only -------- */ // 0 (ks)
+      /* -------- 1,xx,1010 UMULL s/h variants only -------- */ // 0
+      /* -------- 0,xx,0010 SMLAL s/h variants only -------- */ // 1
+      /* -------- 1,xx,0010 UMLAL s/h variants only -------- */ // 1
+      /* -------- 0,xx,0110 SMLSL s/h variants only -------- */ // 2
+      /* -------- 1,xx,0110 UMLSL s/h variants only -------- */ // 2
+      /* Widens, and size refers to the narrowed lanes. */
+      UInt ks = 3;
+      switch (opcode) {
+         case BITS4(1,0,1,0): ks = 0; break;
+         case BITS4(0,0,1,0): ks = 1; break;
+         case BITS4(0,1,1,0): ks = 2; break;
+         default: vassert(0);
+      }
+      vassert(ks <= 2);
+      Bool isU = bitU == 1;
+      Bool is2 = bitQ == 1;
+      UInt mm  = 32; // invalid
+      UInt ix  = 16; // invalid
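+      /* Decode Rm and the lane index from (H, L, M, Rm): for .h[]
+         lanes the index is H:L:M and Rm is the low 4 bits; for .s[]
+         lanes the index is H:L and Rm is M:Rm. */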
+      switch (size) {
+         case X00:
+            return False; // h_b_b[] case is not allowed
+         case X01:
+            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
+         case X10:
+            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
+         case X11:
+            return False; // q_d_d[] case is not allowed
+         default:
+            vassert(0);
+      }
+      vassert(mm < 32 && ix < 16);
+      IROp   mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
+      IROp   accOp = (ks == 1) ? mkVecADD(size+1) 
+                               : (ks == 2 ? mkVecSUB(size+1) : Iop_INVALID);
+      IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
+      IRTemp vecD  = newTemp(Ity_V128);
+      IRTemp vecN  = newTemp(Ity_V128);
+      assign(vecD, getQReg128(dd));
+      assign(vecN, getQReg128(nn));
+      IRTemp mul   = math_BINARY_WIDENING_V128(is2, mulOp,
+                                               mkexpr(vecN), mkexpr(vecM));
+      IRTemp res = newTemp(Ity_V128);
+      assign(res, ks == 0 ? mkexpr(mul)
+                          : binop(accOp, mkexpr(vecD), mkexpr(mul)));
+      putQReg128(dd, mkexpr(res));
+      const HChar* nm        = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
+      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
+      HChar ch               = size == X01 ? 'h' : 's';
+      DIP("%c%s%s %s.%s, %s.%s, %s.%c[%u]\n",
+          isU ? 'u' : 's', nm, is2 ? "2" : "",
+          nameQReg128(dd), arrWide,
+          nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
+      return True;
+   }
+
    return False;
 #  undef INSN
 }
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index 7812847..a4fb614 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -1572,7 +1572,9 @@
       Iop_InterleaveOddLanes32x4, Iop_InterleaveEvenLanes32x4,
 
       /* CONCATENATION -- build a new value by concatenating either
-         the even or odd lanes of both operands. */
+         the even or odd lanes of both operands.  Note that
+         Cat{Odd,Even}Lanes64x2 are identical to Interleave{HI,LO}64x2
+         and so are omitted. */
       Iop_CatOddLanes8x16, Iop_CatOddLanes16x8, Iop_CatOddLanes32x4,
       Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8, Iop_CatEvenLanes32x4,