arm64: implement
  sshr, ushr, ssra, usra (scalar, imm) 
  srshr, urshr, srsra, ursra (scalar, imm) 
  srshr, urshr, srsra, ursra (vector, imm) 
  sshl, srshl, ushl, urshl (scalar, scalar)
  sshl, srshl, ushl, urshl (vector, vector) 
  ssra, usra (vector, imm) 
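
As a rough reference model (illustrative only: the helper name and the
use of the GCC/Clang __int128 extension are assumptions, not taken from
the sources), one 64-bit lane of the new bidirectional Sh/Rsh IROps
behaves as sketched below.  SSHR/SRSHR #sh correspond to an amount of
-sh, and the U variants differ only in shifting zeroes in from the left:

   #include <stdint.h>

   /* Covers amounts in -64 .. 63; the remaining encodable amounts are
      elided for brevity. */
   static int64_t ref_sh64s ( int64_t x, int amt, int rounding )
   {
      if (amt >= 0)                        /* shift left, zeroes in */
         return (int64_t)((uint64_t)x << amt);
      unsigned sh = (unsigned)(-amt);      /* 1 .. 64 */
      __int128 wide = (__int128)x;         /* rounding add must not overflow */
      if (rounding)
         wide += (__int128)1 << (sh - 1);  /* add 2^(sh-1) before shifting */
      return (int64_t)(wide >> sh);        /* shift right, sign bits in */
   }

For example, a rounding right shift of 6 by 2 gives (6 + 2) >> 2 = 2
where the plain shift gives 1; the ssra/usra/srsra/ursra forms
additionally accumulate the shifted result into the destination.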


git-svn-id: svn://svn.valgrind.org/vex/trunk@2926 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index 9e27d90..e72eaf7 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -841,6 +841,34 @@
    return ops[size];
 }
 
+static IROp mkVecSHU ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_Sh8Ux16, Iop_Sh16Ux8, Iop_Sh32Ux4, Iop_Sh64Ux2 };
+   vassert(size < 4);
+   return ops[size];
+}
+
+static IROp mkVecSHS ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_Sh8Sx16, Iop_Sh16Sx8, Iop_Sh32Sx4, Iop_Sh64Sx2 };
+   vassert(size < 4);
+   return ops[size];
+}
+
+static IROp mkVecRSHU ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_Rsh8Ux16, Iop_Rsh16Ux8, Iop_Rsh32Ux4, Iop_Rsh64Ux2 };
+   vassert(size < 4);
+   return ops[size];
+}
+
+static IROp mkVecRSHS ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_Rsh8Sx16, Iop_Rsh16Sx8, Iop_Rsh32Sx4, Iop_Rsh64Sx2 };
+   vassert(size < 4);
+   return ops[size];
+}
+
 static IROp mkVecNARROWUN ( UInt sizeNarrow ) {
    const IROp ops[4]
       = { Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4,
@@ -7538,16 +7566,64 @@
    UInt dd     = INSN(4,0);
    UInt immhb  = (immh << 3) | immb;
 
-   if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,0,0,0,0)) {
-      /* -------- 1,1xxx,00000 SHR d_d_#imm -------- */
-      UInt sh = 128 - immhb;
+   if ((immh & 8) == 8
+       && (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0))) {
+      /* -------- 0,1xxx,00000 SSHR d_d_#imm -------- */
+      /* -------- 1,1xxx,00000 USHR d_d_#imm -------- */
+      /* -------- 0,1xxx,00010 SSRA d_d_#imm -------- */
+      /* -------- 1,1xxx,00010 USRA d_d_#imm -------- */
+      Bool isU   = bitU == 1;
+      Bool isAcc = opcode == BITS5(0,0,0,1,0);
+      UInt sh    = 128 - immhb;
       vassert(sh >= 1 && sh <= 64);
-      /* Don't generate an out of range IR shift */
-      putQReg128(dd, sh == 64
-                        ? mkV128(0x0000)
-                        : unop(Iop_ZeroHI64ofV128,
-                               binop(Iop_ShrN64x2, getQReg128(nn), mkU8(sh))));
-      DIP("shr d%u, d%u, #%u\n", dd, nn, sh);
+      IROp    op  = isU ? Iop_ShrN64x2 : Iop_SarN64x2;
+      IRExpr* src = getQReg128(nn);
+      IRTemp  shf = newTempV128();
+      IRTemp  res = newTempV128();
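+      /* Don't generate an out of range IR shift.  For the unsigned
+         case a shift by the full lane width just gives zero; for the
+         signed case, shifting by 63 gives the same result as a shift
+         by 64 would. */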
+      if (sh == 64 && isU) {
+         assign(shf, mkV128(0x0000));
+      } else {
+         UInt nudge = 0;
+         if (sh == 64) {
+            vassert(!isU);
+            nudge = 1;
+         }
+         assign(shf, binop(op, src, mkU8(sh - nudge)));
+      }
+      assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
+                        : mkexpr(shf));
+      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
+      const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
+                              : (isU ? "ushr" : "sshr");
+      DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
+      return True;
+   }
+
+   if ((immh & 8) == 8
+       && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0))) {
+      /* -------- 0,1xxx,00100 SRSHR d_d_#imm -------- */
+      /* -------- 1,1xxx,00100 URSHR d_d_#imm -------- */
+      /* -------- 0,1xxx,00110 SRSRA d_d_#imm -------- */
+      /* -------- 1,1xxx,00110 URSRA d_d_#imm -------- */
+      Bool isU   = bitU == 1;
+      Bool isAcc = opcode == BITS5(0,0,1,1,0);
+      UInt sh    = 128 - immhb;
+      vassert(sh >= 1 && sh <= 64);
+      IROp    op  = isU ? Iop_Rsh64Ux2 : Iop_Rsh64Sx2;
+      IRExpr* src  = getQReg128(nn);
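+      /* The Rsh ops interpret the low 8 bits of each lane of the
+         shift vector signedly, so broadcasting -sh gives a rounding
+         right shift by sh in every lane. */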
+      IRTemp  imm8 = newTemp(Ity_I8);
+      assign(imm8, mkU8((UChar)(-sh)));
+      IRExpr* amt  = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
+      IRTemp  shf  = newTempV128();
+      IRTemp  res  = newTempV128();
+      assign(shf, binop(op, src, amt));
+      assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
+                        : mkexpr(shf));
+      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
+      const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
+                              : (isU ? "urshr" : "srshr");
+      DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
       return True;
    }
 
@@ -7862,6 +7938,27 @@
       return True;
    }
 
+   if (size == X11 && (opcode == BITS5(0,1,0,0,0)
+                       || opcode == BITS5(0,1,0,1,0))) {
+      /* -------- 0,xx,01000 SSHL  d_d_d -------- */
+      /* -------- 0,xx,01010 SRSHL d_d_d -------- */
+      /* -------- 1,xx,01000 USHL  d_d_d -------- */
+      /* -------- 1,xx,01010 URSHL d_d_d -------- */
+      Bool isU = bitU == 1;
+      Bool isR = opcode == BITS5(0,1,0,1,0);
+      IROp op  = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
+                     : (isU ? mkVecSHU(size)  : mkVecSHS(size));
+      IRTemp res = newTempV128();
+      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
+      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
+      const HChar* nm  = isR ? (isU ? "urshl" : "srshl")
+                             : (isU ? "ushl"  : "sshl");
+      DIP("%s %s, %s, %s\n", nm,
+          nameQRegLO(dd, Ity_I64),
+          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
+      return True;
+   }
+
    if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
       /* -------- 0,xx,01001 SQSHL  std4_std4_std4 -------- */
       /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */
@@ -8248,9 +8345,11 @@
    UInt nn     = INSN(9,5);
    UInt dd     = INSN(4,0);
 
-   if (opcode == BITS5(0,0,0,0,0)) {
+   if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0)) {
       /* -------- 0,00000 SSHR std7_std7_#imm -------- */
       /* -------- 1,00000 USHR std7_std7_#imm -------- */
+      /* -------- 0,00010 SSRA std7_std7_#imm -------- */
+      /* -------- 1,00010 USRA std7_std7_#imm -------- */
       /* laneTy, shift = case immh:immb of
                          0001:xxx -> B, SHR:8-xxx
                          001x:xxx -> H, SHR:16-xxxx
@@ -8262,6 +8361,7 @@
       UInt shift = 0;
       Bool isQ   = bitQ == 1;
       Bool isU   = bitU == 1;
+      Bool isAcc = opcode == BITS5(0,0,0,1,0);
       Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
       if (!ok || (bitQ == 0 && size == X11)) return False;
       vassert(size >= 0 && size <= 3);
@@ -8269,21 +8369,68 @@
       vassert(shift >= 1 && shift <= lanebits);
       IROp    op  = isU ? mkVecSHRN(size) : mkVecSARN(size);
       IRExpr* src = getQReg128(nn);
+      IRTemp  shf = newTempV128();
       IRTemp  res = newTempV128();
       if (shift == lanebits && isU) {
-         assign(res, mkV128(0x0000));
+         assign(shf, mkV128(0x0000));
       } else {
          UInt nudge = 0;
          if (shift == lanebits) {
             vassert(!isU);
             nudge = 1;
          }
-         assign(res, binop(op, src, mkU8(shift - nudge)));
+         assign(shf, binop(op, src, mkU8(shift - nudge)));
       }
+      assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
+                        : mkexpr(shf));
       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
       HChar laneCh = "bhsd"[size];
       UInt  nLanes = (isQ ? 128 : 64) / lanebits;
-      const HChar* nm = isU ? "ushr" : "sshr";
+      const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
+                              : (isU ? "ushr" : "sshr");
+      DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
+          nameQReg128(dd), nLanes, laneCh,
+          nameQReg128(nn), nLanes, laneCh, shift);
+      return True;
+   }
+
+   if (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0)) {
+      /* -------- 0,00100 SRSHR std7_std7_#imm -------- */
+      /* -------- 1,00100 URSHR std7_std7_#imm -------- */
+      /* -------- 0,00110 SRSRA std7_std7_#imm -------- */
+      /* -------- 1,00110 URSRA std7_std7_#imm -------- */
+      /* laneTy, shift = case immh:immb of
+                         0001:xxx -> B, SHR:8-xxx
+                         001x:xxx -> H, SHR:16-xxxx
+                         01xx:xxx -> S, SHR:32-xxxxx
+                         1xxx:xxx -> D, SHR:64-xxxxxx
+                         other    -> invalid
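+                         eg immh:immb = 0010:101 -> H, shift = 16-5 = 11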
+      */
+      UInt size  = 0;
+      UInt shift = 0;
+      Bool isQ   = bitQ == 1;
+      Bool isU   = bitU == 1;
+      Bool isAcc = opcode == BITS5(0,0,1,1,0);
+      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
+      if (!ok || (bitQ == 0 && size == X11)) return False;
+      vassert(size >= 0 && size <= 3);
+      UInt lanebits = 8 << size;
+      vassert(shift >= 1 && shift <= lanebits);
+      IROp    op   = isU ? mkVecRSHU(size) : mkVecRSHS(size);
+      IRExpr* src  = getQReg128(nn);
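+      /* As in the scalar case: broadcast -shift, since a negative
+         lane amount means a right shift for the Rsh ops. */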
+      IRTemp  imm8 = newTemp(Ity_I8);
+      assign(imm8, mkU8((UChar)(-shift)));
+      IRExpr* amt  = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
+      IRTemp  shf  = newTempV128();
+      IRTemp  res  = newTempV128();
+      assign(shf, binop(op, src, amt));
+      assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
+                        : mkexpr(shf));
+      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
+      HChar laneCh = "bhsd"[size];
+      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
+      const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
+                              : (isU ? "urshr" : "srshr");
       DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
           nameQReg128(dd), nLanes, laneCh,
           nameQReg128(nn), nLanes, laneCh, shift);
@@ -9030,6 +9177,27 @@
       return True;
    }
 
+   if (opcode == BITS5(0,1,0,0,0) || opcode == BITS5(0,1,0,1,0)) {
+      /* -------- 0,xx,01000 SSHL  std7_std7_std7 -------- */
+      /* -------- 0,xx,01010 SRSHL std7_std7_std7 -------- */
+      /* -------- 1,xx,01000 USHL  std7_std7_std7 -------- */
+      /* -------- 1,xx,01010 URSHL std7_std7_std7 -------- */
+      if (bitQ == 0 && size == X11) return False; // implied 1d case
+      Bool isU = bitU == 1;
+      Bool isR = opcode == BITS5(0,1,0,1,0);
+      IROp op  = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
+                     : (isU ? mkVecSHU(size)  : mkVecSHS(size));
+      IRTemp res = newTempV128();
+      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
+      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
+      const HChar* nm  = isR ? (isU ? "urshl" : "srshl")
+                             : (isU ? "ushl"  : "sshl");
+      const HChar* arr = nameArr_Q_SZ(bitQ, size);
+      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
+          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+      return True;
+   }
+
    if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
       /* -------- 0,xx,01001 SQSHL  std7_std7_std7 -------- */
       /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */
diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c
index c94ffbb..b977d19 100644
--- a/priv/host_arm64_defs.c
+++ b/priv/host_arm64_defs.c
@@ -682,6 +682,22 @@
       case ARM64vecb_UQRSHL32x4:   *nm = "uqrshl";    *ar = "4s";   return;
       case ARM64vecb_UQRSHL16x8:   *nm = "uqrshl";    *ar = "8h";   return;
       case ARM64vecb_UQRSHL8x16:   *nm = "uqrshl";    *ar = "16b";  return;
+      case ARM64vecb_SSHL64x2:     *nm = "sshl";      *ar = "2d";   return;
+      case ARM64vecb_SSHL32x4:     *nm = "sshl";      *ar = "4s";   return;
+      case ARM64vecb_SSHL16x8:     *nm = "sshl";      *ar = "8h";   return;
+      case ARM64vecb_SSHL8x16:     *nm = "sshl";      *ar = "16b";  return;
+      case ARM64vecb_USHL64x2:     *nm = "ushl";      *ar = "2d";   return;
+      case ARM64vecb_USHL32x4:     *nm = "ushl";      *ar = "4s";   return;
+      case ARM64vecb_USHL16x8:     *nm = "ushl";      *ar = "8h";   return;
+      case ARM64vecb_USHL8x16:     *nm = "ushl";      *ar = "16b";  return;
+      case ARM64vecb_SRSHL64x2:    *nm = "srshl";     *ar = "2d";   return;
+      case ARM64vecb_SRSHL32x4:    *nm = "srshl";     *ar = "4s";   return;
+      case ARM64vecb_SRSHL16x8:    *nm = "srshl";     *ar = "8h";   return;
+      case ARM64vecb_SRSHL8x16:    *nm = "srshl";     *ar = "16b";  return;
+      case ARM64vecb_URSHL64x2:    *nm = "urshl";     *ar = "2d";   return;
+      case ARM64vecb_URSHL32x4:    *nm = "urshl";     *ar = "4s";   return;
+      case ARM64vecb_URSHL16x8:    *nm = "urshl";     *ar = "8h";   return;
+      case ARM64vecb_URSHL8x16:    *nm = "urshl";     *ar = "16b";  return;
       default: vpanic("showARM64VecBinOp");
    }
 }
@@ -717,54 +733,54 @@
    }
 }
 
-static void showARM64VecShiftOp(/*OUT*/const HChar** nm,
-                                /*OUT*/const HChar** ar,
-                                ARM64VecShiftOp op )
+static void showARM64VecShiftImmOp(/*OUT*/const HChar** nm,
+                                   /*OUT*/const HChar** ar,
+                                   ARM64VecShiftImmOp op )
 {
    switch (op) {
-      case ARM64vecsh_USHR64x2:    *nm = "ushr  ";   *ar = "2d";  return;
-      case ARM64vecsh_USHR32x4:    *nm = "ushr  ";   *ar = "4s";  return;
-      case ARM64vecsh_USHR16x8:    *nm = "ushr  ";   *ar = "8h";  return;
-      case ARM64vecsh_USHR8x16:    *nm = "ushr  ";   *ar = "16b"; return;
-      case ARM64vecsh_SSHR64x2:    *nm = "sshr  ";   *ar = "2d";  return;
-      case ARM64vecsh_SSHR32x4:    *nm = "sshr  ";   *ar = "4s";  return;
-      case ARM64vecsh_SSHR16x8:    *nm = "sshr  ";   *ar = "8h";  return;
-      case ARM64vecsh_SSHR8x16:    *nm = "sshr  ";   *ar = "16b"; return;
-      case ARM64vecsh_SHL64x2:     *nm = "shl   ";   *ar = "2d";  return;
-      case ARM64vecsh_SHL32x4:     *nm = "shl   ";   *ar = "4s";  return;
-      case ARM64vecsh_SHL16x8:     *nm = "shl   ";   *ar = "8h";  return;
-      case ARM64vecsh_SHL8x16:     *nm = "shl   ";   *ar = "16b"; return;
-      case ARM64vecsh_SQSHRN2SD:   *nm = "sqshrn";   *ar = "2sd"; return;
-      case ARM64vecsh_SQSHRN4HS:   *nm = "sqshrn";   *ar = "4hs"; return;
-      case ARM64vecsh_SQSHRN8BH:   *nm = "sqshrn";   *ar = "8bh"; return;
-      case ARM64vecsh_UQSHRN2SD:   *nm = "uqshrn";   *ar = "2sd"; return;
-      case ARM64vecsh_UQSHRN4HS:   *nm = "uqshrn";   *ar = "4hs"; return;
-      case ARM64vecsh_UQSHRN8BH:   *nm = "uqshrn";   *ar = "8bh"; return;
-      case ARM64vecsh_SQSHRUN2SD:  *nm = "sqshrun";  *ar = "2sd"; return;
-      case ARM64vecsh_SQSHRUN4HS:  *nm = "sqshrun";  *ar = "4hs"; return;
-      case ARM64vecsh_SQSHRUN8BH:  *nm = "sqshrun";  *ar = "8bh"; return;
-      case ARM64vecsh_SQRSHRN2SD:  *nm = "sqrshrn";  *ar = "2sd"; return;
-      case ARM64vecsh_SQRSHRN4HS:  *nm = "sqrshrn";  *ar = "4hs"; return;
-      case ARM64vecsh_SQRSHRN8BH:  *nm = "sqrshrn";  *ar = "8bh"; return;
-      case ARM64vecsh_UQRSHRN2SD:  *nm = "uqrshrn";  *ar = "2sd"; return;
-      case ARM64vecsh_UQRSHRN4HS:  *nm = "uqrshrn";  *ar = "4hs"; return;
-      case ARM64vecsh_UQRSHRN8BH:  *nm = "uqrshrn";  *ar = "8bh"; return;
-      case ARM64vecsh_SQRSHRUN2SD: *nm = "sqrshrun"; *ar = "2sd"; return;
-      case ARM64vecsh_SQRSHRUN4HS: *nm = "sqrshrun"; *ar = "4hs"; return;
-      case ARM64vecsh_SQRSHRUN8BH: *nm = "sqrshrun"; *ar = "8bh"; return;
-      case ARM64vecsh_UQSHL64x2:   *nm = "uqshl ";   *ar = "2d";  return;
-      case ARM64vecsh_UQSHL32x4:   *nm = "uqshl ";   *ar = "4s";  return;
-      case ARM64vecsh_UQSHL16x8:   *nm = "uqshl ";   *ar = "8h";  return;
-      case ARM64vecsh_UQSHL8x16:   *nm = "uqshl ";   *ar = "16b"; return;
-      case ARM64vecsh_SQSHL64x2:   *nm = "sqshl ";   *ar = "2d";  return;
-      case ARM64vecsh_SQSHL32x4:   *nm = "sqshl ";   *ar = "4s";  return;
-      case ARM64vecsh_SQSHL16x8:   *nm = "sqshl ";   *ar = "8h";  return;
-      case ARM64vecsh_SQSHL8x16:   *nm = "sqshl ";   *ar = "16b"; return;
-      case ARM64vecsh_SQSHLU64x2:  *nm = "sqshlu";   *ar = "2d";  return;
-      case ARM64vecsh_SQSHLU32x4:  *nm = "sqshlu";   *ar = "4s";  return;
-      case ARM64vecsh_SQSHLU16x8:  *nm = "sqshlu";   *ar = "8h";  return;
-      case ARM64vecsh_SQSHLU8x16:  *nm = "sqshlu";   *ar = "16b"; return;
-      default: vpanic("showARM64VecShiftOp");
+      case ARM64vecshi_USHR64x2:    *nm = "ushr  ";   *ar = "2d";  return;
+      case ARM64vecshi_USHR32x4:    *nm = "ushr  ";   *ar = "4s";  return;
+      case ARM64vecshi_USHR16x8:    *nm = "ushr  ";   *ar = "8h";  return;
+      case ARM64vecshi_USHR8x16:    *nm = "ushr  ";   *ar = "16b"; return;
+      case ARM64vecshi_SSHR64x2:    *nm = "sshr  ";   *ar = "2d";  return;
+      case ARM64vecshi_SSHR32x4:    *nm = "sshr  ";   *ar = "4s";  return;
+      case ARM64vecshi_SSHR16x8:    *nm = "sshr  ";   *ar = "8h";  return;
+      case ARM64vecshi_SSHR8x16:    *nm = "sshr  ";   *ar = "16b"; return;
+      case ARM64vecshi_SHL64x2:     *nm = "shl   ";   *ar = "2d";  return;
+      case ARM64vecshi_SHL32x4:     *nm = "shl   ";   *ar = "4s";  return;
+      case ARM64vecshi_SHL16x8:     *nm = "shl   ";   *ar = "8h";  return;
+      case ARM64vecshi_SHL8x16:     *nm = "shl   ";   *ar = "16b"; return;
+      case ARM64vecshi_SQSHRN2SD:   *nm = "sqshrn";   *ar = "2sd"; return;
+      case ARM64vecshi_SQSHRN4HS:   *nm = "sqshrn";   *ar = "4hs"; return;
+      case ARM64vecshi_SQSHRN8BH:   *nm = "sqshrn";   *ar = "8bh"; return;
+      case ARM64vecshi_UQSHRN2SD:   *nm = "uqshrn";   *ar = "2sd"; return;
+      case ARM64vecshi_UQSHRN4HS:   *nm = "uqshrn";   *ar = "4hs"; return;
+      case ARM64vecshi_UQSHRN8BH:   *nm = "uqshrn";   *ar = "8bh"; return;
+      case ARM64vecshi_SQSHRUN2SD:  *nm = "sqshrun";  *ar = "2sd"; return;
+      case ARM64vecshi_SQSHRUN4HS:  *nm = "sqshrun";  *ar = "4hs"; return;
+      case ARM64vecshi_SQSHRUN8BH:  *nm = "sqshrun";  *ar = "8bh"; return;
+      case ARM64vecshi_SQRSHRN2SD:  *nm = "sqrshrn";  *ar = "2sd"; return;
+      case ARM64vecshi_SQRSHRN4HS:  *nm = "sqrshrn";  *ar = "4hs"; return;
+      case ARM64vecshi_SQRSHRN8BH:  *nm = "sqrshrn";  *ar = "8bh"; return;
+      case ARM64vecshi_UQRSHRN2SD:  *nm = "uqrshrn";  *ar = "2sd"; return;
+      case ARM64vecshi_UQRSHRN4HS:  *nm = "uqrshrn";  *ar = "4hs"; return;
+      case ARM64vecshi_UQRSHRN8BH:  *nm = "uqrshrn";  *ar = "8bh"; return;
+      case ARM64vecshi_SQRSHRUN2SD: *nm = "sqrshrun"; *ar = "2sd"; return;
+      case ARM64vecshi_SQRSHRUN4HS: *nm = "sqrshrun"; *ar = "4hs"; return;
+      case ARM64vecshi_SQRSHRUN8BH: *nm = "sqrshrun"; *ar = "8bh"; return;
+      case ARM64vecshi_UQSHL64x2:   *nm = "uqshl ";   *ar = "2d";  return;
+      case ARM64vecshi_UQSHL32x4:   *nm = "uqshl ";   *ar = "4s";  return;
+      case ARM64vecshi_UQSHL16x8:   *nm = "uqshl ";   *ar = "8h";  return;
+      case ARM64vecshi_UQSHL8x16:   *nm = "uqshl ";   *ar = "16b"; return;
+      case ARM64vecshi_SQSHL64x2:   *nm = "sqshl ";   *ar = "2d";  return;
+      case ARM64vecshi_SQSHL32x4:   *nm = "sqshl ";   *ar = "4s";  return;
+      case ARM64vecshi_SQSHL16x8:   *nm = "sqshl ";   *ar = "8h";  return;
+      case ARM64vecshi_SQSHL8x16:   *nm = "sqshl ";   *ar = "16b"; return;
+      case ARM64vecshi_SQSHLU64x2:  *nm = "sqshlu";   *ar = "2d";  return;
+      case ARM64vecshi_SQSHLU32x4:  *nm = "sqshlu";   *ar = "4s";  return;
+      case ARM64vecshi_SQSHLU16x8:  *nm = "sqshlu";   *ar = "8h";  return;
+      case ARM64vecshi_SQSHLU8x16:  *nm = "sqshlu";   *ar = "16b"; return;
+      default: vpanic("showARM64VecShiftImmOp");
    }
 }
 
@@ -1120,7 +1136,7 @@
    vassert(dszBlg2 == 0 || dszBlg2 == 1 || dszBlg2 == 2);
    return i;
 }
-ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftOp op,
+ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftImmOp op,
                                     HReg dst, HReg src, UInt amt ) {
    ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr));
    i->tag                    = ARM64in_VShiftImmV;
@@ -1134,41 +1150,41 @@
       /* For right shifts, the allowed shift amounts are 1 .. lane_size.
          For left shifts,  the allowed shift amounts are 0 .. lane_size-1. 
       */
-      case ARM64vecsh_USHR64x2: case ARM64vecsh_SSHR64x2:
-      case ARM64vecsh_UQSHRN2SD: case ARM64vecsh_SQSHRN2SD:
-      case ARM64vecsh_SQSHRUN2SD:
-      case ARM64vecsh_UQRSHRN2SD: case ARM64vecsh_SQRSHRN2SD:
-      case ARM64vecsh_SQRSHRUN2SD:
+      case ARM64vecshi_USHR64x2: case ARM64vecshi_SSHR64x2:
+      case ARM64vecshi_UQSHRN2SD: case ARM64vecshi_SQSHRN2SD:
+      case ARM64vecshi_SQSHRUN2SD:
+      case ARM64vecshi_UQRSHRN2SD: case ARM64vecshi_SQRSHRN2SD:
+      case ARM64vecshi_SQRSHRUN2SD:
          minSh = 1; maxSh = 64; break;
-      case ARM64vecsh_SHL64x2:
-      case ARM64vecsh_UQSHL64x2: case ARM64vecsh_SQSHL64x2:
-      case ARM64vecsh_SQSHLU64x2:
+      case ARM64vecshi_SHL64x2:
+      case ARM64vecshi_UQSHL64x2: case ARM64vecshi_SQSHL64x2:
+      case ARM64vecshi_SQSHLU64x2:
          minSh = 0; maxSh = 63; break;
-      case ARM64vecsh_USHR32x4: case ARM64vecsh_SSHR32x4:
-      case ARM64vecsh_UQSHRN4HS: case ARM64vecsh_SQSHRN4HS:
-      case ARM64vecsh_SQSHRUN4HS:
-      case ARM64vecsh_UQRSHRN4HS: case ARM64vecsh_SQRSHRN4HS:
-      case ARM64vecsh_SQRSHRUN4HS:
+      case ARM64vecshi_USHR32x4: case ARM64vecshi_SSHR32x4:
+      case ARM64vecshi_UQSHRN4HS: case ARM64vecshi_SQSHRN4HS:
+      case ARM64vecshi_SQSHRUN4HS:
+      case ARM64vecshi_UQRSHRN4HS: case ARM64vecshi_SQRSHRN4HS:
+      case ARM64vecshi_SQRSHRUN4HS:
          minSh = 1; maxSh = 32; break;
-      case ARM64vecsh_SHL32x4:
-      case ARM64vecsh_UQSHL32x4: case ARM64vecsh_SQSHL32x4:
-      case ARM64vecsh_SQSHLU32x4:
+      case ARM64vecshi_SHL32x4:
+      case ARM64vecshi_UQSHL32x4: case ARM64vecshi_SQSHL32x4:
+      case ARM64vecshi_SQSHLU32x4:
          minSh = 0; maxSh = 31; break;
-      case ARM64vecsh_USHR16x8: case ARM64vecsh_SSHR16x8:
-      case ARM64vecsh_UQSHRN8BH: case ARM64vecsh_SQSHRN8BH:
-      case ARM64vecsh_SQSHRUN8BH:
-      case ARM64vecsh_UQRSHRN8BH: case ARM64vecsh_SQRSHRN8BH:
-      case ARM64vecsh_SQRSHRUN8BH:
+      case ARM64vecshi_USHR16x8: case ARM64vecshi_SSHR16x8:
+      case ARM64vecshi_UQSHRN8BH: case ARM64vecshi_SQSHRN8BH:
+      case ARM64vecshi_SQSHRUN8BH:
+      case ARM64vecshi_UQRSHRN8BH: case ARM64vecshi_SQRSHRN8BH:
+      case ARM64vecshi_SQRSHRUN8BH:
          minSh = 1; maxSh = 16; break;
-      case ARM64vecsh_SHL16x8:
-      case ARM64vecsh_UQSHL16x8: case ARM64vecsh_SQSHL16x8:
-      case ARM64vecsh_SQSHLU16x8:
+      case ARM64vecshi_SHL16x8:
+      case ARM64vecshi_UQSHL16x8: case ARM64vecshi_SQSHL16x8:
+      case ARM64vecshi_SQSHLU16x8:
          minSh = 0; maxSh = 15; break;
-      case ARM64vecsh_USHR8x16: case ARM64vecsh_SSHR8x16:
+      case ARM64vecshi_USHR8x16: case ARM64vecshi_SSHR8x16:
          minSh = 1; maxSh = 8; break;
-      case ARM64vecsh_SHL8x16:
-      case ARM64vecsh_UQSHL8x16: case ARM64vecsh_SQSHL8x16:
-      case ARM64vecsh_SQSHLU8x16:
+      case ARM64vecshi_SHL8x16:
+      case ARM64vecshi_UQSHL8x16: case ARM64vecshi_SQSHL8x16:
+      case ARM64vecshi_SQSHLU8x16:
          minSh = 0; maxSh = 7; break;
       default:
          vassert(0);
@@ -1649,7 +1665,7 @@
       case ARM64in_VShiftImmV: {
          const HChar* nm = "??";
          const HChar* ar = "??";
-         showARM64VecShiftOp(&nm, &ar, i->ARM64in.VShiftImmV.op);
+         showARM64VecShiftImmOp(&nm, &ar, i->ARM64in.VShiftImmV.op);
          vex_printf("%s ", nm);
          ppHRegARM64(i->ARM64in.VShiftImmV.dst);
          vex_printf(".%s, ", ar);
@@ -4040,6 +4056,11 @@
             010 01110 sz 1 m  010111 n d   SQRSHL@sz  Vd, Vn, Vm
             011 01110 sz 1 m  010011 n d   UQSHL@sz   Vd, Vn, Vm
            011 01110 sz 1 m  010111 n d   UQRSHL@sz  Vd, Vn, Vm
+
+            010 01110 sz 1 m  010001 n d   SSHL@sz   Vd, Vn, Vm
+            010 01110 sz 1 m  010101 n d   SRSHL@sz  Vd, Vn, Vm
+            011 01110 sz 1 m  010001 n d   USHL@sz   Vd, Vn, Vm
+            011 01110 sz 1 m  010101 n d   URSHL@sz  Vd, Vn, Vm
          */
          UInt vD = qregNo(i->ARM64in.VBinV.dst);
          UInt vN = qregNo(i->ARM64in.VBinV.argL);
@@ -4415,6 +4436,58 @@
                *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X010111, vN, vD);
                break;
 
+            case ARM64vecb_SSHL64x2:
+               *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X010001, vN, vD);
+               break;
+            case ARM64vecb_SSHL32x4:
+               *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X010001, vN, vD);
+               break;
+            case ARM64vecb_SSHL16x8:
+               *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X010001, vN, vD);
+               break;
+            case ARM64vecb_SSHL8x16:
+               *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X010001, vN, vD);
+               break;
+
+            case ARM64vecb_SRSHL64x2:
+               *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X010101, vN, vD);
+               break;
+            case ARM64vecb_SRSHL32x4:
+               *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X010101, vN, vD);
+               break;
+            case ARM64vecb_SRSHL16x8:
+               *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X010101, vN, vD);
+               break;
+            case ARM64vecb_SRSHL8x16:
+               *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X010101, vN, vD);
+               break;
+
+            case ARM64vecb_USHL64x2:
+               *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X010001, vN, vD);
+               break;
+            case ARM64vecb_USHL32x4:
+               *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X010001, vN, vD);
+               break;
+            case ARM64vecb_USHL16x8:
+               *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X010001, vN, vD);
+               break;
+            case ARM64vecb_USHL8x16:
+               *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X010001, vN, vD);
+               break;
+
+            case ARM64vecb_URSHL64x2:
+               *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X010101, vN, vD);
+               break;
+            case ARM64vecb_URSHL32x4:
+               *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X010101, vN, vD);
+               break;
+            case ARM64vecb_URSHL16x8:
+               *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X010101, vN, vD);
+               break;
+            case ARM64vecb_URSHL8x16:
+               *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X010101, vN, vD);
+               break;
+
             default:
                goto bad;
          }
@@ -4641,48 +4714,48 @@
             = X_3_6_7_6_5_5(X011, X011110, 0, X011001, vN, vD);
 
          switch (i->ARM64in.VShiftImmV.op) {
-            case ARM64vecsh_SSHR64x2:    tmpl = tmpl_SSHR;     goto right64x2;
-            case ARM64vecsh_USHR64x2:    tmpl = tmpl_USHR;     goto right64x2;
-            case ARM64vecsh_SHL64x2:     tmpl = tmpl_SHL;      goto left64x2;
-            case ARM64vecsh_UQSHL64x2:   tmpl = tmpl_UQSHL;    goto left64x2;
-            case ARM64vecsh_SQSHL64x2:   tmpl = tmpl_SQSHL;    goto left64x2;
-            case ARM64vecsh_SQSHLU64x2:  tmpl = tmpl_SQSHLU;   goto left64x2;
-            case ARM64vecsh_SSHR32x4:    tmpl = tmpl_SSHR;     goto right32x4;
-            case ARM64vecsh_USHR32x4:    tmpl = tmpl_USHR;     goto right32x4;
-            case ARM64vecsh_UQSHRN2SD:   tmpl = tmpl_UQSHRN;   goto right32x4;
-            case ARM64vecsh_SQSHRN2SD:   tmpl = tmpl_SQSHRN;   goto right32x4;
-            case ARM64vecsh_SQSHRUN2SD:  tmpl = tmpl_SQSHRUN;  goto right32x4;
-            case ARM64vecsh_UQRSHRN2SD:  tmpl = tmpl_UQRSHRN;  goto right32x4;
-            case ARM64vecsh_SQRSHRN2SD:  tmpl = tmpl_SQRSHRN;  goto right32x4;
-            case ARM64vecsh_SQRSHRUN2SD: tmpl = tmpl_SQRSHRUN; goto right32x4;
-            case ARM64vecsh_SHL32x4:     tmpl = tmpl_SHL;      goto left32x4;
-            case ARM64vecsh_UQSHL32x4:   tmpl = tmpl_UQSHL;    goto left32x4;
-            case ARM64vecsh_SQSHL32x4:   tmpl = tmpl_SQSHL;    goto left32x4;
-            case ARM64vecsh_SQSHLU32x4:  tmpl = tmpl_SQSHLU;   goto left32x4;
-            case ARM64vecsh_SSHR16x8:    tmpl = tmpl_SSHR;     goto right16x8;
-            case ARM64vecsh_USHR16x8:    tmpl = tmpl_USHR;     goto right16x8;
-            case ARM64vecsh_UQSHRN4HS:   tmpl = tmpl_UQSHRN;   goto right16x8;
-            case ARM64vecsh_SQSHRN4HS:   tmpl = tmpl_SQSHRN;   goto right16x8;
-            case ARM64vecsh_SQSHRUN4HS:  tmpl = tmpl_SQSHRUN;  goto right16x8;
-            case ARM64vecsh_UQRSHRN4HS:  tmpl = tmpl_UQRSHRN;  goto right16x8;
-            case ARM64vecsh_SQRSHRN4HS:  tmpl = tmpl_SQRSHRN;  goto right16x8;
-            case ARM64vecsh_SQRSHRUN4HS: tmpl = tmpl_SQRSHRUN; goto right16x8;
-            case ARM64vecsh_SHL16x8:     tmpl = tmpl_SHL;      goto left16x8;
-            case ARM64vecsh_UQSHL16x8:   tmpl = tmpl_UQSHL;    goto left16x8;
-            case ARM64vecsh_SQSHL16x8:   tmpl = tmpl_SQSHL;    goto left16x8;
-            case ARM64vecsh_SQSHLU16x8:  tmpl = tmpl_SQSHLU;   goto left16x8;
-            case ARM64vecsh_SSHR8x16:    tmpl = tmpl_SSHR;     goto right8x16;
-            case ARM64vecsh_USHR8x16:    tmpl = tmpl_USHR;     goto right8x16;
-            case ARM64vecsh_UQSHRN8BH:   tmpl = tmpl_UQSHRN;   goto right8x16;
-            case ARM64vecsh_SQSHRN8BH:   tmpl = tmpl_SQSHRN;   goto right8x16;
-            case ARM64vecsh_SQSHRUN8BH:  tmpl = tmpl_SQSHRUN;  goto right8x16;
-            case ARM64vecsh_UQRSHRN8BH:  tmpl = tmpl_UQRSHRN;  goto right8x16;
-            case ARM64vecsh_SQRSHRN8BH:  tmpl = tmpl_SQRSHRN;  goto right8x16;
-            case ARM64vecsh_SQRSHRUN8BH: tmpl = tmpl_SQRSHRUN; goto right8x16;
-            case ARM64vecsh_SHL8x16:     tmpl = tmpl_SHL;      goto left8x16;
-            case ARM64vecsh_UQSHL8x16:   tmpl = tmpl_UQSHL;    goto left8x16;
-            case ARM64vecsh_SQSHL8x16:   tmpl = tmpl_SQSHL;    goto left8x16;
-            case ARM64vecsh_SQSHLU8x16:  tmpl = tmpl_SQSHLU;   goto left8x16;
+            case ARM64vecshi_SSHR64x2:    tmpl = tmpl_SSHR;     goto right64x2;
+            case ARM64vecshi_USHR64x2:    tmpl = tmpl_USHR;     goto right64x2;
+            case ARM64vecshi_SHL64x2:     tmpl = tmpl_SHL;      goto left64x2;
+            case ARM64vecshi_UQSHL64x2:   tmpl = tmpl_UQSHL;    goto left64x2;
+            case ARM64vecshi_SQSHL64x2:   tmpl = tmpl_SQSHL;    goto left64x2;
+            case ARM64vecshi_SQSHLU64x2:  tmpl = tmpl_SQSHLU;   goto left64x2;
+            case ARM64vecshi_SSHR32x4:    tmpl = tmpl_SSHR;     goto right32x4;
+            case ARM64vecshi_USHR32x4:    tmpl = tmpl_USHR;     goto right32x4;
+            case ARM64vecshi_UQSHRN2SD:   tmpl = tmpl_UQSHRN;   goto right32x4;
+            case ARM64vecshi_SQSHRN2SD:   tmpl = tmpl_SQSHRN;   goto right32x4;
+            case ARM64vecshi_SQSHRUN2SD:  tmpl = tmpl_SQSHRUN;  goto right32x4;
+            case ARM64vecshi_UQRSHRN2SD:  tmpl = tmpl_UQRSHRN;  goto right32x4;
+            case ARM64vecshi_SQRSHRN2SD:  tmpl = tmpl_SQRSHRN;  goto right32x4;
+            case ARM64vecshi_SQRSHRUN2SD: tmpl = tmpl_SQRSHRUN; goto right32x4;
+            case ARM64vecshi_SHL32x4:     tmpl = tmpl_SHL;      goto left32x4;
+            case ARM64vecshi_UQSHL32x4:   tmpl = tmpl_UQSHL;    goto left32x4;
+            case ARM64vecshi_SQSHL32x4:   tmpl = tmpl_SQSHL;    goto left32x4;
+            case ARM64vecshi_SQSHLU32x4:  tmpl = tmpl_SQSHLU;   goto left32x4;
+            case ARM64vecshi_SSHR16x8:    tmpl = tmpl_SSHR;     goto right16x8;
+            case ARM64vecshi_USHR16x8:    tmpl = tmpl_USHR;     goto right16x8;
+            case ARM64vecshi_UQSHRN4HS:   tmpl = tmpl_UQSHRN;   goto right16x8;
+            case ARM64vecshi_SQSHRN4HS:   tmpl = tmpl_SQSHRN;   goto right16x8;
+            case ARM64vecshi_SQSHRUN4HS:  tmpl = tmpl_SQSHRUN;  goto right16x8;
+            case ARM64vecshi_UQRSHRN4HS:  tmpl = tmpl_UQRSHRN;  goto right16x8;
+            case ARM64vecshi_SQRSHRN4HS:  tmpl = tmpl_SQRSHRN;  goto right16x8;
+            case ARM64vecshi_SQRSHRUN4HS: tmpl = tmpl_SQRSHRUN; goto right16x8;
+            case ARM64vecshi_SHL16x8:     tmpl = tmpl_SHL;      goto left16x8;
+            case ARM64vecshi_UQSHL16x8:   tmpl = tmpl_UQSHL;    goto left16x8;
+            case ARM64vecshi_SQSHL16x8:   tmpl = tmpl_SQSHL;    goto left16x8;
+            case ARM64vecshi_SQSHLU16x8:  tmpl = tmpl_SQSHLU;   goto left16x8;
+            case ARM64vecshi_SSHR8x16:    tmpl = tmpl_SSHR;     goto right8x16;
+            case ARM64vecshi_USHR8x16:    tmpl = tmpl_USHR;     goto right8x16;
+            case ARM64vecshi_UQSHRN8BH:   tmpl = tmpl_UQSHRN;   goto right8x16;
+            case ARM64vecshi_SQSHRN8BH:   tmpl = tmpl_SQSHRN;   goto right8x16;
+            case ARM64vecshi_SQSHRUN8BH:  tmpl = tmpl_SQSHRUN;  goto right8x16;
+            case ARM64vecshi_UQRSHRN8BH:  tmpl = tmpl_UQRSHRN;  goto right8x16;
+            case ARM64vecshi_SQRSHRN8BH:  tmpl = tmpl_SQRSHRN;  goto right8x16;
+            case ARM64vecshi_SQRSHRUN8BH: tmpl = tmpl_SQRSHRUN; goto right8x16;
+            case ARM64vecshi_SHL8x16:     tmpl = tmpl_SHL;      goto left8x16;
+            case ARM64vecshi_UQSHL8x16:   tmpl = tmpl_UQSHL;    goto left8x16;
+            case ARM64vecshi_SQSHL8x16:   tmpl = tmpl_SQSHL;    goto left8x16;
+            case ARM64vecshi_SQSHLU8x16:  tmpl = tmpl_SQSHLU;   goto left8x16;
 
             default: break;
 
diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h
index 8c07bda..9755b52 100644
--- a/priv/host_arm64_defs.h
+++ b/priv/host_arm64_defs.h
@@ -372,6 +372,14 @@
       ARM64vecb_SQRSHL16x8,  ARM64vecb_SQRSHL8x16,
       ARM64vecb_UQRSHL64x2,  ARM64vecb_UQRSHL32x4,
       ARM64vecb_UQRSHL16x8,  ARM64vecb_UQRSHL8x16,
+      ARM64vecb_SSHL64x2,    ARM64vecb_SSHL32x4,
+      ARM64vecb_SSHL16x8,    ARM64vecb_SSHL8x16, 
+      ARM64vecb_USHL64x2,    ARM64vecb_USHL32x4,
+      ARM64vecb_USHL16x8,    ARM64vecb_USHL8x16, 
+      ARM64vecb_SRSHL64x2,   ARM64vecb_SRSHL32x4,
+      ARM64vecb_SRSHL16x8,   ARM64vecb_SRSHL8x16, 
+      ARM64vecb_URSHL64x2,   ARM64vecb_URSHL32x4,
+      ARM64vecb_URSHL16x8,   ARM64vecb_URSHL8x16, 
       ARM64vecb_INVALID
    }
    ARM64VecBinOp;
@@ -396,30 +404,30 @@
 
 typedef
    enum {
-      ARM64vecsh_USHR64x2=350, ARM64vecsh_USHR32x4,
-      ARM64vecsh_USHR16x8,     ARM64vecsh_USHR8x16,
-      ARM64vecsh_SSHR64x2,     ARM64vecsh_SSHR32x4,
-      ARM64vecsh_SSHR16x8,     ARM64vecsh_SSHR8x16,
-      ARM64vecsh_SHL64x2,      ARM64vecsh_SHL32x4,
-      ARM64vecsh_SHL16x8,      ARM64vecsh_SHL8x16,
+      ARM64vecshi_USHR64x2=350, ARM64vecshi_USHR32x4,
+      ARM64vecshi_USHR16x8,     ARM64vecshi_USHR8x16,
+      ARM64vecshi_SSHR64x2,     ARM64vecshi_SSHR32x4,
+      ARM64vecshi_SSHR16x8,     ARM64vecshi_SSHR8x16,
+      ARM64vecshi_SHL64x2,      ARM64vecshi_SHL32x4,
+      ARM64vecshi_SHL16x8,      ARM64vecshi_SHL8x16,
       /* These narrowing shifts zero out the top half of the destination
          register. */
-      ARM64vecsh_SQSHRN2SD,    ARM64vecsh_SQSHRN4HS,   ARM64vecsh_SQSHRN8BH,
-      ARM64vecsh_UQSHRN2SD,    ARM64vecsh_UQSHRN4HS,   ARM64vecsh_UQSHRN8BH,
-      ARM64vecsh_SQSHRUN2SD,   ARM64vecsh_SQSHRUN4HS,  ARM64vecsh_SQSHRUN8BH,
-      ARM64vecsh_SQRSHRN2SD,   ARM64vecsh_SQRSHRN4HS,  ARM64vecsh_SQRSHRN8BH,
-      ARM64vecsh_UQRSHRN2SD,   ARM64vecsh_UQRSHRN4HS,  ARM64vecsh_UQRSHRN8BH,
-      ARM64vecsh_SQRSHRUN2SD,  ARM64vecsh_SQRSHRUN4HS, ARM64vecsh_SQRSHRUN8BH,
+      ARM64vecshi_SQSHRN2SD,    ARM64vecshi_SQSHRN4HS,   ARM64vecshi_SQSHRN8BH,
+      ARM64vecshi_UQSHRN2SD,    ARM64vecshi_UQSHRN4HS,   ARM64vecshi_UQSHRN8BH,
+      ARM64vecshi_SQSHRUN2SD,   ARM64vecshi_SQSHRUN4HS,  ARM64vecshi_SQSHRUN8BH,
+      ARM64vecshi_SQRSHRN2SD,   ARM64vecshi_SQRSHRN4HS,  ARM64vecshi_SQRSHRN8BH,
+      ARM64vecshi_UQRSHRN2SD,   ARM64vecshi_UQRSHRN4HS,  ARM64vecshi_UQRSHRN8BH,
+      ARM64vecshi_SQRSHRUN2SD,  ARM64vecshi_SQRSHRUN4HS, ARM64vecshi_SQRSHRUN8BH,
       /* Saturating left shifts, of various flavours. */
-      ARM64vecsh_UQSHL64x2,    ARM64vecsh_UQSHL32x4,
-      ARM64vecsh_UQSHL16x8,    ARM64vecsh_UQSHL8x16, 
-      ARM64vecsh_SQSHL64x2,    ARM64vecsh_SQSHL32x4,
-      ARM64vecsh_SQSHL16x8,    ARM64vecsh_SQSHL8x16, 
-      ARM64vecsh_SQSHLU64x2,   ARM64vecsh_SQSHLU32x4,
-      ARM64vecsh_SQSHLU16x8,   ARM64vecsh_SQSHLU8x16, 
-      ARM64vecsh_INVALID
+      ARM64vecshi_UQSHL64x2,    ARM64vecshi_UQSHL32x4,
+      ARM64vecshi_UQSHL16x8,    ARM64vecshi_UQSHL8x16, 
+      ARM64vecshi_SQSHL64x2,    ARM64vecshi_SQSHL32x4,
+      ARM64vecshi_SQSHL16x8,    ARM64vecshi_SQSHL8x16, 
+      ARM64vecshi_SQSHLU64x2,   ARM64vecshi_SQSHLU32x4,
+      ARM64vecshi_SQSHLU16x8,   ARM64vecshi_SQSHLU8x16, 
+      ARM64vecshi_INVALID
    }
-   ARM64VecShiftOp;
+   ARM64VecShiftImmOp;
 
 typedef
    enum {
@@ -758,10 +766,10 @@
            |amt| must be > 0 and <= implied lane size of |op|.  Shifts
            beyond these ranges are not allowed. */
         struct {
-           ARM64VecShiftOp op;
-           HReg            dst;
-           HReg            src;
-           UInt            amt;
+           ARM64VecShiftImmOp op;
+           HReg               dst;
+           HReg               src;
+           UInt               amt;
         } VShiftImmV;
         struct {
            HReg dst;
@@ -866,7 +874,7 @@
 extern ARM64Instr* ARM64Instr_VUnaryV ( ARM64VecUnaryOp op, HReg, HReg );
 extern ARM64Instr* ARM64Instr_VNarrowV ( ARM64VecNarrowOp op, UInt dszBlg2,
                                          HReg dst, HReg src );
-extern ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftOp op,
+extern ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftImmOp op,
                                            HReg dst, HReg src, UInt amt );
 extern ARM64Instr* ARM64Instr_VExtV   ( HReg dst,
                                         HReg srcLo, HReg srcHi, UInt amtB );
diff --git a/priv/host_arm64_isel.c b/priv/host_arm64_isel.c
index 6a5c349..0787419 100644
--- a/priv/host_arm64_isel.c
+++ b/priv/host_arm64_isel.c
@@ -2335,8 +2335,16 @@
          case Iop_QSub16Sx8: case Iop_QSub8Sx16:
          case Iop_QSub64Ux2: case Iop_QSub32Ux4:
          case Iop_QSub16Ux8: case Iop_QSub8Ux16:
-         case Iop_QDMulHi32Sx4:  case Iop_QDMulHi16Sx8: 
-         case Iop_QRDMulHi32Sx4: case Iop_QRDMulHi16Sx8: 
+         case Iop_QDMulHi32Sx4:  case Iop_QDMulHi16Sx8:
+         case Iop_QRDMulHi32Sx4: case Iop_QRDMulHi16Sx8:
+         case Iop_Sh8Sx16:  case Iop_Sh16Sx8:
+         case Iop_Sh32Sx4:  case Iop_Sh64Sx2:
+         case Iop_Sh8Ux16:  case Iop_Sh16Ux8:
+         case Iop_Sh32Ux4:  case Iop_Sh64Ux2:
+         case Iop_Rsh8Sx16: case Iop_Rsh16Sx8:
+         case Iop_Rsh32Sx4: case Iop_Rsh64Sx2:
+         case Iop_Rsh8Ux16: case Iop_Rsh16Ux8:
+         case Iop_Rsh32Ux4: case Iop_Rsh64Ux2:
          {
             HReg res  = newVRegV(env);
             HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
@@ -2438,6 +2446,22 @@
                case Iop_QDMulHi16Sx8:   op = ARM64vecb_SQDMULH16x8; break;
                case Iop_QRDMulHi32Sx4:  op = ARM64vecb_SQRDMULH32x4; break;
                case Iop_QRDMulHi16Sx8:  op = ARM64vecb_SQRDMULH16x8; break;
+               case Iop_Sh8Sx16:        op = ARM64vecb_SSHL8x16; break;
+               case Iop_Sh16Sx8:        op = ARM64vecb_SSHL16x8; break;
+               case Iop_Sh32Sx4:        op = ARM64vecb_SSHL32x4; break;
+               case Iop_Sh64Sx2:        op = ARM64vecb_SSHL64x2; break;
+               case Iop_Sh8Ux16:        op = ARM64vecb_USHL8x16; break;
+               case Iop_Sh16Ux8:        op = ARM64vecb_USHL16x8; break;
+               case Iop_Sh32Ux4:        op = ARM64vecb_USHL32x4; break;
+               case Iop_Sh64Ux2:        op = ARM64vecb_USHL64x2; break;
+               case Iop_Rsh8Sx16:       op = ARM64vecb_SRSHL8x16; break;
+               case Iop_Rsh16Sx8:       op = ARM64vecb_SRSHL16x8; break;
+               case Iop_Rsh32Sx4:       op = ARM64vecb_SRSHL32x4; break;
+               case Iop_Rsh64Sx2:       op = ARM64vecb_SRSHL64x2; break;
+               case Iop_Rsh8Ux16:       op = ARM64vecb_URSHL8x16; break;
+               case Iop_Rsh16Ux8:       op = ARM64vecb_URSHL16x8; break;
+               case Iop_Rsh32Ux4:       op = ARM64vecb_URSHL32x4; break;
+               case Iop_Rsh64Ux2:       op = ARM64vecb_URSHL64x2; break;
                default: vassert(0);
             }
             if (sw) {
@@ -2466,33 +2490,33 @@
                UInt amt   = argR->Iex.Const.con->Ico.U8;
                UInt limLo = 0;
                UInt limHi = 0;
-               ARM64VecShiftOp op = ARM64vecsh_INVALID;
+               ARM64VecShiftImmOp op = ARM64vecshi_INVALID;
                /* Establish the instruction to use. */
                switch (e->Iex.Binop.op) {
-                  case Iop_ShrN64x2:       op = ARM64vecsh_USHR64x2;   break;
-                  case Iop_ShrN32x4:       op = ARM64vecsh_USHR32x4;   break;
-                  case Iop_ShrN16x8:       op = ARM64vecsh_USHR16x8;   break;
-                  case Iop_ShrN8x16:       op = ARM64vecsh_USHR8x16;   break;
-                  case Iop_SarN64x2:       op = ARM64vecsh_SSHR64x2;   break;
-                  case Iop_SarN32x4:       op = ARM64vecsh_SSHR32x4;   break;
-                  case Iop_SarN16x8:       op = ARM64vecsh_SSHR16x8;   break;
-                  case Iop_SarN8x16:       op = ARM64vecsh_SSHR8x16;   break;
-                  case Iop_ShlN64x2:       op = ARM64vecsh_SHL64x2;    break;
-                  case Iop_ShlN32x4:       op = ARM64vecsh_SHL32x4;    break;
-                  case Iop_ShlN16x8:       op = ARM64vecsh_SHL16x8;    break;
-                  case Iop_ShlN8x16:       op = ARM64vecsh_SHL8x16;    break;
-                  case Iop_QShlNsatUU64x2: op = ARM64vecsh_UQSHL64x2;  break;
-                  case Iop_QShlNsatUU32x4: op = ARM64vecsh_UQSHL32x4;  break;
-                  case Iop_QShlNsatUU16x8: op = ARM64vecsh_UQSHL16x8;  break;
-                  case Iop_QShlNsatUU8x16: op = ARM64vecsh_UQSHL8x16;  break;
-                  case Iop_QShlNsatSS64x2: op = ARM64vecsh_SQSHL64x2;  break;
-                  case Iop_QShlNsatSS32x4: op = ARM64vecsh_SQSHL32x4;  break;
-                  case Iop_QShlNsatSS16x8: op = ARM64vecsh_SQSHL16x8;  break;
-                  case Iop_QShlNsatSS8x16: op = ARM64vecsh_SQSHL8x16;  break;
-                  case Iop_QShlNsatSU64x2: op = ARM64vecsh_SQSHLU64x2; break;
-                  case Iop_QShlNsatSU32x4: op = ARM64vecsh_SQSHLU32x4; break;
-                  case Iop_QShlNsatSU16x8: op = ARM64vecsh_SQSHLU16x8; break;
-                  case Iop_QShlNsatSU8x16: op = ARM64vecsh_SQSHLU8x16; break;
+                  case Iop_ShrN64x2:       op = ARM64vecshi_USHR64x2;   break;
+                  case Iop_ShrN32x4:       op = ARM64vecshi_USHR32x4;   break;
+                  case Iop_ShrN16x8:       op = ARM64vecshi_USHR16x8;   break;
+                  case Iop_ShrN8x16:       op = ARM64vecshi_USHR8x16;   break;
+                  case Iop_SarN64x2:       op = ARM64vecshi_SSHR64x2;   break;
+                  case Iop_SarN32x4:       op = ARM64vecshi_SSHR32x4;   break;
+                  case Iop_SarN16x8:       op = ARM64vecshi_SSHR16x8;   break;
+                  case Iop_SarN8x16:       op = ARM64vecshi_SSHR8x16;   break;
+                  case Iop_ShlN64x2:       op = ARM64vecshi_SHL64x2;    break;
+                  case Iop_ShlN32x4:       op = ARM64vecshi_SHL32x4;    break;
+                  case Iop_ShlN16x8:       op = ARM64vecshi_SHL16x8;    break;
+                  case Iop_ShlN8x16:       op = ARM64vecshi_SHL8x16;    break;
+                  case Iop_QShlNsatUU64x2: op = ARM64vecshi_UQSHL64x2;  break;
+                  case Iop_QShlNsatUU32x4: op = ARM64vecshi_UQSHL32x4;  break;
+                  case Iop_QShlNsatUU16x8: op = ARM64vecshi_UQSHL16x8;  break;
+                  case Iop_QShlNsatUU8x16: op = ARM64vecshi_UQSHL8x16;  break;
+                  case Iop_QShlNsatSS64x2: op = ARM64vecshi_SQSHL64x2;  break;
+                  case Iop_QShlNsatSS32x4: op = ARM64vecshi_SQSHL32x4;  break;
+                  case Iop_QShlNsatSS16x8: op = ARM64vecshi_SQSHL16x8;  break;
+                  case Iop_QShlNsatSS8x16: op = ARM64vecshi_SQSHL8x16;  break;
+                  case Iop_QShlNsatSU64x2: op = ARM64vecshi_SQSHLU64x2; break;
+                  case Iop_QShlNsatSU32x4: op = ARM64vecshi_SQSHLU32x4; break;
+                  case Iop_QShlNsatSU16x8: op = ARM64vecshi_SQSHLU16x8; break;
+                  case Iop_QShlNsatSU8x16: op = ARM64vecshi_SQSHLU8x16; break;
                   default: vassert(0);
                }
                /* Establish the shift limits, for sanity check purposes only. */
@@ -2526,7 +2550,7 @@
                /* For left shifts, the allowable amt values are
                   0 .. lane_bits-1.  For right shifts the allowable
                   values are 1 .. lane_bits. */
-               if (op != ARM64vecsh_INVALID && amt >= limLo && amt <= limHi) {
+               if (op != ARM64vecshi_INVALID && amt >= limLo && amt <= limHi) {
                   HReg src = iselV128Expr(env, argL);
                   HReg dst = newVRegV(env);
                   addInstr(env, ARM64Instr_VShiftImmV(op, dst, src, amt));
@@ -2581,55 +2605,55 @@
             if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) {
                UInt amt   = argR->Iex.Const.con->Ico.U8;
                UInt limit = 0;
-               ARM64VecShiftOp op = ARM64vecsh_INVALID;
+               ARM64VecShiftImmOp op = ARM64vecshi_INVALID;
                switch (e->Iex.Binop.op) {
                   /* uu */
                   case Iop_QandQShrNnarrow64Uto32Ux2:
-                     op = ARM64vecsh_UQSHRN2SD; limit = 64; break;
+                     op = ARM64vecshi_UQSHRN2SD; limit = 64; break;
                   case Iop_QandQShrNnarrow32Uto16Ux4:
-                     op = ARM64vecsh_UQSHRN4HS; limit = 32; break;
+                     op = ARM64vecshi_UQSHRN4HS; limit = 32; break;
                   case Iop_QandQShrNnarrow16Uto8Ux8:
-                     op = ARM64vecsh_UQSHRN8BH; limit = 16; break;
+                     op = ARM64vecshi_UQSHRN8BH; limit = 16; break;
                   /* ss */
                   case Iop_QandQSarNnarrow64Sto32Sx2:
-                     op = ARM64vecsh_SQSHRN2SD; limit = 64; break;
+                     op = ARM64vecshi_SQSHRN2SD; limit = 64; break;
                   case Iop_QandQSarNnarrow32Sto16Sx4:
-                     op = ARM64vecsh_SQSHRN4HS; limit = 32; break;
+                     op = ARM64vecshi_SQSHRN4HS; limit = 32; break;
                   case Iop_QandQSarNnarrow16Sto8Sx8:
-                     op = ARM64vecsh_SQSHRN8BH; limit = 16; break;
+                     op = ARM64vecshi_SQSHRN8BH; limit = 16; break;
                   /* su */
                   case Iop_QandQSarNnarrow64Sto32Ux2:
-                     op = ARM64vecsh_SQSHRUN2SD; limit = 64; break;
+                     op = ARM64vecshi_SQSHRUN2SD; limit = 64; break;
                   case Iop_QandQSarNnarrow32Sto16Ux4:
-                     op = ARM64vecsh_SQSHRUN4HS; limit = 32; break;
+                     op = ARM64vecshi_SQSHRUN4HS; limit = 32; break;
                   case Iop_QandQSarNnarrow16Sto8Ux8:
-                     op = ARM64vecsh_SQSHRUN8BH; limit = 16; break;
+                     op = ARM64vecshi_SQSHRUN8BH; limit = 16; break;
                   /* ruu */
                   case Iop_QandQRShrNnarrow64Uto32Ux2:
-                     op = ARM64vecsh_UQRSHRN2SD; limit = 64; break;
+                     op = ARM64vecshi_UQRSHRN2SD; limit = 64; break;
                   case Iop_QandQRShrNnarrow32Uto16Ux4:
-                     op = ARM64vecsh_UQRSHRN4HS; limit = 32; break;
+                     op = ARM64vecshi_UQRSHRN4HS; limit = 32; break;
                   case Iop_QandQRShrNnarrow16Uto8Ux8:
-                     op = ARM64vecsh_UQRSHRN8BH; limit = 16; break;
+                     op = ARM64vecshi_UQRSHRN8BH; limit = 16; break;
                   /* rss */
                   case Iop_QandQRSarNnarrow64Sto32Sx2:
-                     op = ARM64vecsh_SQRSHRN2SD; limit = 64; break;
+                     op = ARM64vecshi_SQRSHRN2SD; limit = 64; break;
                   case Iop_QandQRSarNnarrow32Sto16Sx4:
-                     op = ARM64vecsh_SQRSHRN4HS; limit = 32; break;
+                     op = ARM64vecshi_SQRSHRN4HS; limit = 32; break;
                   case Iop_QandQRSarNnarrow16Sto8Sx8:
-                     op = ARM64vecsh_SQRSHRN8BH; limit = 16; break;
+                     op = ARM64vecshi_SQRSHRN8BH; limit = 16; break;
                   /* rsu */
                   case Iop_QandQRSarNnarrow64Sto32Ux2:
-                     op = ARM64vecsh_SQRSHRUN2SD; limit = 64; break;
+                     op = ARM64vecshi_SQRSHRUN2SD; limit = 64; break;
                   case Iop_QandQRSarNnarrow32Sto16Ux4:
-                     op = ARM64vecsh_SQRSHRUN4HS; limit = 32; break;
+                     op = ARM64vecshi_SQRSHRUN4HS; limit = 32; break;
                   case Iop_QandQRSarNnarrow16Sto8Ux8:
-                     op = ARM64vecsh_SQRSHRUN8BH; limit = 16; break;
+                     op = ARM64vecshi_SQRSHRUN8BH; limit = 16; break;
                   /**/
                   default:
                      vassert(0);
                }
-               if (op != ARM64vecsh_INVALID && amt >= 1 && amt <= limit) {
+               if (op != ARM64vecshi_INVALID && amt >= 1 && amt <= limit) {
                   HReg src  = iselV128Expr(env, argL);
                   HReg dst  = newVRegV(env);
                   HReg fpsr = newVRegI(env);
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index 50f161d..c305c22 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -920,6 +920,23 @@
       case Iop_QandSQRsh32x4: vex_printf("QandSQRsh32x4"); return;
       case Iop_QandSQRsh64x2: vex_printf("QandSQRsh64x2"); return;
 
+      case Iop_Sh8Sx16: vex_printf("Sh8Sx16"); return;
+      case Iop_Sh16Sx8: vex_printf("Sh16Sx8"); return;
+      case Iop_Sh32Sx4: vex_printf("Sh32Sx4"); return;
+      case Iop_Sh64Sx2: vex_printf("Sh64Sx2"); return;
+      case Iop_Sh8Ux16: vex_printf("Sh8Ux16"); return;
+      case Iop_Sh16Ux8: vex_printf("Sh16Ux8"); return;
+      case Iop_Sh32Ux4: vex_printf("Sh32Ux4"); return;
+      case Iop_Sh64Ux2: vex_printf("Sh64Ux2"); return;
+      case Iop_Rsh8Sx16: vex_printf("Rsh8Sx16"); return;
+      case Iop_Rsh16Sx8: vex_printf("Rsh16Sx8"); return;
+      case Iop_Rsh32Sx4: vex_printf("Rsh32Sx4"); return;
+      case Iop_Rsh64Sx2: vex_printf("Rsh64Sx2"); return;
+      case Iop_Rsh8Ux16: vex_printf("Rsh8Ux16"); return;
+      case Iop_Rsh16Ux8: vex_printf("Rsh16Ux8"); return;
+      case Iop_Rsh32Ux4: vex_printf("Rsh32Ux4"); return;
+      case Iop_Rsh64Ux2: vex_printf("Rsh64Ux2"); return;
+
       case Iop_QandQShrNnarrow16Uto8Ux8:
          vex_printf("QandQShrNnarrow16Uto8Ux8"); return;
       case Iop_QandQShrNnarrow32Uto16Ux4:
@@ -2940,6 +2957,14 @@
       case Iop_CipherLV128:
       case Iop_NCipherV128:
       case Iop_NCipherLV128:
+      case Iop_Sh8Sx16: case Iop_Sh16Sx8:
+      case Iop_Sh32Sx4: case Iop_Sh64Sx2:
+      case Iop_Sh8Ux16: case Iop_Sh16Ux8:
+      case Iop_Sh32Ux4: case Iop_Sh64Ux2:
+      case Iop_Rsh8Sx16: case Iop_Rsh16Sx8:
+      case Iop_Rsh32Sx4: case Iop_Rsh64Sx2:
+      case Iop_Rsh8Ux16: case Iop_Rsh16Ux8:
+      case Iop_Rsh32Ux4: case Iop_Rsh64Ux2:
          BINARY(Ity_V128,Ity_V128, Ity_V128);
 
       case Iop_PolynomialMull8x8:
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index 9431acb..9f66681 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -1545,6 +1545,7 @@
       Iop_QShlNsatSS32x4, Iop_QShlNsatSS64x2,
 
       /* VECTOR x VECTOR BIDIRECTIONAL SATURATING (& MAYBE ROUNDING) SHIFT */
+      /* All of type (V128, V128) -> V256. */
       /* The least significant 8 bits of each lane of the second
          operand are used as the shift amount, and interpreted signedly.
          Positive values mean a shift left, negative a shift right.  The
@@ -1572,6 +1573,34 @@
       Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
       Iop_QandSQRsh32x4, Iop_QandSQRsh64x2,
 
+      /* VECTOR x VECTOR BIDIRECTIONAL (& MAYBE ROUNDING) SHIFT */
+      /* All of type (V128, V128) -> V128 */
+      /* The least significant 8 bits of each lane of the second
+         operand are used as the shift amount, and interpreted signedly.
+         Positive values mean a shift left, negative a shift right.
+         There are also rounding variants, which add 2^(shift_amount-1)
+         to the value before shifting, but only in the shift-right case.
+
+         For left shifts, the vacated places are filled with zeroes.
+         For right shifts, the vacated places are filled with zeroes
+         for the U variants and sign bits for the S variants. */
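+      /* Example: with 8-bit lanes, an amount byte of 0xFE (-2)
+         selects a right shift by 2; the rounding variants then
+         compute (x + 2) >> 2, so x == 6 gives 2 rather than
+         6 >> 2 == 1. */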
+      // Signed and unsigned, non-rounding
+      Iop_Sh8Sx16, Iop_Sh16Sx8, Iop_Sh32Sx4, Iop_Sh64Sx2,
+      Iop_Sh8Ux16, Iop_Sh16Ux8, Iop_Sh32Ux4, Iop_Sh64Ux2,
+
+      // Signed and unsigned, rounding
+      Iop_Rsh8Sx16, Iop_Rsh16Sx8, Iop_Rsh32Sx4, Iop_Rsh64Sx2,
+      Iop_Rsh8Ux16, Iop_Rsh16Ux8, Iop_Rsh32Ux4, Iop_Rsh64Ux2,
+
       /* VECTOR x SCALAR SATURATING (& MAYBE ROUNDING) NARROWING SHIFT RIGHT */
       /* All of type (V128, I8) -> V128 */
       /* The first argument is shifted right, then narrowed to half the width